From fac9b83ea79aa3112ed245d9a4fc2a5c3ec2b7ec Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 18 May 2005 22:46:34 -0700 Subject: [PATCH 01/15] [TG3]: Add tagged status support. When supported, use the TAGGED interrupt processing support the chip provides. In this mode, instead of a "on/off" binary semaphore, an incrementing tag scheme is used to ACK interrupts. All MSI supporting chips support TAGGED mode, so the tg3_msi() interrupt handler uses it unconditionally. This invariant is verified when MSI support is tested. Since we can invoke tg3_poll() multiple times per interrupt under high packet load, we fetch a new copy of the tag value in the status block right before we actually do the work. Also, because the tagged status tells the chip exactly which work we have processed, we can make two optimizations: 1) tg3_restart_ints() need not check tg3_has_work() 2) the tg3_timer() need not poke the chip 10 times per second to keep from losing interrupt events Based upon valuable feedback from Michael Chan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 195 +++++++++++++++++++++++++++++++++------------- drivers/net/tg3.h | 2 + 2 files changed, 142 insertions(+), 55 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index f79b02e80e75..488b8c65252d 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -420,7 +420,8 @@ static void tg3_enable_ints(struct tg3 *tp) { tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000000); + tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + (tp->last_tag << 24)); tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); tg3_cond_int(tp); @@ -455,10 +456,16 @@ static void tg3_restart_ints(struct tg3 *tp) { tw32(TG3PCI_MISC_HOST_CTRL, (tp->misc_host_ctrl & ~MISC_HOST_CTRL_MASK_PCI_INT)); - tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000000); + tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + tp->last_tag << 24); mmiowb(); - if (tg3_has_work(tp)) + /* When doing tagged status, this work check is unnecessary. + * The last_tag we write above tells the chip which piece of + * work we've completed. + */ + if (!(tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) && + tg3_has_work(tp)) tw32(HOSTCC_MODE, tp->coalesce_mode | (HOSTCC_MODE_ENABLE | HOSTCC_MODE_NOW)); } @@ -2862,6 +2869,9 @@ static int tg3_poll(struct net_device *netdev, int *budget) spin_lock_irqsave(&tp->lock, flags); + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + tp->last_tag = sblk->status_tag; + /* handle link change and other phy events */ if (!(tp->tg3_flags & (TG3_FLAG_USE_LINKCHG_REG | @@ -2928,22 +2938,21 @@ static irqreturn_t tg3_msi(int irq, void *dev_id, struct pt_regs *regs) spin_lock_irqsave(&tp->lock, flags); /* - * writing any value to intr-mbox-0 clears PCI INTA# and + * Writing any value to intr-mbox-0 clears PCI INTA# and * chip-internal interrupt pending events. - * writing non-zero to intr-mbox-0 additional tells the + * Writing non-zero to intr-mbox-0 additional tells the * NIC to stop sending us irqs, engaging "in-intr-handler" * event coalescing. */ tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); + tp->last_tag = sblk->status_tag; sblk->status &= ~SD_STATUS_UPDATED; - if (likely(tg3_has_work(tp))) netif_rx_schedule(dev); /* schedule NAPI poll */ else { - /* no work, re-enable interrupts - */ + /* No work, re-enable interrupts. */ tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - 0x00000000); + tp->last_tag << 24); } spin_unlock_irqrestore(&tp->lock, flags); @@ -2961,6 +2970,52 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs) spin_lock_irqsave(&tp->lock, flags); + /* In INTx mode, it is possible for the interrupt to arrive at + * the CPU before the status block posted prior to the interrupt. + * Reading the PCI State register will confirm whether the + * interrupt is ours and will flush the status block. + */ + if ((sblk->status & SD_STATUS_UPDATED) || + !(tr32(TG3PCI_PCISTATE) & PCISTATE_INT_NOT_ACTIVE)) { + /* + * Writing any value to intr-mbox-0 clears PCI INTA# and + * chip-internal interrupt pending events. + * Writing non-zero to intr-mbox-0 additional tells the + * NIC to stop sending us irqs, engaging "in-intr-handler" + * event coalescing. + */ + tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + 0x00000001); + sblk->status &= ~SD_STATUS_UPDATED; + if (likely(tg3_has_work(tp))) + netif_rx_schedule(dev); /* schedule NAPI poll */ + else { + /* No work, shared interrupt perhaps? re-enable + * interrupts, and flush that PCI write + */ + tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, + 0x00000000); + tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + } + } else { /* shared interrupt */ + handled = 0; + } + + spin_unlock_irqrestore(&tp->lock, flags); + + return IRQ_RETVAL(handled); +} + +static irqreturn_t tg3_interrupt_tagged(int irq, void *dev_id, struct pt_regs *regs) +{ + struct net_device *dev = dev_id; + struct tg3 *tp = netdev_priv(dev); + struct tg3_hw_status *sblk = tp->hw_status; + unsigned long flags; + unsigned int handled = 1; + + spin_lock_irqsave(&tp->lock, flags); + /* In INTx mode, it is possible for the interrupt to arrive at * the CPU before the status block posted prior to the interrupt. * Reading the PCI State register will confirm whether the @@ -2977,13 +3032,8 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs) */ tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0x00000001); - /* - * Flush PCI write. This also guarantees that our - * status block has been flushed to host memory. - */ - tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tp->last_tag = sblk->status_tag; sblk->status &= ~SD_STATUS_UPDATED; - if (likely(tg3_has_work(tp))) netif_rx_schedule(dev); /* schedule NAPI poll */ else { @@ -2991,7 +3041,7 @@ static irqreturn_t tg3_interrupt(int irq, void *dev_id, struct pt_regs *regs) * interrupts, and flush that PCI write */ tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, - 0x00000000); + tp->last_tag << 24); tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); } } else { /* shared interrupt */ @@ -5445,7 +5495,8 @@ static int tg3_reset_hw(struct tg3 *tp) udelay(100); tw32_mailbox(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW, 0); - tr32(MAILBOX_INTERRUPT_0); + tr32(MAILBOX_INTERRUPT_0 + TG3_64BIT_REG_LOW); + tp->last_tag = 0; if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { tw32_f(DMAC_MODE, DMAC_MODE_ENABLE); @@ -5723,31 +5774,33 @@ static void tg3_timer(unsigned long __opaque) spin_lock_irqsave(&tp->lock, flags); spin_lock(&tp->tx_lock); - /* All of this garbage is because when using non-tagged - * IRQ status the mailbox/status_block protocol the chip - * uses with the cpu is race prone. - */ - if (tp->hw_status->status & SD_STATUS_UPDATED) { - tw32(GRC_LOCAL_CTRL, - tp->grc_local_ctrl | GRC_LCLCTRL_SETINT); - } else { - tw32(HOSTCC_MODE, tp->coalesce_mode | - (HOSTCC_MODE_ENABLE | HOSTCC_MODE_NOW)); - } + if (!(tp->tg3_flags & TG3_FLAG_TAGGED_STATUS)) { + /* All of this garbage is because when using non-tagged + * IRQ status the mailbox/status_block protocol the chip + * uses with the cpu is race prone. + */ + if (tp->hw_status->status & SD_STATUS_UPDATED) { + tw32(GRC_LOCAL_CTRL, + tp->grc_local_ctrl | GRC_LCLCTRL_SETINT); + } else { + tw32(HOSTCC_MODE, tp->coalesce_mode | + (HOSTCC_MODE_ENABLE | HOSTCC_MODE_NOW)); + } - if (!(tr32(WDMAC_MODE) & WDMAC_MODE_ENABLE)) { - tp->tg3_flags2 |= TG3_FLG2_RESTART_TIMER; - spin_unlock(&tp->tx_lock); - spin_unlock_irqrestore(&tp->lock, flags); - schedule_work(&tp->reset_task); - return; + if (!(tr32(WDMAC_MODE) & WDMAC_MODE_ENABLE)) { + tp->tg3_flags2 |= TG3_FLG2_RESTART_TIMER; + spin_unlock(&tp->tx_lock); + spin_unlock_irqrestore(&tp->lock, flags); + schedule_work(&tp->reset_task); + return; + } } - if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) - tg3_periodic_fetch_stats(tp); - /* This part only runs once per second. */ if (!--tp->timer_counter) { + if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) + tg3_periodic_fetch_stats(tp); + if (tp->tg3_flags & TG3_FLAG_USE_LINKCHG_REG) { u32 mac_stat; int phy_event; @@ -5846,9 +5899,13 @@ static int tg3_test_interrupt(struct tg3 *tp) if (tp->tg3_flags2 & TG3_FLG2_USING_MSI) err = request_irq(tp->pdev->irq, tg3_msi, SA_SAMPLE_RANDOM, dev->name, dev); - else - err = request_irq(tp->pdev->irq, tg3_interrupt, + else { + irqreturn_t (*fn)(int, void *, struct pt_regs *)=tg3_interrupt; + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + fn = tg3_interrupt_tagged; + err = request_irq(tp->pdev->irq, fn, SA_SHIRQ | SA_SAMPLE_RANDOM, dev->name, dev); + } if (err) return err; @@ -5900,9 +5957,14 @@ static int tg3_test_msi(struct tg3 *tp) tp->tg3_flags2 &= ~TG3_FLG2_USING_MSI; - err = request_irq(tp->pdev->irq, tg3_interrupt, - SA_SHIRQ | SA_SAMPLE_RANDOM, dev->name, dev); + { + irqreturn_t (*fn)(int, void *, struct pt_regs *)=tg3_interrupt; + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + fn = tg3_interrupt_tagged; + err = request_irq(tp->pdev->irq, fn, + SA_SHIRQ | SA_SAMPLE_RANDOM, dev->name, dev); + } if (err) return err; @@ -5948,7 +6010,13 @@ static int tg3_open(struct net_device *dev) if ((tp->tg3_flags2 & TG3_FLG2_5750_PLUS) && (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5750_AX) && (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5750_BX)) { - if (pci_enable_msi(tp->pdev) == 0) { + /* All MSI supporting chips should support tagged + * status. Assert that this is the case. + */ + if (!(tp->tg3_flags & TG3_FLAG_TAGGED_STATUS)) { + printk(KERN_WARNING PFX "%s: MSI without TAGGED? " + "Not using MSI.\n", tp->dev->name); + } else if (pci_enable_msi(tp->pdev) == 0) { u32 msi_mode; msi_mode = tr32(MSGINT_MODE); @@ -5959,9 +6027,14 @@ static int tg3_open(struct net_device *dev) if (tp->tg3_flags2 & TG3_FLG2_USING_MSI) err = request_irq(tp->pdev->irq, tg3_msi, SA_SAMPLE_RANDOM, dev->name, dev); - else - err = request_irq(tp->pdev->irq, tg3_interrupt, + else { + irqreturn_t (*fn)(int, void *, struct pt_regs *)=tg3_interrupt; + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + fn = tg3_interrupt_tagged; + + err = request_irq(tp->pdev->irq, fn, SA_SHIRQ | SA_SAMPLE_RANDOM, dev->name, dev); + } if (err) { if (tp->tg3_flags2 & TG3_FLG2_USING_MSI) { @@ -5980,9 +6053,16 @@ static int tg3_open(struct net_device *dev) tg3_halt(tp, 1); tg3_free_rings(tp); } else { - tp->timer_offset = HZ / 10; - tp->timer_counter = tp->timer_multiplier = 10; - tp->asf_counter = tp->asf_multiplier = (10 * 120); + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + tp->timer_offset = HZ; + else + tp->timer_offset = HZ / 10; + + BUG_ON(tp->timer_offset > HZ); + tp->timer_counter = tp->timer_multiplier = + (HZ / tp->timer_offset); + tp->asf_counter = tp->asf_multiplier = + ((HZ / tp->timer_offset) * 120); init_timer(&tp->timer); tp->timer.expires = jiffies + tp->timer_offset; @@ -6005,6 +6085,7 @@ static int tg3_open(struct net_device *dev) if (tp->tg3_flags2 & TG3_FLG2_USING_MSI) { err = tg3_test_msi(tp); + if (err) { spin_lock_irq(&tp->lock); spin_lock(&tp->tx_lock); @@ -8422,15 +8503,7 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS) tp->tg3_flags2 |= TG3_FLG2_PHY_BER_BUG; - /* Only 5701 and later support tagged irq status mode. - * Also, 5788 chips cannot use tagged irq status. - * - * However, since we are using NAPI avoid tagged irq status - * because the interrupt condition is more difficult to - * fully clear in that mode. - */ tp->coalesce_mode = 0; - if (GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5700_AX && GET_CHIP_REV(tp->pci_chip_rev_id) != CHIPREV_5700_BX) tp->coalesce_mode |= HOSTCC_MODE_32BYTE; @@ -8494,6 +8567,18 @@ static int __devinit tg3_get_invariants(struct tg3 *tp) grc_misc_cfg == GRC_MISC_CFG_BOARD_ID_5788M)) tp->tg3_flags2 |= TG3_FLG2_IS_5788; + if (!(tp->tg3_flags2 & TG3_FLG2_IS_5788) && + (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5700)) + tp->tg3_flags |= TG3_FLAG_TAGGED_STATUS; + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) { + tp->coalesce_mode |= (HOSTCC_MODE_CLRTICK_RXBD | + HOSTCC_MODE_CLRTICK_TXBD); + + tp->misc_host_ctrl |= MISC_HOST_CTRL_TAGGED_STATUS; + pci_write_config_dword(tp->pdev, TG3PCI_MISC_HOST_CTRL, + tp->misc_host_ctrl); + } + /* these are limited to 10/100 only */ if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5703 && (grc_misc_cfg == 0x8000 || grc_misc_cfg == 0x4000)) || diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 8de6f21037ba..44c65160a20f 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2023,6 +2023,7 @@ struct tg3 { struct tg3_hw_status *hw_status; dma_addr_t status_mapping; + u32 last_tag; u32 msg_enable; @@ -2068,6 +2069,7 @@ struct tg3 { u32 rx_offset; u32 tg3_flags; +#define TG3_FLAG_TAGGED_STATUS 0x00000001 #define TG3_FLAG_TXD_MBOX_HWBUG 0x00000002 #define TG3_FLAG_RX_CHECKSUMS 0x00000004 #define TG3_FLAG_USE_LINKCHG_REG 0x00000008 From 15f9850d3c2d46f5851a424d2990a18b5bb5ebfd Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 18 May 2005 22:49:26 -0700 Subject: [PATCH 02/15] [TG3]: Set minimal hw interrupt mitigation. Even though we do software interrupt mitigation via NAPI, it still helps to have some minimal hw assisted mitigation. This helps, particularly, on systems where register I/O overhead is much greater than the CPU horsepower. For example, it helps on NUMA systems. In such cases the PIO overhead to disable interrupts for NAPI accounts for the majority of the packet processing cost. The CPU is fast enough such that only a single packet is processed by each NAPI poll call. Thanks to Michael Chan for reviewing this patch. Signed-off-by: David S. Miller --- drivers/net/tg3.c | 72 ++++++++++++++++++++++++++++++++++++++--------- drivers/net/tg3.h | 6 +++- 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 488b8c65252d..3df5b78d2693 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -2507,7 +2507,7 @@ static int tg3_setup_phy(struct tg3 *tp, int force_reset) if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { if (netif_carrier_ok(tp->dev)) { tw32(HOSTCC_STAT_COAL_TICKS, - DEFAULT_STAT_COAL_TICKS); + tp->coal.stats_block_coalesce_usecs); } else { tw32(HOSTCC_STAT_COAL_TICKS, 0); } @@ -5094,6 +5094,27 @@ static void tg3_set_bdinfo(struct tg3 *tp, u32 bdinfo_addr, } static void __tg3_set_rx_mode(struct net_device *); +static void tg3_set_coalesce(struct tg3 *tp, struct ethtool_coalesce *ec) +{ + tw32(HOSTCC_RXCOL_TICKS, ec->rx_coalesce_usecs); + tw32(HOSTCC_TXCOL_TICKS, ec->tx_coalesce_usecs); + tw32(HOSTCC_RXMAX_FRAMES, ec->rx_max_coalesced_frames); + tw32(HOSTCC_TXMAX_FRAMES, ec->tx_max_coalesced_frames); + if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { + tw32(HOSTCC_RXCOAL_TICK_INT, ec->rx_coalesce_usecs_irq); + tw32(HOSTCC_TXCOAL_TICK_INT, ec->tx_coalesce_usecs_irq); + } + tw32(HOSTCC_RXCOAL_MAXF_INT, ec->rx_max_coalesced_frames_irq); + tw32(HOSTCC_TXCOAL_MAXF_INT, ec->tx_max_coalesced_frames_irq); + if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { + u32 val = ec->stats_block_coalesce_usecs; + + if (!netif_carrier_ok(tp->dev)) + val = 0; + + tw32(HOSTCC_STAT_COAL_TICKS, val); + } +} /* tp->lock is held. */ static int tg3_reset_hw(struct tg3 *tp) @@ -5416,16 +5437,7 @@ static int tg3_reset_hw(struct tg3 *tp) udelay(10); } - tw32(HOSTCC_RXCOL_TICKS, 0); - tw32(HOSTCC_TXCOL_TICKS, LOW_TXCOL_TICKS); - tw32(HOSTCC_RXMAX_FRAMES, 1); - tw32(HOSTCC_TXMAX_FRAMES, LOW_RXMAX_FRAMES); - if (!(tp->tg3_flags2 & TG3_FLG2_5705_PLUS)) { - tw32(HOSTCC_RXCOAL_TICK_INT, 0); - tw32(HOSTCC_TXCOAL_TICK_INT, 0); - } - tw32(HOSTCC_RXCOAL_MAXF_INT, 1); - tw32(HOSTCC_TXCOAL_MAXF_INT, 0); + tg3_set_coalesce(tp, &tp->coal); /* set status block DMA address */ tw32(HOSTCC_STATUS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH, @@ -5438,8 +5450,6 @@ static int tg3_reset_hw(struct tg3 *tp) * the tg3_periodic_fetch_stats call there, and * tg3_get_stats to see how this works for 5705/5750 chips. */ - tw32(HOSTCC_STAT_COAL_TICKS, - DEFAULT_STAT_COAL_TICKS); tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_HIGH, ((u64) tp->stats_mapping >> 32)); tw32(HOSTCC_STATS_BLK_HOST_ADDR + TG3_64BIT_REG_LOW, @@ -7284,6 +7294,14 @@ static void tg3_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid) } #endif +static int tg3_get_coalesce(struct net_device *dev, struct ethtool_coalesce *ec) +{ + struct tg3 *tp = netdev_priv(dev); + + memcpy(ec, &tp->coal, sizeof(*ec)); + return 0; +} + static struct ethtool_ops tg3_ethtool_ops = { .get_settings = tg3_get_settings, .set_settings = tg3_set_settings, @@ -7316,6 +7334,7 @@ static struct ethtool_ops tg3_ethtool_ops = { .get_strings = tg3_get_strings, .get_stats_count = tg3_get_stats_count, .get_ethtool_stats = tg3_get_ethtool_stats, + .get_coalesce = tg3_get_coalesce, }; static void __devinit tg3_get_eeprom_size(struct tg3 *tp) @@ -9096,6 +9115,31 @@ static struct pci_dev * __devinit tg3_find_5704_peer(struct tg3 *tp) return peer; } +static void __devinit tg3_init_coal(struct tg3 *tp) +{ + struct ethtool_coalesce *ec = &tp->coal; + + memset(ec, 0, sizeof(*ec)); + ec->cmd = ETHTOOL_GCOALESCE; + ec->rx_coalesce_usecs = LOW_RXCOL_TICKS; + ec->tx_coalesce_usecs = LOW_TXCOL_TICKS; + ec->rx_max_coalesced_frames = LOW_RXMAX_FRAMES; + ec->tx_max_coalesced_frames = LOW_TXMAX_FRAMES; + ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT; + ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT; + ec->rx_max_coalesced_frames_irq = DEFAULT_RXCOAL_MAXF_INT; + ec->tx_max_coalesced_frames_irq = DEFAULT_TXCOAL_MAXF_INT; + ec->stats_block_coalesce_usecs = DEFAULT_STAT_COAL_TICKS; + + if (tp->coalesce_mode & (HOSTCC_MODE_CLRTICK_RXBD | + HOSTCC_MODE_CLRTICK_TXBD)) { + ec->rx_coalesce_usecs = LOW_RXCOL_TICKS_CLRTCKS; + ec->rx_coalesce_usecs_irq = DEFAULT_RXCOAL_TICK_INT_CLRTCKS; + ec->tx_coalesce_usecs = LOW_TXCOL_TICKS_CLRTCKS; + ec->tx_coalesce_usecs_irq = DEFAULT_TXCOAL_TICK_INT_CLRTCKS; + } +} + static int __devinit tg3_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -9341,6 +9385,8 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, /* flow control autonegotiation is default behavior */ tp->tg3_flags |= TG3_FLAG_PAUSE_AUTONEG; + tg3_init_coal(tp); + err = register_netdev(dev); if (err) { printk(KERN_ERR PFX "Cannot register net device, " diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index 44c65160a20f..993f84c93dc4 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -876,10 +876,12 @@ #define HOSTCC_STATUS_ERROR_ATTN 0x00000004 #define HOSTCC_RXCOL_TICKS 0x00003c08 #define LOW_RXCOL_TICKS 0x00000032 +#define LOW_RXCOL_TICKS_CLRTCKS 0x00000014 #define DEFAULT_RXCOL_TICKS 0x00000048 #define HIGH_RXCOL_TICKS 0x00000096 #define HOSTCC_TXCOL_TICKS 0x00003c0c #define LOW_TXCOL_TICKS 0x00000096 +#define LOW_TXCOL_TICKS_CLRTCKS 0x00000048 #define DEFAULT_TXCOL_TICKS 0x0000012c #define HIGH_TXCOL_TICKS 0x00000145 #define HOSTCC_RXMAX_FRAMES 0x00003c10 @@ -892,8 +894,10 @@ #define HIGH_TXMAX_FRAMES 0x00000052 #define HOSTCC_RXCOAL_TICK_INT 0x00003c18 #define DEFAULT_RXCOAL_TICK_INT 0x00000019 +#define DEFAULT_RXCOAL_TICK_INT_CLRTCKS 0x00000014 #define HOSTCC_TXCOAL_TICK_INT 0x00003c1c #define DEFAULT_TXCOAL_TICK_INT 0x00000019 +#define DEFAULT_TXCOAL_TICK_INT_CLRTCKS 0x00000014 #define HOSTCC_RXCOAL_MAXF_INT 0x00003c20 #define DEFAULT_RXCOAL_MAXF_INT 0x00000005 #define HOSTCC_TXCOAL_MAXF_INT 0x00003c24 @@ -2227,7 +2231,7 @@ struct tg3 { #define SST_25VF0X0_PAGE_SIZE 4098 - + struct ethtool_coalesce coal; }; #endif /* !(_T3_H) */ From 59e6b4343299373bc10dd131ab5142f53ddd838a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 18 May 2005 22:50:10 -0700 Subject: [PATCH 03/15] [TG3]: Refine DMA boundary setting. Extract DMA boundary bit selection into a seperate function, tg3_calc_dma_bndry(). Call this from tg3_test_dma(). Make DMA test more reliable by using no DMA boundry setting during the test. If the test passes, then use the setting we selected before the test. Signed-off-by: David S. Miller Signed-off-by: Michael Chan --- drivers/net/tg3.c | 207 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 163 insertions(+), 44 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 3df5b78d2693..4ab9680ffbd2 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -8775,6 +8775,146 @@ static int __devinit tg3_get_device_address(struct tg3 *tp) return 0; } +#define BOUNDARY_SINGLE_CACHELINE 1 +#define BOUNDARY_MULTI_CACHELINE 2 + +static u32 __devinit tg3_calc_dma_bndry(struct tg3 *tp, u32 val) +{ + int cacheline_size; + u8 byte; + int goal; + + pci_read_config_byte(tp->pdev, PCI_CACHE_LINE_SIZE, &byte); + if (byte == 0) + cacheline_size = 1024; + else + cacheline_size = (int) byte * 4; + + /* On 5703 and later chips, the boundary bits have no + * effect. + */ + if (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5700 && + GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5701 && + !(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) + goto out; + +#if defined(CONFIG_PPC64) || defined(CONFIG_IA64) || defined(CONFIG_PARISC) + goal = BOUNDARY_MULTI_CACHELINE; +#else +#if defined(CONFIG_SPARC64) || defined(CONFIG_ALPHA) + goal = BOUNDARY_SINGLE_CACHELINE; +#else + goal = 0; +#endif +#endif + + if (!goal) + goto out; + + /* PCI controllers on most RISC systems tend to disconnect + * when a device tries to burst across a cache-line boundary. + * Therefore, letting tg3 do so just wastes PCI bandwidth. + * + * Unfortunately, for PCI-E there are only limited + * write-side controls for this, and thus for reads + * we will still get the disconnects. We'll also waste + * these PCI cycles for both read and write for chips + * other than 5700 and 5701 which do not implement the + * boundary bits. + */ + if ((tp->tg3_flags & TG3_FLAG_PCIX_MODE) && + !(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) { + switch (cacheline_size) { + case 16: + case 32: + case 64: + case 128: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val |= (DMA_RWCTRL_READ_BNDRY_128_PCIX | + DMA_RWCTRL_WRITE_BNDRY_128_PCIX); + } else { + val |= (DMA_RWCTRL_READ_BNDRY_384_PCIX | + DMA_RWCTRL_WRITE_BNDRY_384_PCIX); + } + break; + + case 256: + val |= (DMA_RWCTRL_READ_BNDRY_256_PCIX | + DMA_RWCTRL_WRITE_BNDRY_256_PCIX); + break; + + default: + val |= (DMA_RWCTRL_READ_BNDRY_384_PCIX | + DMA_RWCTRL_WRITE_BNDRY_384_PCIX); + break; + }; + } else if (tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) { + switch (cacheline_size) { + case 16: + case 32: + case 64: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val &= ~DMA_RWCTRL_WRITE_BNDRY_DISAB_PCIE; + val |= DMA_RWCTRL_WRITE_BNDRY_64_PCIE; + break; + } + /* fallthrough */ + case 128: + default: + val &= ~DMA_RWCTRL_WRITE_BNDRY_DISAB_PCIE; + val |= DMA_RWCTRL_WRITE_BNDRY_128_PCIE; + break; + }; + } else { + switch (cacheline_size) { + case 16: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val |= (DMA_RWCTRL_READ_BNDRY_16 | + DMA_RWCTRL_WRITE_BNDRY_16); + break; + } + /* fallthrough */ + case 32: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val |= (DMA_RWCTRL_READ_BNDRY_32 | + DMA_RWCTRL_WRITE_BNDRY_32); + break; + } + /* fallthrough */ + case 64: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val |= (DMA_RWCTRL_READ_BNDRY_64 | + DMA_RWCTRL_WRITE_BNDRY_64); + break; + } + /* fallthrough */ + case 128: + if (goal == BOUNDARY_SINGLE_CACHELINE) { + val |= (DMA_RWCTRL_READ_BNDRY_128 | + DMA_RWCTRL_WRITE_BNDRY_128); + break; + } + /* fallthrough */ + case 256: + val |= (DMA_RWCTRL_READ_BNDRY_256 | + DMA_RWCTRL_WRITE_BNDRY_256); + break; + case 512: + val |= (DMA_RWCTRL_READ_BNDRY_512 | + DMA_RWCTRL_WRITE_BNDRY_512); + break; + case 1024: + default: + val |= (DMA_RWCTRL_READ_BNDRY_1024 | + DMA_RWCTRL_WRITE_BNDRY_1024); + break; + }; + } + +out: + return val; +} + static int __devinit tg3_do_test_dma(struct tg3 *tp, u32 *buf, dma_addr_t buf_dma, int size, int to_device) { struct tg3_internal_buffer_desc test_desc; @@ -8861,7 +9001,7 @@ static int __devinit tg3_do_test_dma(struct tg3 *tp, u32 *buf, dma_addr_t buf_dm static int __devinit tg3_test_dma(struct tg3 *tp) { dma_addr_t buf_dma; - u32 *buf; + u32 *buf, saved_dma_rwctrl; int ret; buf = pci_alloc_consistent(tp->pdev, TEST_BUFFER_SIZE, &buf_dma); @@ -8873,46 +9013,7 @@ static int __devinit tg3_test_dma(struct tg3 *tp) tp->dma_rwctrl = ((0x7 << DMA_RWCTRL_PCI_WRITE_CMD_SHIFT) | (0x6 << DMA_RWCTRL_PCI_READ_CMD_SHIFT)); -#ifndef CONFIG_X86 - { - u8 byte; - int cacheline_size; - pci_read_config_byte(tp->pdev, PCI_CACHE_LINE_SIZE, &byte); - - if (byte == 0) - cacheline_size = 1024; - else - cacheline_size = (int) byte * 4; - - switch (cacheline_size) { - case 16: - case 32: - case 64: - case 128: - if ((tp->tg3_flags & TG3_FLAG_PCIX_MODE) && - !(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) { - tp->dma_rwctrl |= - DMA_RWCTRL_WRITE_BNDRY_384_PCIX; - break; - } else if (tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) { - tp->dma_rwctrl &= - ~(DMA_RWCTRL_PCI_WRITE_CMD); - tp->dma_rwctrl |= - DMA_RWCTRL_WRITE_BNDRY_128_PCIE; - break; - } - /* fallthrough */ - case 256: - if (!(tp->tg3_flags & TG3_FLAG_PCIX_MODE) && - !(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) - tp->dma_rwctrl |= - DMA_RWCTRL_WRITE_BNDRY_256; - else if (!(tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS)) - tp->dma_rwctrl |= - DMA_RWCTRL_WRITE_BNDRY_256_PCIX; - }; - } -#endif + tp->dma_rwctrl = tg3_calc_dma_bndry(tp, tp->dma_rwctrl); if (tp->tg3_flags2 & TG3_FLG2_PCI_EXPRESS) { /* DMA read watermark not used on PCIE */ @@ -8931,7 +9032,7 @@ static int __devinit tg3_test_dma(struct tg3 *tp) if (ccval == 0x6 || ccval == 0x7) tp->dma_rwctrl |= DMA_RWCTRL_ONE_DMA; - /* Set bit 23 to renable PCIX hw bug fix */ + /* Set bit 23 to enable PCIX hw bug fix */ tp->dma_rwctrl |= 0x009f0000; } else { tp->dma_rwctrl |= 0x001b000f; @@ -8972,6 +9073,13 @@ static int __devinit tg3_test_dma(struct tg3 *tp) GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5701) goto out; + /* It is best to perform DMA test with maximum write burst size + * to expose the 5700/5701 write DMA bug. + */ + saved_dma_rwctrl = tp->dma_rwctrl; + tp->dma_rwctrl &= ~DMA_RWCTRL_WRITE_BNDRY_MASK; + tw32(TG3PCI_DMA_RW_CTRL, tp->dma_rwctrl); + while (1) { u32 *p = buf, i; @@ -9010,8 +9118,9 @@ static int __devinit tg3_test_dma(struct tg3 *tp) if (p[i] == i) continue; - if ((tp->dma_rwctrl & DMA_RWCTRL_WRITE_BNDRY_MASK) == - DMA_RWCTRL_WRITE_BNDRY_DISAB) { + if ((tp->dma_rwctrl & DMA_RWCTRL_WRITE_BNDRY_MASK) != + DMA_RWCTRL_WRITE_BNDRY_16) { + tp->dma_rwctrl &= ~DMA_RWCTRL_WRITE_BNDRY_MASK; tp->dma_rwctrl |= DMA_RWCTRL_WRITE_BNDRY_16; tw32(TG3PCI_DMA_RW_CTRL, tp->dma_rwctrl); break; @@ -9028,6 +9137,14 @@ static int __devinit tg3_test_dma(struct tg3 *tp) break; } } + if ((tp->dma_rwctrl & DMA_RWCTRL_WRITE_BNDRY_MASK) != + DMA_RWCTRL_WRITE_BNDRY_16) { + /* DMA test passed without adjusting DMA boundary, + * just restore the calculated DMA boundary + */ + tp->dma_rwctrl = saved_dma_rwctrl; + tw32(TG3PCI_DMA_RW_CTRL, tp->dma_rwctrl); + } out: pci_free_consistent(tp->pdev, TEST_BUFFER_SIZE, buf, buf_dma); @@ -9429,6 +9546,8 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, (tp->tg3_flags & TG3_FLAG_SPLIT_MODE) != 0, (tp->tg3_flags2 & TG3_FLG2_NO_ETH_WIRE_SPEED) == 0, (tp->tg3_flags2 & TG3_FLG2_TSO_CAPABLE) != 0); + printk(KERN_INFO "%s: dma_rwctrl[%08x]\n", + dev->name, tp->dma_rwctrl); return 0; From f7383c22246cfccbe912541dd83103009ed2b537 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 18 May 2005 22:50:53 -0700 Subject: [PATCH 04/15] [TG3]: In tg3_poll(), resample status_tag after doing work. Signed-off-by: David S. Miller --- drivers/net/tg3.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 4ab9680ffbd2..4d2bdbdd34e8 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -2869,9 +2869,6 @@ static int tg3_poll(struct net_device *netdev, int *budget) spin_lock_irqsave(&tp->lock, flags); - if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) - tp->last_tag = sblk->status_tag; - /* handle link change and other phy events */ if (!(tp->tg3_flags & (TG3_FLAG_USE_LINKCHG_REG | @@ -2896,7 +2893,6 @@ static int tg3_poll(struct net_device *netdev, int *budget) * All RX "locking" is done by ensuring outside * code synchronizes with dev->poll() */ - done = 1; if (sblk->idx[0].rx_producer != tp->rx_rcb_ptr) { int orig_budget = *budget; int work_done; @@ -2908,12 +2904,14 @@ static int tg3_poll(struct net_device *netdev, int *budget) *budget -= work_done; netdev->quota -= work_done; - - if (work_done >= orig_budget) - done = 0; } + if (tp->tg3_flags & TG3_FLAG_TAGGED_STATUS) + tp->last_tag = sblk->status_tag; + rmb(); + /* if no more work, tell net stack and NIC we're done */ + done = !tg3_has_work(tp); if (done) { spin_lock_irqsave(&tp->lock, flags); __netif_rx_complete(netdev); From d48102007d068df7ba3055cdc1723e12db1ba30f Mon Sep 17 00:00:00 2001 From: Evgeniy Polyakov Date: Wed, 18 May 2005 22:51:45 -0700 Subject: [PATCH 05/15] [XFRM]: skb_cow_data() does not set proper owner for new skbs. It looks like skb_cow_data() does not set proper owner for newly created skb. If we have several fragments for skb and some of them are shared(?) or cloned (like in async IPsec) there might be a situation when we require recreating skb and thus using skb_copy() for it. Newly created skb has neither a destructor nor a socket assotiated with it, which must be copied from the old skb. As far as I can see, current code sets destructor and socket for the first one skb only and uses truesize of the first skb only to increment sk_wmem_alloc value. If above "analysis" is correct then attached patch fixes that. Signed-off-by: Evgeniy Polyakov Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_algo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 080aae243ce0..2f4531fcaca2 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -698,7 +698,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; if (skb1->sk) - skb_set_owner_w(skb, skb1->sk); + skb_set_owner_w(skb2, skb1->sk); /* Looking around. Are we still alive? * OK, link new skb, drop old one */ From 2fdba6b085eb7068e9594cfa55ffe40466184b4d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Wed, 18 May 2005 22:52:33 -0700 Subject: [PATCH 06/15] [IPV4/IPV6] Ensure all frag_list members have NULL sk Having frag_list members which holds wmem of an sk leads to nightmares with partially cloned frag skb's. The reason is that once you unleash a skb with a frag_list that has individual sk ownerships into the stack you can never undo those ownerships safely as they may have been cloned by things like netfilter. Since we have to undo them in order to make skb_linearize happy this approach leads to a dead-end. So let's go the other way and make this an invariant: For any skb on a frag_list, skb->sk must be NULL. That is, the socket ownership always belongs to the head skb. It turns out that the implementation is actually pretty simple. The above invariant is actually violated in the following patch for a short duration inside ip_fragment. This is OK because the offending frag_list member is either destroyed at the end of the slow path without being sent anywhere, or it is detached from the frag_list before being sent. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 ++++++++ net/ipv6/ip6_output.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index daebd93fd8a0..760dc8238d65 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -490,6 +490,14 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } /* Everything is OK. Generate! */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0f0711417c9d..b78a53586804 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -552,13 +552,17 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) skb_headroom(frag) < hlen) goto slow_path; - /* Correct socket ownership. */ - if (frag->sk == NULL) - goto slow_path; - /* Partially cloned skb? */ if (skb_shared(frag)) goto slow_path; + + BUG_ON(frag->sk); + if (skb->sk) { + sock_hold(skb->sk); + frag->sk = skb->sk; + frag->destructor = sock_wfree; + skb->truesize -= frag->truesize; + } } err = 0; @@ -1116,12 +1120,10 @@ int ip6_push_pending_frames(struct sock *sk) tail_skb = &(tmp_skb->next); skb->len += tmp_skb->len; skb->data_len += tmp_skb->len; -#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ skb->truesize += tmp_skb->truesize; __sock_put(tmp_skb->sk); tmp_skb->destructor = NULL; tmp_skb->sk = NULL; -#endif } ipv6_addr_copy(final_dst, &fl->fl6_dst); From f81a0bffa116ea22149aa7cfb0b4ee09096d9d92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 19 May 2005 12:26:43 -0700 Subject: [PATCH 07/15] [AF_UNIX]: Use lookup_create(). currently it opencodes it, but that's in the way of chaning the lookup_hash interface. I'd prefer to disallow modular af_unix over exporting lookup_create, but I'll leave that to you. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/namei.c | 1 + net/unix/af_unix.c | 28 +++------------------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index defe6781e003..dd78f01b6de8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1580,6 +1580,7 @@ enoent: fail: return dentry; } +EXPORT_SYMBOL_GPL(lookup_create); int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c478fc8db776..c420eba4876b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -770,33 +770,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); if (err) goto out_mknod_parent; - /* - * Yucky last component or no last component at all? - * (foo/., foo/.., /////) - */ - err = -EEXIST; - if (nd.last_type != LAST_NORM) - goto out_mknod; - /* - * Lock the directory. - */ - down(&nd.dentry->d_inode->i_sem); - /* - * Do the final lookup. - */ - dentry = lookup_hash(&nd.last, nd.dentry); + + dentry = lookup_create(&nd, 0); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_mknod_unlock; - err = -ENOENT; - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (nd.last.name[nd.last.len] && !dentry->d_inode) - goto out_mknod_dput; + /* * All right, let's create it. */ @@ -845,7 +824,6 @@ out_mknod_dput: dput(dentry); out_mknod_unlock: up(&nd.dentry->d_inode->i_sem); -out_mknod: path_release(&nd); out_mknod_parent: if (err==-EEXIST) From d9fa0f392b20b2b8e3df379c44194492a2446c6e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 19 May 2005 12:29:59 -0700 Subject: [PATCH 08/15] [IP_VS]: Remove extra __ip_vs_conn_put() for incoming ICMP. Remove extra __ip_vs_conn_put for incoming ICMP in direct routing mode. Mark de Vries reports that IPVS connections are not leaked anymore. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_xmit.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index faa6176bbeb1..de21da00057f 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -508,7 +508,6 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, rc = NF_ACCEPT; /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); - __ip_vs_conn_put(cp); goto out; } From 8be58932ca596972e4953ae980d8bc286857cae8 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 19 May 2005 12:36:33 -0700 Subject: [PATCH 09/15] [NETFILTER]: Do not be clever about SKB ownership in ip_ct_gather_frags(). Just do an skb_orphan() and be done with it. Based upon discussions with Herbert Xu on netdev. Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 28 ++++++++------------------ 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 28d9425d5c39..09e824622977 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -940,37 +940,25 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) { - struct sock *sk = skb->sk; #ifdef CONFIG_NETFILTER_DEBUG unsigned int olddebug = skb->nf_debug; #endif - if (sk) { - sock_hold(sk); - skb_orphan(skb); - } + skb_orphan(skb); local_bh_disable(); skb = ip_defrag(skb, user); local_bh_enable(); - if (!skb) { - if (sk) - sock_put(sk); - return skb; - } - - if (sk) { - skb_set_owner_w(skb, sk); - sock_put(sk); - } - - ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; + if (skb) { + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; #ifdef CONFIG_NETFILTER_DEBUG - /* Packet path as if nothing had happened. */ - skb->nf_debug = olddebug; + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; #endif + } + return skb; } From b9e9dead05b19e7f52c9aa00cd3a5b7ac4fcacf4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:04 -0700 Subject: [PATCH 10/15] [IPSEC]: Fixed alg_key_len usage in attach_one_algo The variable alg_key_len is in bits and not bytes. The function attach_one_algo is currently using it as if it were in bytes. This causes it to read memory which may not be there. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5ddda2c98af9..15ba08602aa1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -162,6 +162,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, struct rtattr *rta = u_arg; struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; + int len; if (!rta) return 0; @@ -173,11 +174,12 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return -ENOSYS; *props = algo->desc.sadb_alg_id; - p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); + len = sizeof(*ualg) + (ualg->alg_key_len + 7U) / 8; + p = kmalloc(len, GFP_KERNEL); if (!p) return -ENOMEM; - memcpy(p, ualg, sizeof(*ualg) + ualg->alg_key_len); + memcpy(p, ualg, len); *algpp = p; return 0; } From 31c26852cb2ac77f1d4acb37bcf31f165fd5eb68 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 19 May 2005 12:39:49 -0700 Subject: [PATCH 11/15] [IPSEC]: Verify key payload in verify_one_algo We need to verify that the payload contains enough data so that attach_one_algo can copy alg_key_len bits from the payload. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 15ba08602aa1..97509011c274 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -34,14 +34,21 @@ static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) { struct rtattr *rt = xfrma[type - 1]; struct xfrm_algo *algp; + int len; if (!rt) return 0; - if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp)) + len = (rt->rta_len - sizeof(*rt)) - sizeof(*algp); + if (len < 0) return -EINVAL; algp = RTA_DATA(rt); + + len -= (algp->alg_key_len + 7U) / 8; + if (len < 0) + return -EINVAL; + switch (type) { case XFRMA_ALG_AUTH: if (!algp->alg_key_len && From 1eda339e76a9aac05883c548028bf91aed734783 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Thu, 19 May 2005 12:42:39 -0700 Subject: [PATCH 12/15] [PKT_SCHED]: Fixup simple action define. Make it consistent with other net/sched files Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/act_generic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/act_generic.h b/include/net/act_generic.h index 95b120781c14..c9daa7e52300 100644 --- a/include/net/act_generic.h +++ b/include/net/act_generic.h @@ -2,8 +2,8 @@ * include/net/act_generic.h * */ -#ifndef ACT_GENERIC_H -#define ACT_GENERIC_H +#ifndef _NET_ACT_GENERIC_H +#define _NET_ACT_GENERIC_H static inline int tcf_defact_release(struct tcf_defact *p, int bind) { int ret = 0; From db61ecc3352d72513c1b07805bd6f760e30c001b Mon Sep 17 00:00:00 2001 From: "Tommy S. Christensen" Date: Thu, 19 May 2005 12:46:59 -0700 Subject: [PATCH 13/15] [NETLINK]: Fix race with recvmsg(). This bug causes: assertion (!atomic_read(&sk->sk_rmem_alloc)) failed at net/netlink/af_netlink.c (122) What's happening is that: 1) The skb is sent to socket 1. 2) Someone does a recvmsg on socket 1 and drops the ref on the skb. Note that the rmalloc is not returned at this point since the skb is still referenced. 3) The same skb is now sent to socket 2. This version of the fix resurrects the skb_orphan call that was moved out, last time we had 'shared-skb troubles'. It is practically a no-op in the common case, but still prevents the possible race with recvmsg. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 733bf52cef3e..b40c9a976969 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -697,6 +697,7 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { + skb_orphan(skb); skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); From 68acc024ea7391e03c2c695ba0b9fb31baa974bf Mon Sep 17 00:00:00 2001 From: "Tommy S. Christensen" Date: Thu, 19 May 2005 13:06:35 -0700 Subject: [PATCH 14/15] [NETLINK]: Move broadcast skb_orphan to the skb_get path. Cloned packets don't need the orphan call. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b40c9a976969..4b91f4b84cb7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -697,7 +697,6 @@ static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { - skb_orphan(skb); skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); @@ -736,11 +735,15 @@ static inline int do_one_broadcast(struct sock *sk, sock_hold(sk); if (p->skb2 == NULL) { - if (atomic_read(&p->skb->users) != 1) { + if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { - p->skb2 = p->skb; - atomic_inc(&p->skb->users); + p->skb2 = skb_get(p->skb); + /* + * skb ownership may have been set when + * delivered to a previous socket. + */ + skb_orphan(p->skb2); } } if (p->skb2 == NULL) { From aa1c6a6f7f0518b42994d02756a41cbfdcac1916 Mon Sep 17 00:00:00 2001 From: "Tommy S. Christensen" Date: Thu, 19 May 2005 13:07:32 -0700 Subject: [PATCH 15/15] [NETLINK]: Defer socket destruction a bit In netlink_broadcast() we're sending shared skb's to netlink listeners when possible (saves some copying). This is OK, since we hold the only other reference to the skb. However, this implies that we must drop our reference on the skb, before allowing a receiving socket to disappear. Otherwise, the socket buffer accounting is disrupted. Signed-off-by: Tommy S. Christensen Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4b91f4b84cb7..e41ce458c2a9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -789,11 +789,12 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); + kfree_skb(skb); + netlink_unlock_table(); if (info.skb2) kfree_skb(info.skb2); - kfree_skb(skb); if (info.delivered) { if (info.congested && (allocation & __GFP_WAIT))