From 5f286e113fa377e50bd18fc45e5a0d4d83f6950c Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 28 Apr 2007 20:57:37 -0700 Subject: [PATCH 01/21] [NETPOLL]: Fix TX queue overflow in trapped mode. CONFIG_NETPOLL_TRAP causes the TX queue controls to be completely bypassed in the netpoll's "trapped" mode which easily causes overflows in the drivers with short TX queues (most notably, in 8139too with its 4-deep queue). So, make this option more sensible by making it only bypass the TX softirq wakeup. Signed-off-by: Sergei Shtylyov Acked-by: Jeff Garzik Acked-by: Tom Rini Acked-by: Matt Mackall Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e027a3750a77..24cef42f1e0f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -654,8 +654,10 @@ static inline void netif_start_queue(struct net_device *dev) static inline void netif_wake_queue(struct net_device *dev) { #ifdef CONFIG_NETPOLL_TRAP - if (netpoll_trap()) + if (netpoll_trap()) { + clear_bit(__LINK_STATE_XOFF, &dev->state); return; + } #endif if (test_and_clear_bit(__LINK_STATE_XOFF, &dev->state)) __netif_schedule(dev); @@ -663,10 +665,6 @@ static inline void netif_wake_queue(struct net_device *dev) static inline void netif_stop_queue(struct net_device *dev) { -#ifdef CONFIG_NETPOLL_TRAP - if (netpoll_trap()) - return; -#endif set_bit(__LINK_STATE_XOFF, &dev->state); } From d16bfd0c77b35a0a0f0e159b94b9921510bf7934 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 28 Apr 2007 20:58:22 -0700 Subject: [PATCH 02/21] [NETPOLL]: Remove CONFIG_NETPOLL_RX Get rid of the CONFIG_NETPOLL_RX option completely since all the dependencies have been removed long ago... Signed-off-by: Sergei Shtylyov Acked-by: Jeff Garzik Acked-by: Matt Mackall Signed-off-by: David S. 
Miller --- drivers/net/Kconfig | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index a3d46ea37126..32a3003893d8 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2929,11 +2929,6 @@ endif #NETDEVICES config NETPOLL def_bool NETCONSOLE -config NETPOLL_RX - bool "Netpoll support for trapping incoming packets" - default n - depends on NETPOLL - config NETPOLL_TRAP bool "Netpoll traffic trapping" default n From 5a1b5898ee9e0bf68a86609ecb9775457b1857a5 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 28 Apr 2007 21:04:03 -0700 Subject: [PATCH 03/21] [NET]: Remove NETIF_F_INTERNAL_STATS, default to internal stats. Herbert Xu convinced me that a new flag was overkill; every driver currently overrides get_stats, so we might as well make the internal one the default. If someone did fail to set get_stats, they would now get all 0 stats instead of "No statistics available". Signed-off-by: Rusty Russell Acked-by: Herbert Xu Signed-off-by: David S. 
Miller --- arch/s390/appldata/appldata_net_sum.c | 3 -- drivers/net/bonding/bond_main.c | 51 +++++++++++---------------- drivers/parisc/led.c | 2 -- include/linux/netdevice.h | 1 - net/core/dev.c | 45 +++++++++++------------ 5 files changed, 41 insertions(+), 61 deletions(-) diff --git a/arch/s390/appldata/appldata_net_sum.c b/arch/s390/appldata/appldata_net_sum.c index 516b3ac9a9b5..a43f3488fecf 100644 --- a/arch/s390/appldata/appldata_net_sum.c +++ b/arch/s390/appldata/appldata_net_sum.c @@ -109,9 +109,6 @@ static void appldata_get_net_sum_data(void *data) read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { stats = dev->get_stats(dev); - if (stats == NULL) { - continue; - } rx_packets += stats->rx_packets; tx_packets += stats->tx_packets; rx_bytes += stats->rx_bytes; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index cea3783c92c5..724bce51f936 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1360,13 +1360,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) goto err_undo_flags; } - if (slave_dev->get_stats == NULL) { - printk(KERN_NOTICE DRV_NAME - ": %s: the driver for slave device %s does not provide " - "get_stats function, network statistics will be " - "inaccurate.\n", bond_dev->name, slave_dev->name); - } - new_slave = kzalloc(sizeof(struct slave), GFP_KERNEL); if (!new_slave) { res = -ENOMEM; @@ -3641,33 +3634,31 @@ static struct net_device_stats *bond_get_stats(struct net_device *bond_dev) bond_for_each_slave(bond, slave, i) { sstats = slave->dev->get_stats(slave->dev); - if (sstats) { - stats->rx_packets += sstats->rx_packets; - stats->rx_bytes += sstats->rx_bytes; - stats->rx_errors += sstats->rx_errors; - stats->rx_dropped += sstats->rx_dropped; + stats->rx_packets += sstats->rx_packets; + stats->rx_bytes += sstats->rx_bytes; + stats->rx_errors += sstats->rx_errors; + stats->rx_dropped += sstats->rx_dropped; - 
stats->tx_packets += sstats->tx_packets; - stats->tx_bytes += sstats->tx_bytes; - stats->tx_errors += sstats->tx_errors; - stats->tx_dropped += sstats->tx_dropped; + stats->tx_packets += sstats->tx_packets; + stats->tx_bytes += sstats->tx_bytes; + stats->tx_errors += sstats->tx_errors; + stats->tx_dropped += sstats->tx_dropped; - stats->multicast += sstats->multicast; - stats->collisions += sstats->collisions; + stats->multicast += sstats->multicast; + stats->collisions += sstats->collisions; - stats->rx_length_errors += sstats->rx_length_errors; - stats->rx_over_errors += sstats->rx_over_errors; - stats->rx_crc_errors += sstats->rx_crc_errors; - stats->rx_frame_errors += sstats->rx_frame_errors; - stats->rx_fifo_errors += sstats->rx_fifo_errors; - stats->rx_missed_errors += sstats->rx_missed_errors; + stats->rx_length_errors += sstats->rx_length_errors; + stats->rx_over_errors += sstats->rx_over_errors; + stats->rx_crc_errors += sstats->rx_crc_errors; + stats->rx_frame_errors += sstats->rx_frame_errors; + stats->rx_fifo_errors += sstats->rx_fifo_errors; + stats->rx_missed_errors += sstats->rx_missed_errors; - stats->tx_aborted_errors += sstats->tx_aborted_errors; - stats->tx_carrier_errors += sstats->tx_carrier_errors; - stats->tx_fifo_errors += sstats->tx_fifo_errors; - stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors; - stats->tx_window_errors += sstats->tx_window_errors; - } + stats->tx_aborted_errors += sstats->tx_aborted_errors; + stats->tx_carrier_errors += sstats->tx_carrier_errors; + stats->tx_fifo_errors += sstats->tx_fifo_errors; + stats->tx_heartbeat_errors += sstats->tx_heartbeat_errors; + stats->tx_window_errors += sstats->tx_window_errors; } read_unlock_bh(&bond->lock); diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index 453e6829756c..3df82fe9ce8c 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -373,8 +373,6 @@ static __inline__ int led_get_net_activity(void) if (LOOPBACK(in_dev->ifa_list->ifa_local)) continue; 
stats = dev->get_stats(dev); - if (!stats) - continue; rx_total += stats->rx_packets; tx_total += stats->tx_packets; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 24cef42f1e0f..ac0c92b1e002 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -325,7 +325,6 @@ struct net_device #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ #define NETIF_F_GSO 2048 /* Enable software GSO. */ #define NETIF_F_LLTX 4096 /* LockLess TX */ -#define NETIF_F_INTERNAL_STATS 8192 /* Use stats structure in net_device */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 diff --git a/net/core/dev.c b/net/core/dev.c index d5e42d13bd67..eb999003bbb7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2101,26 +2101,23 @@ static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { struct net_device_stats *stats = dev->get_stats(dev); - if (stats) { - seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " - "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", - dev->name, stats->rx_bytes, stats->rx_packets, - stats->rx_errors, - stats->rx_dropped + stats->rx_missed_errors, - stats->rx_fifo_errors, - stats->rx_length_errors + stats->rx_over_errors + - stats->rx_crc_errors + stats->rx_frame_errors, - stats->rx_compressed, stats->multicast, - stats->tx_bytes, stats->tx_packets, - stats->tx_errors, stats->tx_dropped, - stats->tx_fifo_errors, stats->collisions, - stats->tx_carrier_errors + - stats->tx_aborted_errors + - stats->tx_window_errors + - stats->tx_heartbeat_errors, - stats->tx_compressed); - } else - seq_printf(seq, "%6s: No statistics available.\n", dev->name); + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + 
stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); } /* @@ -3257,11 +3254,9 @@ out: mutex_unlock(&net_todo_run_mutex); } -static struct net_device_stats *maybe_internal_stats(struct net_device *dev) +static struct net_device_stats *internal_stats(struct net_device *dev) { - if (dev->features & NETIF_F_INTERNAL_STATS) - return &dev->stats; - return NULL; + return &dev->stats; } /** @@ -3299,7 +3294,7 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, if (sizeof_priv) dev->priv = netdev_priv(dev); - dev->get_stats = maybe_internal_stats; + dev->get_stats = internal_stats; setup(dev); strcpy(dev->name, name); return dev; From aad97f38b71dd2ecd730b3a3dce8264d13fbcd56 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 28 Apr 2007 21:09:04 -0700 Subject: [PATCH 04/21] [SCTP]: Fix sctp_getsockopt_local_addrs_old() to use local storage. sctp_getsockopt_local_addrs_old() in net/sctp/socket.c calls copy_to_user() while the spinlock addr_lock is held. this should not be done as copy_to_user() might sleep. the call to sctp_copy_laddrs_to_user() while holding the lock is also problematic as it calls copy_to_user() Signed-off-by: Vlad Yasevich Signed-off-by: David S. 
Miller --- net/sctp/socket.c | 96 +++++++++++++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 36 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 11938fb20395..2fc0a92caa78 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3987,7 +3987,7 @@ static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, memcpy(&temp, &from->ipaddr, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; - if(space_left < addrlen) + if (space_left < addrlen) return -ENOMEM; if (copy_to_user(to, &temp, addrlen)) return -EFAULT; @@ -4076,8 +4076,9 @@ done: /* Helper function that copies local addresses to user and returns the number * of addresses copied. */ -static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_addrs, - void __user *to) +static int sctp_copy_laddrs_old(struct sock *sk, __u16 port, + int max_addrs, void *to, + int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -4094,10 +4095,10 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) - return -EFAULT; + memcpy(to, &temp, addrlen); to += addrlen; + *bytes_copied += addrlen; cnt ++; if (cnt >= max_addrs) break; } @@ -4105,8 +4106,8 @@ static int sctp_copy_laddrs_to_user_old(struct sock *sk, __u16 port, int max_add return cnt; } -static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, - void __user **to, size_t space_left) +static int sctp_copy_laddrs(struct sock *sk, __u16 port, void *to, + size_t space_left, int *bytes_copied) { struct list_head *pos, *next; struct sctp_sockaddr_entry *addr; @@ -4123,14 +4124,14 @@ static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, 
sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_leftaddress_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user_old(sk, bp->port, - getaddrs.addr_num, - to); - if (cnt < 0) { - err = cnt; - goto unlock; - } + cnt = sctp_copy_laddrs_old(sk, bp->port, + getaddrs.addr_num, + addrs, &bytes_copied); goto copy_getaddrs; } } @@ -4206,22 +4214,29 @@ static int sctp_getsockopt_local_addrs_old(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; - } + memcpy(addrs, &temp, addrlen); to += addrlen; + bytes_copied += addrlen; cnt ++; if (cnt >= getaddrs.addr_num) break; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + /* copy the entire address list into the user provided space */ + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } + + /* copy the leading structure back to user */ getaddrs.addr_num = cnt; if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs_old))) err = -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } @@ -4241,7 +4256,8 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, rwlock_t *addr_lock; int err = 0; size_t space_left; - int bytes_copied; + int bytes_copied = 0; + void *addrs; if (len <= sizeof(struct sctp_getaddrs)) return -EINVAL; @@ -4269,6 +4285,9 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, to = optval + offsetof(struct sctp_getaddrs,addrs); space_left = len - sizeof(struct sctp_getaddrs) - offsetof(struct sctp_getaddrs,addrs); + addrs = kmalloc(space_left, GFP_KERNEL); + if (!addrs) + return -ENOMEM; sctp_read_lock(addr_lock); @@ -4279,11 +4298,11 @@ static int 
sctp_getsockopt_local_addrs(struct sock *sk, int len, addr = list_entry(bp->address_list.next, struct sctp_sockaddr_entry, list); if (sctp_is_any(&addr->a)) { - cnt = sctp_copy_laddrs_to_user(sk, bp->port, - &to, space_left); + cnt = sctp_copy_laddrs(sk, bp->port, addrs, + space_left, &bytes_copied); if (cnt < 0) { err = cnt; - goto unlock; + goto error; } goto copy_getaddrs; } @@ -4294,26 +4313,31 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len, memcpy(&temp, &addr->a, sizeof(temp)); sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; - if(space_left < addrlen) - return -ENOMEM; /*fixme: right error?*/ - if (copy_to_user(to, &temp, addrlen)) { - err = -EFAULT; - goto unlock; + if (space_left < addrlen) { + err = -ENOMEM; /*fixme: right error?*/ + goto error; } + memcpy(addrs, &temp, addrlen); to += addrlen; + bytes_copied += addrlen; cnt ++; space_left -= addrlen; } copy_getaddrs: + sctp_read_unlock(addr_lock); + + if (copy_to_user(to, addrs, bytes_copied)) { + err = -EFAULT; + goto error; + } if (put_user(cnt, &((struct sctp_getaddrs __user *)optval)->addr_num)) return -EFAULT; - bytes_copied = ((char __user *)to) - optval; if (put_user(bytes_copied, optlen)) return -EFAULT; -unlock: - sctp_read_unlock(addr_lock); +error: + kfree(addrs); return err; } From 5632c5152aa621885d87ea0b8fdd5a6bb9f69c6f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sat, 28 Apr 2007 21:16:39 -0700 Subject: [PATCH 05/21] [IPV6]: Track device renames in snmp6. When network devices are renamed, the IPV6 snmp6 code gets confused. It doesn't track name changes so it will OOPS when network devices are removed. The fix is trivial, just unregister/re-register in notify handler. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 6 ++++-- net/ipv6/proc.c | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e04e49373505..3452433cbc96 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2359,8 +2359,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGENAME: -#ifdef CONFIG_SYSCTL if (idev) { + snmp6_unregister_dev(idev); +#ifdef CONFIG_SYSCTL addrconf_sysctl_unregister(&idev->cnf); neigh_sysctl_unregister(idev->nd_parms); neigh_sysctl_register(dev, idev->nd_parms, @@ -2368,8 +2369,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, &ndisc_ifinfo_sysctl_change, NULL); addrconf_sysctl_register(idev, &idev->cnf); - } #endif + snmp6_register_dev(idev); + } break; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index acb306a5dd56..920dc9cf6a84 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -223,6 +223,7 @@ int snmp6_unregister_dev(struct inet6_dev *idev) return -EINVAL; remove_proc_entry(idev->stats.proc_dir_entry->name, proc_net_devsnmp6); + idev->stats.proc_dir_entry = NULL; return 0; } From ecfd6b183780c6d9e85873693b3ce6c5f4d08b58 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 28 Apr 2007 21:20:32 -0700 Subject: [PATCH 06/21] [XFRM]: Export SPD info With this patch you can use iproute2 in user space to efficiently see how many policies exist in different directions. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. 
Miller --- include/linux/xfrm.h | 35 +++++++++++++++++++ include/net/xfrm.h | 13 +++++++ net/xfrm/xfrm_policy.c | 16 ++++++++- net/xfrm/xfrm_user.c | 77 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 1 deletion(-) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 9c656a5cf842..a5d53e0fe152 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -185,6 +185,11 @@ enum { #define XFRM_MSG_NEWSADINFO XFRM_MSG_NEWSADINFO XFRM_MSG_GETSADINFO, #define XFRM_MSG_GETSADINFO XFRM_MSG_GETSADINFO + + XFRM_MSG_NEWSPDINFO, +#define XFRM_MSG_NEWSPDINFO XFRM_MSG_NEWSPDINFO + XFRM_MSG_GETSPDINFO, +#define XFRM_MSG_GETSPDINFO XFRM_MSG_GETSPDINFO __XFRM_MSG_MAX }; #define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1) @@ -290,6 +295,36 @@ enum xfrm_sadattr_type_t { #define XFRMA_SAD_MAX (__XFRMA_SAD_MAX - 1) }; +/* SPD Table filter flags */ +enum xfrm_spd_ftype_t { + XFRM_SPD_UNSPEC, + XFRM_SPD_HMASK=1, + XFRM_SPD_HMAX=2, + XFRM_SPD_ICNT=4, + XFRM_SPD_OCNT=8, + XFRM_SPD_FCNT=16, + XFRM_SPD_ISCNT=32, + XFRM_SPD_OSCNT=64, + XFRM_SPD_FSCNT=128, + __XFRM_SPD_MAX + +#define XFRM_SPD_MAX (__XFRM_SPD_MAX - 1) +}; +enum xfrm_spdattr_type_t { + XFRMA_SPD_UNSPEC, + XFRMA_SPDHMASK, + XFRMA_SPDHMAX, + XFRMA_SPDICNT, + XFRMA_SPDOCNT, + XFRMA_SPDFCNT, + XFRMA_SPDISCNT, + XFRMA_SPDOSCNT, + XFRMA_SPDFSCNT, + __XFRMA_SPD_MAX + +#define XFRMA_SPD_MAX (__XFRMA_SPD_MAX - 1) +}; + struct xfrm_usersa_info { struct xfrm_selector sel; struct xfrm_id id; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 8287081d77f2..9561bf817b02 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -423,6 +423,18 @@ struct xfrm_sadinfo u32 sadhmcnt; /* max allowed hash bkts */ u32 sadcnt; /* current running count */ }; + +struct xfrm_spdinfo +{ + u32 incnt; + u32 outcnt; + u32 fwdcnt; + u32 inscnt; + u32 outscnt; + u32 fwdscnt; + u32 spdhcnt; + u32 spdhmcnt; +}; #ifdef CONFIG_AUDITSYSCALL extern void xfrm_audit_log(uid_t auid, u32 secid, int type, int result, struct 
xfrm_policy *xp, struct xfrm_state *x); @@ -946,6 +958,7 @@ extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq); extern int xfrm_state_delete(struct xfrm_state *x); extern void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info); extern void xfrm_sad_getinfo(struct xfrm_sadinfo *si); +extern void xfrm_spd_getinfo(struct xfrm_spdinfo *si); extern int xfrm_replay_check(struct xfrm_state *x, __be32 seq); extern void xfrm_replay_advance(struct xfrm_state *x, __be32 seq); extern void xfrm_replay_notify(struct xfrm_state *x, int event); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 762926009c04..dbf9d96a2f0b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -579,8 +579,22 @@ static inline int xfrm_byidx_should_resize(int total) return 0; } -static DEFINE_MUTEX(hash_resize_mutex); +void xfrm_spd_getinfo(struct xfrm_spdinfo *si) +{ + read_lock_bh(&xfrm_policy_lock); + si->incnt = xfrm_policy_count[XFRM_POLICY_IN]; + si->outcnt = xfrm_policy_count[XFRM_POLICY_OUT]; + si->fwdcnt = xfrm_policy_count[XFRM_POLICY_FWD]; + si->inscnt = xfrm_policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; + si->outscnt = xfrm_policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; + si->fwdscnt = xfrm_policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; + si->spdhcnt = xfrm_idx_hmask; + si->spdhmcnt = xfrm_policy_hashmax; + read_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_spd_getinfo); +static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *__unused) { int dir, total; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 69110fed64b6..4210d91624cd 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -672,6 +672,81 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, return skb; } +static int build_spdinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) +{ + struct xfrm_spdinfo si; + struct nlmsghdr *nlh; + u32 *f; + + nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSPDINFO, sizeof(u32), 0); + 
if (nlh == NULL) /* shouldnt really happen ... */ + return -EMSGSIZE; + + f = nlmsg_data(nlh); + *f = flags; + xfrm_spd_getinfo(&si); + + if (flags & XFRM_SPD_HMASK) + NLA_PUT_U32(skb, XFRMA_SPDHMASK, si.spdhcnt); + if (flags & XFRM_SPD_HMAX) + NLA_PUT_U32(skb, XFRMA_SPDHMAX, si.spdhmcnt); + if (flags & XFRM_SPD_ICNT) + NLA_PUT_U32(skb, XFRMA_SPDICNT, si.incnt); + if (flags & XFRM_SPD_OCNT) + NLA_PUT_U32(skb, XFRMA_SPDOCNT, si.outcnt); + if (flags & XFRM_SPD_FCNT) + NLA_PUT_U32(skb, XFRMA_SPDFCNT, si.fwdcnt); + if (flags & XFRM_SPD_ISCNT) + NLA_PUT_U32(skb, XFRMA_SPDISCNT, si.inscnt); + if (flags & XFRM_SPD_OSCNT) + NLA_PUT_U32(skb, XFRMA_SPDOSCNT, si.inscnt); + if (flags & XFRM_SPD_FSCNT) + NLA_PUT_U32(skb, XFRMA_SPDFSCNT, si.inscnt); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, + struct rtattr **xfrma) +{ + struct sk_buff *r_skb; + u32 *flags = NLMSG_DATA(nlh); + u32 spid = NETLINK_CB(skb).pid; + u32 seq = nlh->nlmsg_seq; + int len = NLMSG_LENGTH(sizeof(u32)); + + + if (*flags & XFRM_SPD_HMASK) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_HMAX) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_ICNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_OCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_FCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_ISCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_OSCNT) + len += RTA_SPACE(sizeof(u32)); + if (*flags & XFRM_SPD_FSCNT) + len += RTA_SPACE(sizeof(u32)); + + r_skb = alloc_skb(len, GFP_ATOMIC); + if (r_skb == NULL) + return -ENOMEM; + + if (build_spdinfo(r_skb, spid, seq, *flags) < 0) + BUG(); + + return nlmsg_unicast(xfrm_nl, r_skb, spid); +} + static int build_sadinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags) { struct xfrm_sadinfo si; @@ -1879,6 +1954,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { 
[XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report), [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id), [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)), }; #undef XMSGSIZE @@ -1907,6 +1983,7 @@ static struct xfrm_link { [XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae }, [XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate }, [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo }, + [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) From 65bb723c9502b7ba0a3aad13bdac8832e213ba74 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 28 Apr 2007 21:21:46 -0700 Subject: [PATCH 07/21] [TCP]: Update references in two old comments This updates references to drafts in comments which must be about 10 years old. Internet draft draft-ietf-tcpimpl-prob-03.txt expired in 1998 and was replaced by RFC 2525 in March 1999. Section 3.10 of the draft maps almost identically into section 2.17 of RFC 2525: both are entitled "Failure to RST on close with data pending", the differences in text body amount to a typo and minor sentence change. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 14 ++++++-------- net/ipv4/tcp_output.c | 2 +- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2cf9a898ce50..d6e488668171 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1573,14 +1573,12 @@ void tcp_close(struct sock *sk, long timeout) sk_stream_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. 
To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e70a6840cb64..b5fa3c19afee 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2035,7 +2035,7 @@ void tcp_send_fin(struct sock *sk) /* We get here when a process closes a file descriptor (either due to * an explicit close() or as a byproduct of exit()'ing) and there * was unread data in the receive queue. This behavior is recommended - * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + * by RFC 2525, section 2.17. -DaveM */ void tcp_send_active_reset(struct sock *sk, gfp_t priority) { From d0772b70faaf8e9f2013b6c4273d94d5eac8047a Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 28 Apr 2007 21:26:23 -0700 Subject: [PATCH 08/21] [IPV6]: Fix slab corruption running ip6sic From: Eric Sesterhenn Signed-off-by: Andrew Morton Signed-off-by: David S. 
Miller --- net/ipv6/xfrm6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 538499a89975..5502cc948dfb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -261,7 +261,7 @@ static int xfrm6_tunnel_rcv(struct sk_buff *skb) __be32 spi; spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); - return xfrm6_rcv_spi(skb, spi); + return xfrm6_rcv_spi(skb, spi) > 0 ? : 0; } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, From 04b090d50c88ac8e5ec9c2e985bb65bd153893aa Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Sat, 28 Apr 2007 23:03:59 -0700 Subject: [PATCH 09/21] [AF_IUCV/IUCV]: smp_call_function deadlock Calling smp_call_function can lead to a deadlock if it is called from tasklet context. Fixing this deadlock requires to move the smp_call_function from the tasklet context to a work queue. To do that queue the path pending interrupts to a separate list and move the path cleanup out of iucv_path_sever to iucv_path_connect and iucv_path_pending. This creates a new requirement for iucv_path_connect: it may not be called from tasklet context anymore. Also fixed compile problem for CONFIG_HOTPLUG_CPU=n and another one when walking the cpu_online mask. When doing this, we must disable cpu hotplug. Signed-off-by: Frank Pavlic Signed-off-by: Martin Schwidefsky Signed-off-by: David S. Miller --- include/net/iucv/iucv.h | 2 +- net/iucv/iucv.c | 205 ++++++++++++++++++++++++++-------------- 2 files changed, 133 insertions(+), 74 deletions(-) diff --git a/include/net/iucv/iucv.h b/include/net/iucv/iucv.h index 746e7416261e..fd70adbb3566 100644 --- a/include/net/iucv/iucv.h +++ b/include/net/iucv/iucv.h @@ -16,7 +16,7 @@ * completed a register, it can exploit the other functions. 
* For furthur reference on all IUCV functionality, refer to the * CP Programming Services book, also available on the web thru - * www.ibm.com/s390/vm/pubs, manual # SC24-5760 + * www.vm.ibm.com/pubs, manual # SC24-6084 * * Definition of Return Codes * - All positive return codes including zero are reflected back diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 60f293842a39..903bdb6eaaa1 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -90,20 +90,43 @@ struct iucv_irq_data { u32 res2[8]; }; -struct iucv_work { +struct iucv_irq_list { struct list_head list; struct iucv_irq_data data; }; -static LIST_HEAD(iucv_work_queue); -static DEFINE_SPINLOCK(iucv_work_lock); - static struct iucv_irq_data *iucv_irq_data; static cpumask_t iucv_buffer_cpumask = CPU_MASK_NONE; static cpumask_t iucv_irq_cpumask = CPU_MASK_NONE; -static void iucv_tasklet_handler(unsigned long); -static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_handler,0); +/* + * Queue of interrupt buffers lock for delivery via the tasklet + * (fast but can't call smp_call_function). + */ +static LIST_HEAD(iucv_task_queue); + +/* + * The tasklet for fast delivery of iucv interrupts. + */ +static void iucv_tasklet_fn(unsigned long); +static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0); + +/* + * Queue of interrupt buffers for delivery via a work queue + * (slower but can call smp_call_function). + */ +static LIST_HEAD(iucv_work_queue); + +/* + * The work element to deliver path pending interrupts. + */ +static void iucv_work_fn(struct work_struct *work); +static DECLARE_WORK(iucv_work, iucv_work_fn); + +/* + * Spinlock protecting task and work queue. + */ +static DEFINE_SPINLOCK(iucv_queue_lock); enum iucv_command_codes { IUCV_QUERY = 0, @@ -147,10 +170,10 @@ static unsigned long iucv_max_pathid; static DEFINE_SPINLOCK(iucv_table_lock); /* - * iucv_tasklet_cpu: contains the number of the cpu executing the tasklet. - * Needed for iucv_path_sever called from tasklet. 
+ * iucv_active_cpu: contains the number of the cpu executing the tasklet + * or the work handler. Needed for iucv_path_sever called from tasklet. */ -static int iucv_tasklet_cpu = -1; +static int iucv_active_cpu = -1; /* * Mutex and wait queue for iucv_register/iucv_unregister. @@ -449,17 +472,19 @@ static void iucv_setmask_mp(void) { int cpu; + preempt_disable(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_on(iucv_allow_cpu, NULL, 0, 1, cpu); + preempt_enable(); } /** * iucv_setmask_up * - * Allow iucv interrupts on a single cpus. + * Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -493,8 +518,10 @@ static int iucv_enable(void) goto out; /* Declare per cpu buffers. */ rc = -EIO; + preempt_disable(); for_each_online_cpu(cpu) smp_call_function_on(iucv_declare_cpu, NULL, 0, 1, cpu); + preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ goto out_path; @@ -584,48 +611,49 @@ static int iucv_sever_pathid(u16 pathid, u8 userdata[16]) return iucv_call_b2f0(IUCV_SEVER, parm); } +#ifdef CONFIG_SMP /** - * __iucv_cleanup_pathid + * __iucv_cleanup_queue * @dummy: unused dummy argument * * Nop function called via smp_call_function to force work items from * pending external iucv interrupts to the work queue. */ -static void __iucv_cleanup_pathid(void *dummy) +static void __iucv_cleanup_queue(void *dummy) { } +#endif /** - * iucv_cleanup_pathid - * @pathid: 16 bit pathid + * iucv_cleanup_queue * * Function called after a path has been severed to find all remaining * work items for the now stale pathid. The caller needs to hold the * iucv_table_lock. 
*/ -static void iucv_cleanup_pathid(u16 pathid) +static void iucv_cleanup_queue(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; /* - * Path is severed, the pathid can be reused immediatly on - * a iucv connect or a connection pending interrupt. - * iucv_path_connect and connection pending interrupt will - * wait until the iucv_table_lock is released before the - * recycled pathid enters the system. - * Force remaining interrupts to the work queue, then - * scan the work queue for items of this path. + * When a path is severed, the pathid can be reused immediatly + * on a iucv connect or a connection pending interrupt. Remove + * all entries from the task queue that refer to a stale pathid + * (iucv_path_table[ix] == NULL). Only then do the iucv connect + * or deliver the connection pending interrupt. To get all the + * pending interrupts force them to the work queue by calling + * an empty function on all cpus. */ - smp_call_function(__iucv_cleanup_pathid, NULL, 0, 1); - spin_lock_irq(&iucv_work_lock); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - /* Remove work items for pathid except connection pending */ - if (p->data.ippathid == pathid && p->data.iptype != 0x01) { + smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) { + /* Remove stale work items from the task queue. 
*/ + if (iucv_path_table[p->data.ippathid] == NULL) { list_del(&p->list); kfree(p); } } - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); } /** @@ -684,7 +712,6 @@ void iucv_unregister(struct iucv_handler *handler, int smp) iucv_sever_pathid(p->pathid, NULL); iucv_path_table[p->pathid] = NULL; list_del(&p->list); - iucv_cleanup_pathid(p->pathid); iucv_path_free(p); } spin_unlock_bh(&iucv_table_lock); @@ -757,9 +784,9 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, union iucv_param *parm; int rc; - preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) - spin_lock_bh(&iucv_table_lock); + BUG_ON(in_atomic()); + spin_lock_bh(&iucv_table_lock); + iucv_cleanup_queue(); parm = percpu_ptr(iucv_param, smp_processor_id()); memset(parm, 0, sizeof(union iucv_param)); parm->ctrl.ipmsglim = path->msglim; @@ -794,9 +821,7 @@ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, rc = -EIO; } } - if (iucv_tasklet_cpu != smp_processor_id()) - spin_unlock_bh(&iucv_table_lock); - preempt_enable(); + spin_unlock_bh(&iucv_table_lock); return rc; } @@ -867,15 +892,14 @@ int iucv_path_sever(struct iucv_path *path, u8 userdata[16]) preempt_disable(); - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_lock_bh(&iucv_table_lock); rc = iucv_sever_pathid(path->pathid, userdata); if (!rc) { iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); } - if (iucv_tasklet_cpu != smp_processor_id()) + if (iucv_active_cpu != smp_processor_id()) spin_unlock_bh(&iucv_table_lock); preempt_enable(); return rc; @@ -1244,8 +1268,7 @@ static void iucv_path_complete(struct iucv_irq_data *data) struct iucv_path_complete *ipc = (void *) data; struct iucv_path *path = iucv_path_table[ipc->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_complete) + if (path && path->handler && path->handler->path_complete) 
path->handler->path_complete(path, ipc->ipuser); } @@ -1273,14 +1296,14 @@ static void iucv_path_severed(struct iucv_irq_data *data) struct iucv_path_severed *ips = (void *) data; struct iucv_path *path = iucv_path_table[ips->ippathid]; - BUG_ON(!path || !path->handler); + if (!path || !path->handler) /* Already severed */ + return; if (path->handler->path_severed) path->handler->path_severed(path, ips->ipuser); else { iucv_sever_pathid(path->pathid, NULL); iucv_path_table[path->pathid] = NULL; list_del_init(&path->list); - iucv_cleanup_pathid(path->pathid); iucv_path_free(path); } } @@ -1309,8 +1332,7 @@ static void iucv_path_quiesced(struct iucv_irq_data *data) struct iucv_path_quiesced *ipq = (void *) data; struct iucv_path *path = iucv_path_table[ipq->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_quiesced) + if (path && path->handler && path->handler->path_quiesced) path->handler->path_quiesced(path, ipq->ipuser); } @@ -1338,8 +1360,7 @@ static void iucv_path_resumed(struct iucv_irq_data *data) struct iucv_path_resumed *ipr = (void *) data; struct iucv_path *path = iucv_path_table[ipr->ippathid]; - BUG_ON(!path || !path->handler); - if (path->handler->path_resumed) + if (path && path->handler && path->handler->path_resumed) path->handler->path_resumed(path, ipr->ipuser); } @@ -1371,8 +1392,7 @@ static void iucv_message_complete(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imc->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_complete) { + if (path && path->handler && path->handler->message_complete) { msg.flags = imc->ipflags1; msg.id = imc->ipmsgid; msg.audit = imc->ipaudit; @@ -1417,8 +1437,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) struct iucv_path *path = iucv_path_table[imp->ippathid]; struct iucv_message msg; - BUG_ON(!path || !path->handler); - if (path->handler->message_pending) { + if (path && path->handler && 
path->handler->message_pending) { msg.flags = imp->ipflags1; msg.id = imp->ipmsgid; msg.class = imp->iptrgcls; @@ -1433,17 +1452,16 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /** - * iucv_tasklet_handler: + * iucv_tasklet_fn: * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler * and then frees the buffer. */ -static void iucv_tasklet_handler(unsigned long ignored) +static void iucv_tasklet_fn(unsigned long ignored) { typedef void iucv_irq_fn(struct iucv_irq_data *); static iucv_irq_fn *irq_fn[] = { - [0x01] = iucv_path_pending, [0x02] = iucv_path_complete, [0x03] = iucv_path_severed, [0x04] = iucv_path_quiesced, @@ -1453,38 +1471,70 @@ static void iucv_tasklet_handler(unsigned long ignored) [0x08] = iucv_message_pending, [0x09] = iucv_message_pending, }; - struct iucv_work *p; + struct list_head task_queue = LIST_HEAD_INIT(task_queue); + struct iucv_irq_list *p, *n; /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ spin_lock(&iucv_table_lock); - iucv_tasklet_cpu = smp_processor_id(); + iucv_active_cpu = smp_processor_id(); - spin_lock_irq(&iucv_work_lock); - while (!list_empty(&iucv_work_queue)) { - p = list_entry(iucv_work_queue.next, struct iucv_work, list); + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_task_queue, &task_queue); + spin_unlock_irq(&iucv_queue_lock); + + list_for_each_entry_safe(p, n, &task_queue, list) { list_del_init(&p->list); - spin_unlock_irq(&iucv_work_lock); irq_fn[p->data.iptype](&p->data); kfree(p); - spin_lock_irq(&iucv_work_lock); } - spin_unlock_irq(&iucv_work_lock); - iucv_tasklet_cpu = -1; + iucv_active_cpu = -1; spin_unlock(&iucv_table_lock); } +/** + * iucv_work_fn: + * + * This work function loops over the queue of path pending irq blocks + * created by iucv_external_interrupt, calls the appropriate action + * handler and then frees the buffer. 
+ */ +static void iucv_work_fn(struct work_struct *work) +{ + typedef void iucv_irq_fn(struct iucv_irq_data *); + struct list_head work_queue = LIST_HEAD_INIT(work_queue); + struct iucv_irq_list *p, *n; + + /* Serialize tasklet, iucv_path_sever and iucv_path_connect. */ + spin_lock_bh(&iucv_table_lock); + iucv_active_cpu = smp_processor_id(); + + spin_lock_irq(&iucv_queue_lock); + list_splice_init(&iucv_work_queue, &work_queue); + spin_unlock_irq(&iucv_queue_lock); + + iucv_cleanup_queue(); + list_for_each_entry_safe(p, n, &work_queue, list) { + list_del_init(&p->list); + iucv_path_pending(&p->data); + kfree(p); + } + + iucv_active_cpu = -1; + spin_unlock_bh(&iucv_table_lock); +} + /** * iucv_external_interrupt * @code: irq code * * Handles external interrupts coming in from CP. - * Places the interrupt buffer on a queue and schedules iucv_tasklet_handler(). + * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). */ static void iucv_external_interrupt(u16 code) { struct iucv_irq_data *p; - struct iucv_work *work; + struct iucv_irq_list *work; p = percpu_ptr(iucv_irq_data, smp_processor_id()); if (p->ippathid >= iucv_max_pathid) { @@ -1498,16 +1548,23 @@ static void iucv_external_interrupt(u16 code) printk(KERN_ERR "iucv_do_int: unknown iucv interrupt\n"); return; } - work = kmalloc(sizeof(struct iucv_work), GFP_ATOMIC); + work = kmalloc(sizeof(struct iucv_irq_list), GFP_ATOMIC); if (!work) { printk(KERN_WARNING "iucv_external_interrupt: out of memory\n"); return; } memcpy(&work->data, p, sizeof(work->data)); - spin_lock(&iucv_work_lock); - list_add_tail(&work->list, &iucv_work_queue); - spin_unlock(&iucv_work_lock); - tasklet_schedule(&iucv_tasklet); + spin_lock(&iucv_queue_lock); + if (p->iptype == 0x01) { + /* Path pending interrupt. */ + list_add_tail(&work->list, &iucv_work_queue); + schedule_work(&iucv_work); + } else { + /* The other interrupts. 
*/ + list_add_tail(&work->list, &iucv_task_queue); + tasklet_schedule(&iucv_tasklet); + } + spin_unlock(&iucv_queue_lock); } /** @@ -1577,12 +1634,14 @@ out: */ static void iucv_exit(void) { - struct iucv_work *p, *n; + struct iucv_irq_list *p, *n; - spin_lock_irq(&iucv_work_lock); + spin_lock_irq(&iucv_queue_lock); + list_for_each_entry_safe(p, n, &iucv_task_queue, list) + kfree(p); list_for_each_entry_safe(p, n, &iucv_work_queue, list) kfree(p); - spin_unlock_irq(&iucv_work_lock); + spin_unlock_irq(&iucv_queue_lock); unregister_hotcpu_notifier(&iucv_cpu_notifier); percpu_free(iucv_param); percpu_free(iucv_irq_data); From 46f8914e53c28d0716c586e08a7c819d8ebb9d54 Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 30 Apr 2007 00:07:31 -0700 Subject: [PATCH 10/21] [SKB]: Introduce skb_queue_walk_safe() This patch provides a method for walking skb lists while inserting or removing skbs from the list. Signed-off-by: James Chapman Signed-off-by: David S. Miller --- include/linux/skbuff.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2694cb3ca763..253a2b9be9d6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1471,6 +1471,11 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) prefetch(skb->next), (skb != (struct sk_buff *)(queue)); \ skb = skb->next) +#define skb_queue_walk_safe(queue, skb, tmp) \ + for (skb = (queue)->next, tmp = skb->next; \ + skb != (struct sk_buff *)(queue); \ + skb = tmp, tmp = skb->next) + #define skb_queue_reverse_walk(queue, skb) \ for (skb = (queue)->prev; \ prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ From 65def812ab25d7565756e5748d91e22e302197ee Mon Sep 17 00:00:00 2001 From: James Chapman Date: Mon, 30 Apr 2007 00:21:02 -0700 Subject: [PATCH 11/21] [L2TP]: Add the ability to autoload a pppox protocol module. This patch allows a name "pppox-proto-nnn" to be used in modprobe.conf to autoload a PPPoX protocol nnn. 
Signed-off-by: James Chapman Signed-off-by: David S. Miller --- drivers/net/pppox.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c index 3f8115db4d54..f3e47d0c2b3c 100644 --- a/drivers/net/pppox.c +++ b/drivers/net/pppox.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -114,6 +115,13 @@ static int pppox_create(struct socket *sock, int protocol) goto out; rc = -EPROTONOSUPPORT; +#ifdef CONFIG_KMOD + if (!pppox_protos[protocol]) { + char buffer[32]; + sprintf(buffer, "pppox-proto-%d", protocol); + request_module(buffer); + } +#endif if (!pppox_protos[protocol] || !try_module_get(pppox_protos[protocol]->owner)) goto out; From 6aaf47fa48d3c44280810b1b470261d340e4ed87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Apr 2007 00:26:00 -0700 Subject: [PATCH 12/21] [PATCH] INET : IPV4 UDP lookups converted to a 2 pass algo Some people want to have many UDP sockets, bound to a single port but many different addresses. We currently hash all those sockets into a single chain. Processing of incoming packets is very expensive, because the whole chain must be examined to find the best match. I chose in this patch to hash UDP sockets with a hash function that takes into account both their port number and address : This has a drawback because we need two lookups : one with a given address, one with a wildcard (null) address. Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/ipv4/udp.c | 171 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 114 insertions(+), 57 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cec0f2cc49b7..144970704c2c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock); static int udp_port_rover; -static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[]) +/* + * Note about this hash function : + * Typical use is probably daddr = 0, only dport is going to vary hash + */ +static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr) +{ + addr ^= addr >> 16; + addr ^= addr >> 8; + return port ^ addr; +} + +static inline int __udp_lib_port_inuse(unsigned int hash, int port, + __be32 daddr, struct hlist_head udptable[]) { struct sock *sk; struct hlist_node *node; + struct inet_sock *inet; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) - if (sk->sk_hash == num) + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { + if (sk->sk_hash != hash) + continue; + inet = inet_sk(sk); + if (inet->num != port) + continue; + if (inet->rcv_saddr == daddr) return 1; + } return 0; } @@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, struct hlist_node *node; struct hlist_head *head; struct sock *sk2; + unsigned int hash; int error = 1; write_lock_bh(&udp_hash_lock); @@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { int size; - head = &udptable[result & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; if (hlist_empty(head)) { if (result > sysctl_local_port_range[1]) result = sysctl_local_port_range[0] + @@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, result = sysctl_local_port_range[0] + ((result - sysctl_local_port_range[0]) & (UDP_HTABLE_SIZE - 1)); - if (! 
__udp_lib_lport_inuse(result, udptable)) + hash = hash_port_and_addr(result, + inet_sk(sk)->rcv_saddr); + if (! __udp_lib_port_inuse(hash, result, + inet_sk(sk)->rcv_saddr, udptable)) break; } if (i >= (1 << 16) / UDP_HTABLE_SIZE) @@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: *port_rover = snum = result; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr); + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_for_each(sk2, node, head) - if (sk2->sk_hash == snum && + if (sk2->sk_hash == hash && sk2 != sk && + inet_sk(sk2)->num == snum && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && @@ -201,9 +228,9 @@ gotit: goto fail; } inet_sk(sk)->num = snum; - sk->sk_hash = snum; + sk->sk_hash = hash; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[hash & (UDP_HTABLE_SIZE - 1)]; sk_add_node(sk, head); sock_prot_inc_use(sk->sk_prot); } @@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport, { struct sock *sk, *result = NULL; struct hlist_node *node; - unsigned short hnum = ntohs(dport); - int badness = -1; + unsigned int hash, hashwild; + int score, best = -1; + + hash = hash_port_and_addr(ntohs(dport), daddr); + hashwild = hash_port_and_addr(ntohs(dport), 0); read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + +lookup: + + sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); - if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) { - int score = (sk->sk_family == PF_INET ? 
1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) - continue; - score+=2; - } - if (inet->daddr) { - if (inet->daddr != saddr) - continue; - score+=2; - } - if (inet->dport) { - if (inet->dport != sport) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 9) { - result = sk; - break; - } else if (score > badness) { - result = sk; - badness = score; - } + if (sk->sk_hash != hash || ipv6_only_sock(sk) || + inet->num != dport) + continue; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 9) { + result = sk; + goto found; + } else if (score > best) { + result = sk; + best = score; } } + + if (hash != hashwild) { + hash = hashwild; + goto lookup; + } +found: if (result) sock_hold(result); read_unlock(&udp_hash_lock); return result; } -static inline struct sock *udp_v4_mcast_next(struct sock *sk, - __be16 loc_port, __be32 loc_addr, - __be16 rmt_port, __be32 rmt_addr, - int dif) +static inline struct sock *udp_v4_mcast_next( + struct sock *sk, + unsigned int hnum, __be16 loc_port, __be32 loc_addr, + __be16 rmt_port, __be32 rmt_addr, + int dif) { struct hlist_node *node; struct sock *s = sk; - unsigned short hnum = ntohs(loc_port); sk_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); if (s->sk_hash != hnum || + inet->num != loc_port || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || @@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) { - struct sock *sk; 
+ struct sock *sk, *skw, *sknext; int dif; + unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr); + unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0); + + dif = skb->dev->ifindex; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); - dif = skb->dev->ifindex; - sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]); + skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]); + + sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { + hash = hashwild; + sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source, + saddr, dif); + } + if (sk) { do { struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, - uh->source, saddr, dif); + sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest, + daddr, uh->source, saddr, dif); + if (!sknext && hash != hashwild) { + hash = hashwild; + sknext = udp_v4_mcast_next(skw, hash, uh->dest, + daddr, uh->source, saddr, dif); + } if (sknext) skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) { int ret = udp_queue_rcv_skb(sk, skb1); if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ + /* + * we should probably re-process + * instead of dropping packets here. + */ kfree_skb(skb1); } sk = sknext; From 34588b4c046c34773e5a1a962da7b78b05c4d1bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 30 Apr 2007 00:57:33 -0700 Subject: [PATCH 13/21] [TCP]: Catch skb with S+L bugs earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SACKED_ACKED and LOST are mutually exclusive with SACK, thus having their sum larger than packets_out is bug with SACK. Eventually these bugs trigger traps in the tcp_clean_rtx_queue with SACK but it's much more informative to do this here. 
Non-SACK TCP, however, could get more than packets_out duplicate ACKs which each increment sacked_out, so it makes sense to do this kind of limiting for non-SACK TCP but not for SACK enabled one. Perhaps the author had the opposite in mind but did the logic accidentally the wrong way around? Anyway, the sacked_out incrementer code for non-SACK already deals with this issue before calling sync_left_out so this trapping can be done unconditionally. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a385797f160a..c6ecd455edab 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -736,9 +736,7 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) static inline void tcp_sync_left_out(struct tcp_sock *tp) { - if (tp->rx_opt.sack_ok && - (tp->sacked_out >= tp->packets_out - tp->lost_out)) - tp->sacked_out = tp->packets_out - tp->lost_out; + BUG_ON(tp->sacked_out + tp->lost_out > tp->packets_out); tp->left_out = tp->sacked_out + tp->lost_out; } From 157bfc25020f7eb731f94140e099307ade47299e Mon Sep 17 00:00:00 2001 From: Masahide NAKAMURA Date: Mon, 30 Apr 2007 00:33:35 -0700 Subject: [PATCH 14/21] [XFRM]: Restrict upper layer information by bundle. On MIPv6 usage, XFRM sub policy is enabled. When main (IPsec) and sub (MIPv6) policy selectors have the same address set but different upper layer information (i.e. protocol number and its ports or type/code), multiple bundles should be created. However, currently we have issue to use the same bundle created for the first time with all flows covered by the case. It is useful for the bundle to have the upper layer information to be restructured correctly if it does not match with the flow. 1. Bundle was created by two policies Selector from another policy is added to xfrm_dst. If the flow does not match the selector, it goes to slow path to restructure new bundle by single policy.
2. Bundle was created by one policy Flow cache is added to xfrm_dst as originated one. If the flow does not match the cache, it goes to slow path to try searching another policy. Signed-off-by: Masahide NAKAMURA Signed-off-by: David S. Miller --- include/net/flow.h | 6 +++++ include/net/xfrm.h | 10 ++++++++ net/xfrm/xfrm_policy.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/include/net/flow.h b/include/net/flow.h index ce4b10d8b412..f3cc1f812619 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -97,4 +97,10 @@ extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; +static inline int flow_cache_uli_match(struct flowi *fl1, struct flowi *fl2) +{ + return (fl1->proto == fl2->proto && + !memcmp(&fl1->uli_u, &fl2->uli_u, sizeof(fl1->uli_u))); +} + #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 9561bf817b02..66c2d3eec03c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -603,6 +603,10 @@ struct xfrm_dst struct rt6_info rt6; } u; struct dst_entry *route; +#ifdef CONFIG_XFRM_SUB_POLICY + struct flowi *origin; + struct xfrm_selector *partner; +#endif u32 genid; u32 route_mtu_cached; u32 child_mtu_cached; @@ -615,6 +619,12 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst) dst_release(xdst->route); if (likely(xdst->u.dst.xfrm)) xfrm_state_put(xdst->u.dst.xfrm); +#ifdef CONFIG_XFRM_SUB_POLICY + kfree(xdst->origin); + xdst->origin = NULL; + kfree(xdst->partner); + xdst->partner = NULL; +#endif } extern void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index dbf9d96a2f0b..263e34e45265 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1344,6 +1344,40 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } +static int inline +xfrm_dst_alloc_copy(void **target, 
void *src, int size) +{ + if (!*target) { + *target = kmalloc(size, GFP_ATOMIC); + if (!*target) + return -ENOMEM; + } + memcpy(*target, src, size); + return 0; +} + +static int inline +xfrm_dst_update_parent(struct dst_entry *dst, struct xfrm_selector *sel) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->partner), + sel, sizeof(*sel)); +#else + return 0; +#endif +} + +static int inline +xfrm_dst_update_origin(struct dst_entry *dst, struct flowi *fl) +{ +#ifdef CONFIG_XFRM_SUB_POLICY + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + return xfrm_dst_alloc_copy((void **)&(xdst->origin), fl, sizeof(*fl)); +#else + return 0; +#endif +} static int stale_bundle(struct dst_entry *dst); @@ -1532,6 +1566,18 @@ restart: err = -EHOSTUNREACH; goto error; } + + if (npols > 1) + err = xfrm_dst_update_parent(dst, &pols[1]->selector); + else + err = xfrm_dst_update_origin(dst, fl); + if (unlikely(err)) { + write_unlock_bh(&policy->lock); + if (dst) + dst_free(dst); + goto error; + } + dst->next = policy->bundles; policy->bundles = dst; dst_hold(dst); @@ -1947,6 +1993,15 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, if (!dst_check(dst->path, ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; +#ifdef CONFIG_XFRM_SUB_POLICY + if (fl) { + if (first->origin && !flow_cache_uli_match(first->origin, fl)) + return 0; + if (first->partner && + !xfrm_selector_match(first->partner, fl, family)) + return 0; + } +#endif last = NULL; From 575ee7140dabe9b9c4f66f4f867039b97e548867 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 30 Apr 2007 00:39:55 -0700 Subject: [PATCH 15/21] [TCP] FRTO: Delay skb available check until it's mandatory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No new data is needed until the first ACK comes, so no need to check for application limitedness until 
then. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 051f0f815f17..6b669898b197 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !tcp_send_head(sk) || - after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -2710,6 +2705,14 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; From d551e4541dd60ae53459f77a971f2d6043431f5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 30 Apr 2007 00:42:20 -0700 Subject: [PATCH 16/21] [TCP] FRTO: RFC4138 allows Nagle override when new data must be sent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a corner case where less than MSS sized new data 
thingie is awaiting in the send queue. For F-RTO to work correctly, a new data segment must be sent at certain point or F-RTO cannot be used at all. RFC4138 allows overriding of Nagle at that point. Implementation uses frto_counter states 2 and 3 to distinguish when Nagle override is needed. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_input.c | 13 ++++++++----- net/ipv4/tcp_output.c | 6 ++++-- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index c6ecd455edab..ef8f9d4dae85 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1199,9 +1199,14 @@ static inline struct sk_buff *tcp_send_head(struct sock *sk) static inline void tcp_advance_send_head(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) sk->sk_send_head = NULL; + /* Don't override Nagle indefinately with F-RTO */ + if (tp->frto_counter == 2) + tp->frto_counter = 3; } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6b669898b197..7641b2761a14 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2637,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. * * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. 
@@ -2666,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) inet_csk(sk)->icsk_retransmits = 0; if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } @@ -2692,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2709,14 +2711,15 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) if (!tcp_send_head(sk) || after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), + flag); return 1; } tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: tcp_undo_spur_to_response(sk, flag); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b5fa3c19afee..0faacf9c419d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, if (nonagle & TCP_NAGLE_PUSH) return 1; - /* Don't use the nagle rule for urgent data (or for the final FIN). */ - if (tp->urg_mode || + /* Don't use the nagle rule for urgent data (or for the final FIN). + * Nagle can be ignored during F-RTO too (see RFC4138). 
+ */ + if (tp->urg_mode || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; From 71ff6c0a857d11e70aec0c8f1e0d4ae9a45dd468 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:45:02 -0700 Subject: [PATCH 17/21] [SNMP]: Add definitions for {In,Out}BcastPkts The updated IP-MIB RFC (RFC4293) specifies new objects, InBcastPkts and OutBcastPkts. This adds definitions for them. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- include/linux/snmp.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 854aa6b543f1..802b3a38b041 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -40,6 +40,8 @@ enum IPSTATS_MIB_FRAGCREATES, /* FragCreates */ IPSTATS_MIB_INMCASTPKTS, /* InMcastPkts */ IPSTATS_MIB_OUTMCASTPKTS, /* OutMcastPkts */ + IPSTATS_MIB_INBCASTPKTS, /* InBcastPkts */ + IPSTATS_MIB_OUTBCASTPKTS, /* OutBcastPkts */ __IPSTATS_MIB_MAX }; From e91a47ebb130b90790c7a8c625ade4dcea246842 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:45:49 -0700 Subject: [PATCH 18/21] [IPV4] SNMP: Support InNoRoutes An IP datagram which is being discarded because of no routes in the forwarding path should be counted as InNoRoutes. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 324e7e0fdb2a..63ab5230c611 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,6 +340,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + else if (err == -ENETUNREACH) + IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); goto drop; } } From 704aed53b4e43bebfbd425cf95b66794a9cfa2c2 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:46:30 -0700 Subject: [PATCH 19/21] [IPV4] SNMP: Support InTruncatedPkts An IP datagram which is being discarded because the datagram frame didn't carry enough data should be counted as InTruncatedPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 63ab5230c611..c8c455dd9caf 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -416,7 +416,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto inhdr_error; len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl*4)) + if (skb->len < len) { + IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) goto inhdr_error; /* Our transport medium may have padded the buffer out. Now we know it From 5506b54b36f067b9776935085c9f8e607b026b23 Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:48:10 -0700 Subject: [PATCH 20/21] [IPV4] SNMP: Support InMcastPkts and InBcastPkts A received IP multicast datagram should be counted as InMcastPkts. By the same token, a received IP broadcast datagram should be counted as InBcastPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. 
Miller --- net/ipv4/ip_input.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c8c455dd9caf..97069399d864 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -329,6 +329,7 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); + struct rtable *rt; /* * Initialise the virtual path cache for the packet. It describes @@ -360,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb) if (iph->ihl > 5 && ip_rcv_options(skb)) goto drop; + rt = (struct rtable*)skb->dst; + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + return dst_input(skb); drop: From 80787ebc2bbd8e675d8b9ff8cfa40f15134feebe Mon Sep 17 00:00:00 2001 From: Mitsuru Chinen Date: Mon, 30 Apr 2007 00:48:20 -0700 Subject: [PATCH 21/21] [IPV4] SNMP: Support OutMcastPkts and OutBcastPkts A transmitted IP multicast datagram should be counted as OutMcastPkts. By the same token, a transmitted IP broadcast datagram should be counted as OutBcastPkts. Signed-off-by: Mitsuru Chinen Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 534650cad3a8..d6427d918512 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -160,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; + struct rtable *rt = (struct rtable *)dst; struct net_device *dev = dst->dev; int hh_len = LL_RESERVED_SPACE(dev); + if (rt->rt_type == RTN_MULTICAST) + IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + else if (rt->rt_type == RTN_BROADCAST) + IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { struct sk_buff *skb2;