From 082f2300cfa1a3d9d5221c38c5eba85d4ab98bd8 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Tue, 3 Apr 2018 13:40:06 +0200 Subject: [PATCH 01/46] Bluetooth: Fix connection if directed advertising and privacy is used Local random address needs to be updated before creating connection if RPA from LE Direct Advertising Report was resolved in host. Otherwise remote device might ignore connection request due to address mismatch. This was affecting following qualification test cases: GAP/CONN/SCEP/BV-03-C, GAP/CONN/GCEP/BV-05-C, GAP/CONN/DCEP/BV-05-C Before patch: < HCI Command: LE Set Random Address (0x08|0x0005) plen 6 #11350 [hci0] 84680.231216 Address: 56:BC:E8:24:11:68 (Resolvable) Identity type: Random (0x01) Identity: F2:F1:06:3D:9C:42 (Static) > HCI Event: Command Complete (0x0e) plen 4 #11351 [hci0] 84680.246022 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7 #11352 [hci0] 84680.246417 Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Own address type: Random (0x01) Filter policy: Accept all advertisement, inc. directed unresolved RPA (0x02) > HCI Event: Command Complete (0x0e) plen 4 #11353 [hci0] 84680.248854 LE Set Scan Parameters (0x08|0x000b) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #11354 [hci0] 84680.249466 Scanning: Enabled (0x01) Filter duplicates: Enabled (0x01) > HCI Event: Command Complete (0x0e) plen 4 #11355 [hci0] 84680.253222 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 18 #11356 [hci0] 84680.458387 LE Direct Advertising Report (0x0b) Num reports: 1 Event type: Connectable directed - ADV_DIRECT_IND (0x01) Address type: Random (0x01) Address: 53:38:DA:46:8C:45 (Resolvable) Identity type: Public (0x00) Identity: 11:22:33:44:55:66 (OUI 11-22-33) Direct address type: Random (0x01) Direct address: 7C:D6:76:8C:DF:82 (Resolvable) Identity type: Random (0x01) Identity: F2:F1:06:3D:9C:42 (Static) RSSI: -74 dBm (0xb6) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #11357 [hci0] 84680.458737 Scanning: Disabled (0x00) Filter duplicates: Disabled (0x00) > HCI Event: Command Complete (0x0e) plen 4 #11358 [hci0] 84680.469982 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) < HCI Command: LE Create Connection (0x08|0x000d) plen 25 #11359 [hci0] 84680.470444 Scan interval: 60.000 msec (0x0060) Scan window: 60.000 msec (0x0060) Filter policy: White list is not used (0x00) Peer address type: Random (0x01) Peer address: 53:38:DA:46:8C:45 (Resolvable) Identity type: Public (0x00) Identity: 11:22:33:44:55:66 (OUI 11-22-33) Own address type: Random (0x01) Min connection interval: 30.00 msec (0x0018) Max connection interval: 50.00 msec (0x0028) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Min connection length: 0.000 msec (0x0000) Max connection length: 0.000 msec (0x0000) > HCI Event: Command Status (0x0f) plen 4 #11360 [hci0] 84680.474971 LE Create Connection (0x08|0x000d) ncmd 1 Status: Success (0x00) < HCI Command: LE Create Connection Cancel (0x08|0x000e) plen 0 #11361 [hci0] 84682.545385 > HCI Event: Command Complete (0x0e) plen 4 #11362 [hci0] 84682.551014 LE Create Connection Cancel (0x08|0x000e) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 19 #11363 [hci0] 84682.551074 LE Connection Complete (0x01) Status: Unknown Connection Identifier (0x02) Handle: 0 Role: Master (0x00) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Connection interval: 0.00 msec (0x0000) Connection latency: 0 (0x0000) Supervision timeout: 0 msec (0x0000) Master clock accuracy: 0x00 After patch: < HCI Command: LE Set Scan Parameters (0x08|0x000b) plen 7 #210 [hci0] 667.152459 Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Own address type: Random (0x01) Filter policy: Accept all advertisement, inc. directed unresolved RPA (0x02) > HCI Event: Command Complete (0x0e) plen 4 #211 [hci0] 667.153613 LE Set Scan Parameters (0x08|0x000b) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #212 [hci0] 667.153704 Scanning: Enabled (0x01) Filter duplicates: Enabled (0x01) > HCI Event: Command Complete (0x0e) plen 4 #213 [hci0] 667.154584 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 18 #214 [hci0] 667.182619 LE Direct Advertising Report (0x0b) Num reports: 1 Event type: Connectable directed - ADV_DIRECT_IND (0x01) Address type: Random (0x01) Address: 50:52:D9:A6:48:A0 (Resolvable) Identity type: Public (0x00) Identity: 11:22:33:44:55:66 (OUI 11-22-33) Direct address type: Random (0x01) Direct address: 7C:C1:57:A5:B7:A8 (Resolvable) Identity type: Random (0x01) Identity: F4:28:73:5D:38:B0 (Static) RSSI: -70 dBm (0xba) < HCI Command: LE Set Scan Enable (0x08|0x000c) plen 2 #215 [hci0] 667.182704 Scanning: Disabled (0x00) Filter duplicates: Disabled (0x00) > HCI Event: Command Complete (0x0e) plen 4 #216 [hci0] 667.183599 LE Set Scan Enable (0x08|0x000c) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Random Address (0x08|0x0005) plen 6 #217 [hci0] 667.183645 Address: 7C:C1:57:A5:B7:A8 (Resolvable) Identity type: Random (0x01) Identity: F4:28:73:5D:38:B0 (Static) > HCI Event: Command Complete (0x0e) plen 4 #218 [hci0] 667.184590 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) < HCI Command: LE Create Connection (0x08|0x000d) plen 25 #219 [hci0] 667.184613 Scan interval: 60.000 msec (0x0060) Scan window: 60.000 msec (0x0060) Filter policy: White list is not used (0x00) Peer address type: Random (0x01) Peer address: 50:52:D9:A6:48:A0 (Resolvable) Identity type: Public (0x00) Identity: 11:22:33:44:55:66 (OUI 11-22-33) Own address type: Random (0x01) Min connection interval: 30.00 msec (0x0018) Max connection interval: 50.00 msec (0x0028) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Min connection length: 0.000 msec (0x0000) Max connection length: 0.000 msec (0x0000) > HCI Event: Command Status (0x0f) plen 4 #220 [hci0] 667.186558 LE Create Connection (0x08|0x000d) ncmd 1 Status: Success (0x00) > HCI Event: LE Meta Event (0x3e) plen 19 #221 [hci0] 667.485824 LE Connection Complete (0x01) Status: Success (0x00) Handle: 0 Role: Master (0x00) Peer address type: Random (0x01) Peer address: 50:52:D9:A6:48:A0 (Resolvable) Identity type: Public (0x00) Identity: 11:22:33:44:55:66 (OUI 11-22-33) Connection interval: 50.00 msec (0x0028) Connection latency: 0 (0x0000) Supervision timeout: 420 msec (0x002a) Master clock accuracy: 0x07 @ MGMT Event: Device Connected (0x000b) plen 13 {0x0002} [hci0] 667.485996 LE Address: 11:22:33:44:55:66 (OUI 11-22-33) Flags: 0x00000000 Data length: 0 Signed-off-by: Szymon Janc Signed-off-by: Marcel Holtmann Cc: stable@vger.kernel.org --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 29 +++++++++++++++++++++-------- net/bluetooth/hci_event.c | 15 +++++++++++---- net/bluetooth/l2cap_core.c | 2 +- 4 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 95ccc1eef558..b619a190ff12 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -895,7 +895,7 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u16 conn_timeout); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, - u8 role); + u8 role, bdaddr_t *direct_rpa); struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type); struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a9682534c377..45ff5dc124cc 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -749,18 +749,31 @@ static bool conn_use_rpa(struct hci_conn *conn) } static void hci_req_add_le_create_conn(struct hci_request *req, - struct hci_conn *conn) + struct hci_conn *conn, + bdaddr_t *direct_rpa) { struct hci_cp_le_create_conn cp; struct hci_dev *hdev = conn->hdev; u8 own_addr_type; - /* Update random address, but set require_privacy to false so - * that we never connect with an non-resolvable address. + /* If direct address was provided we use it instead of current + * address. */ - if (hci_update_random_address(req, false, conn_use_rpa(conn), - &own_addr_type)) - return; + if (direct_rpa) { + if (bacmp(&req->hdev->random_addr, direct_rpa)) + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, + direct_rpa); + + /* direct address is always RPA */ + own_addr_type = ADDR_LE_DEV_RANDOM; + } else { + /* Update random address, but set require_privacy to false so + * that we never connect with an non-resolvable address. + */ + if (hci_update_random_address(req, false, conn_use_rpa(conn), + &own_addr_type)) + return; + } memset(&cp, 0, sizeof(cp)); @@ -825,7 +838,7 @@ static void hci_req_directed_advertising(struct hci_request *req, struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, - u8 role) + u8 role, bdaddr_t *direct_rpa) { struct hci_conn_params *params; struct hci_conn *conn; @@ -940,7 +953,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); } - hci_req_add_le_create_conn(&req, conn); + hci_req_add_le_create_conn(&req, conn, direct_rpa); create_conn: err = hci_req_run(&req, create_le_conn_complete); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index cd3bbb766c24..139707cd9d35 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4648,7 +4648,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, /* This function requires the caller holds hdev->lock */ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type, u8 adv_type) + u8 addr_type, u8 adv_type, + bdaddr_t *direct_rpa) { struct hci_conn *conn; struct hci_conn_params *params; @@ -4699,7 +4700,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, } conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); + HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER, + direct_rpa); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned * by higher layer that tried to connect, if no then @@ -4808,8 +4810,13 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, bdaddr_type = irk->addr_type; } - /* Check if we have been requested to connect to this device */ - conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type); + /* Check if we have been requested to connect to this device. + * + * direct_addr is set only for directed advertising reports (it is NULL + * for advertising reports) and is already verified to be RPA above. + */ + conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type, + direct_addr); if (conn && type == LE_ADV_IND) { /* Store report for later inclusion by * mgmt_device_connected diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fc6615d59165..9b7907ebfa01 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7156,7 +7156,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level, HCI_LE_CONN_TIMEOUT, - HCI_ROLE_SLAVE); + HCI_ROLE_SLAVE, NULL); else hcon = hci_connect_le_scan(hdev, dst, dst_type, chan->sec_level, From 820ed3fb2e6e986144465082d041e6a403a94135 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:46 -0700 Subject: [PATCH 02/46] bpf: sockmap, free memory on sock close with cork data If a socket with pending cork data is closed we do not return the memory to the socket until the garbage collector free's the psock structure. The garbage collector though can run after the sock has completed its close operation. If this ordering happens the sock code will through a WARN_ON because there is still outstanding memory accounted to the sock. To resolve this ensure we return memory to the sock when a socket is closed. Signed-off-by: John Fastabend Fixes: 91843d540a13 ("bpf: sockmap, add msg_cork_bytes() helper") Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index d2bda5aa25d7..8ddf326b3ade 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -211,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + if (psock->cork) { + free_start_sg(psock->sock, psock->cork); + kfree(psock->cork); + psock->cork = NULL; + } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { list_del(&md->list); free_start_sg(psock->sock, md); From 0e94d87fcfc81883b72574a5cc4638bd87adbb10 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 2 Apr 2018 12:50:52 -0700 Subject: [PATCH 03/46] bpf: sockmap, duplicates release calls may NULL sk_prot It is possible to have multiple ULP tcp_release call paths in flight if a sock is closed and simultaneously being removed from the sockmap control path. The result would be setting the sk_prot to the saved values on the first iteration and then on the second iteration setting the value to NULL. This patch resolves this by ensuring we only reset the sk_prot pointer if we have a valid saved state to set. Fixes: 4f738adba30a7 ("bpf: create tcp_bpf_ulp allowing BPF to monitor socket TX/RX data") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 8ddf326b3ade..8dd9210d7db7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -182,8 +182,10 @@ static void bpf_tcp_release(struct sock *sk) psock->cork = NULL; } - sk->sk_prot = psock->sk_proto; - psock->sk_proto = NULL; + if (psock->sk_proto) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } out: rcu_read_unlock(); } From 33491588c1fb2c76ed114a211ad0ee76c16b5a0c Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 3 Apr 2018 14:09:47 +0200 Subject: [PATCH 04/46] kernel/bpf/syscall: fix warning defined but not used MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There will be a build warning -Wunused-function if CONFIG_CGROUP_BPF isn't defined, since the only user is inside #ifdef CONFIG_CGROUP_BPF: kernel/bpf/syscall.c:1229:12: warning: ‘bpf_prog_attach_check_attach_type’ defined but not used [-Wunused-function] static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Current code moves function bpf_prog_attach_check_attach_type inside ifdef CONFIG_CGROUP_BPF. Fixes: 5e43f899b03a ("bpf: Check attach type at prog load time") Signed-off-by: Anders Roxell Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0244973ee544..4ca46df19c9a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1226,18 +1226,6 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, } } -static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, - enum bpf_attach_type attach_type) -{ - switch (prog->type) { - case BPF_PROG_TYPE_CGROUP_SOCK: - case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: - return attach_type == prog->expected_attach_type ? 0 : -EINVAL; - default: - return 0; - } -} - /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD expected_attach_type @@ -1465,6 +1453,18 @@ out_free_tp: #ifdef CONFIG_CGROUP_BPF +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} + #define BPF_PROG_ATTACH_LAST_FIELD attach_flags static int sockmap_get_from_fd(const union bpf_attr *attr, From a9d48205d0aedda021fc3728972a9e9934c2b9de Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:26 -0700 Subject: [PATCH 05/46] net: fool proof dev_valid_name() We want to use dev_valid_name() to validate tunnel names, so better use strnlen(name, IFNAMSIZ) than strlen(name) to make sure to not upset KASAN. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 9b04a9fd1dfd..969462ebb296 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1027,7 +1027,7 @@ bool dev_valid_name(const char *name) { if (*name == '\0') return false; - if (strlen(name) >= IFNAMSIZ) + if (strnlen(name, IFNAMSIZ) == IFNAMSIZ) return false; if (!strcmp(name, ".") || !strcmp(name, "..")) return false; From 9cb726a212a82c88c98aa9f0037fd04777cd8fe5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:27 -0700 Subject: [PATCH 06/46] ip_tunnel: better validate user provided tunnel names Use dev_valid_name() to make sure user does not provide illegal device name. syzbot caught the following bug : BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300 [inline] BUG: KASAN: stack-out-of-bounds in __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 Write of size 20 at addr ffff8801ac79f810 by task syzkaller268107/4482 CPU: 0 PID: 4482 Comm: syzkaller268107 Not tainted 4.16.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x1b9/0x29f lib/dump_stack.c:53 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0xac/0x2f5 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 strlcpy include/linux/string.h:300 [inline] __ip_tunnel_create+0xca/0x6b0 net/ipv4/ip_tunnel.c:257 ip_tunnel_create net/ipv4/ip_tunnel.c:352 [inline] ip_tunnel_ioctl+0x818/0xd40 net/ipv4/ip_tunnel.c:861 ipip_tunnel_ioctl+0x1c5/0x420 net/ipv4/ipip.c:350 dev_ifsioc+0x43e/0xb90 net/core/dev_ioctl.c:334 dev_ioctl+0x69a/0xcc0 net/core/dev_ioctl.c:525 sock_ioctl+0x47e/0x680 net/socket.c:1015 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 SYSC_ioctl fs/ioctl.c:708 [inline] SyS_ioctl+0x24/0x30 fs/ioctl.c:706 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index de6d94482fe7..6b0e362cc99b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -253,13 +253,14 @@ static struct net_device *__ip_tunnel_create(struct net *net, struct net_device *dev; char name[IFNAMSIZ]; - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else { - if (strlen(ops->kind) > (IFNAMSIZ - 3)) { - err = -E2BIG; + err = -E2BIG; + if (parms->name[0]) { + if (!dev_valid_name(parms->name)) + goto failed; + strlcpy(name, parms->name, IFNAMSIZ); + } else { + if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - } strlcpy(name, ops->kind, IFNAMSIZ); strncat(name, "%d", 2); } From b95211e066fc3494b7c115060b2297b4ba21f025 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:28 -0700 Subject: [PATCH 07/46] ipv6: sit: better validate user provided tunnel names Use dev_valid_name() to make sure user does not provide illegal device name. syzbot caught the following bug : BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300 [inline] BUG: KASAN: stack-out-of-bounds in ipip6_tunnel_locate+0x63b/0xaa0 net/ipv6/sit.c:254 Write of size 33 at addr ffff8801b64076d8 by task syzkaller932654/4453 CPU: 0 PID: 4453 Comm: syzkaller932654 Not tainted 4.16.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x1b9/0x29f lib/dump_stack.c:53 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0xac/0x2f5 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 strlcpy include/linux/string.h:300 [inline] ipip6_tunnel_locate+0x63b/0xaa0 net/ipv6/sit.c:254 ipip6_tunnel_ioctl+0xe71/0x241b net/ipv6/sit.c:1221 dev_ifsioc+0x43e/0xb90 net/core/dev_ioctl.c:334 dev_ioctl+0x69a/0xcc0 net/core/dev_ioctl.c:525 sock_ioctl+0x47e/0x680 net/socket.c:1015 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 SYSC_ioctl fs/ioctl.c:708 [inline] SyS_ioctl+0x24/0x30 fs/ioctl.c:706 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/sit.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 1522bcfd253f..2afce37a7177 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -250,11 +250,13 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, if (!create) goto failed; - if (parms->name[0]) + if (parms->name[0]) { + if (!dev_valid_name(parms->name)) + goto failed; strlcpy(name, parms->name, IFNAMSIZ); - else + } else { strcpy(name, "sit%d"); - + } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!dev) From 5f42df013b8bc1b6511af7a04bf93b014884ae2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:29 -0700 Subject: [PATCH 08/46] ip6_gre: better validate user provided tunnel names Use dev_valid_name() to make sure user does not provide illegal device name. syzbot caught the following bug : BUG: KASAN: stack-out-of-bounds in strlcpy include/linux/string.h:300 [inline] BUG: KASAN: stack-out-of-bounds in ip6gre_tunnel_locate+0x334/0x860 net/ipv6/ip6_gre.c:339 Write of size 20 at addr ffff8801afb9f7b8 by task syzkaller851048/4466 CPU: 1 PID: 4466 Comm: syzkaller851048 Not tainted 4.16.0+ #1 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x1b9/0x29f lib/dump_stack.c:53 print_address_description+0x6c/0x20b mm/kasan/report.c:256 kasan_report_error mm/kasan/report.c:354 [inline] kasan_report.cold.7+0xac/0x2f5 mm/kasan/report.c:412 check_memory_region_inline mm/kasan/kasan.c:260 [inline] check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267 memcpy+0x37/0x50 mm/kasan/kasan.c:303 strlcpy include/linux/string.h:300 [inline] ip6gre_tunnel_locate+0x334/0x860 net/ipv6/ip6_gre.c:339 ip6gre_tunnel_ioctl+0x69d/0x12e0 net/ipv6/ip6_gre.c:1195 dev_ifsioc+0x43e/0xb90 net/core/dev_ioctl.c:334 dev_ioctl+0x69a/0xcc0 net/core/dev_ioctl.c:525 sock_ioctl+0x47e/0x680 net/socket.c:1015 vfs_ioctl fs/ioctl.c:46 [inline] file_ioctl fs/ioctl.c:500 [inline] do_vfs_ioctl+0x1cf/0x1650 fs/ioctl.c:684 ksys_ioctl+0xa9/0xd0 fs/ioctl.c:701 SYSC_ioctl fs/ioctl.c:708 [inline] SyS_ioctl+0x24/0x30 fs/ioctl.c:706 do_syscall_64+0x29e/0x9d0 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index f8a103bdbd60..69727bc168cb 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -335,11 +335,13 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (t || !create) return t; - if (parms->name[0]) + if (parms->name[0]) { + if (!dev_valid_name(parms->name)) + return NULL; strlcpy(name, parms->name, IFNAMSIZ); - else + } else { strcpy(name, "ip6gre%d"); - + } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); if (!dev) From db7a65e3ab78e5b1c4b17c0870ebee35a4ee3257 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:30 -0700 Subject: [PATCH 09/46] ip6_tunnel: better validate user provided tunnel names Use valid_name() to make sure user does not provide illegal device name. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index df4c29f7d59f..da66aaac51ce 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -297,13 +297,16 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; - int err = -ENOMEM; + int err = -E2BIG; - if (p->name[0]) + if (p->name[0]) { + if (!dev_valid_name(p->name)) + goto failed; strlcpy(name, p->name, IFNAMSIZ); - else + } else { sprintf(name, "ip6tnl%%d"); - + } + err = -ENOMEM; dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6_tnl_dev_setup); if (!dev) From 537b361fbcbcc3cd6fe2bb47069fd292b9256d16 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Apr 2018 06:39:31 -0700 Subject: [PATCH 10/46] vti6: better validate user provided tunnel names Use valid_name() to make sure user does not provide illegal device name. Fixes: ed1efb2aefbb ("ipv6: Add support for IPsec virtual tunnel interfaces") Signed-off-by: Eric Dumazet Cc: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv6/ip6_vti.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 6ebb2e8777f4..c214ffec02f0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -212,10 +212,13 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p char name[IFNAMSIZ]; int err; - if (p->name[0]) + if (p->name[0]) { + if (!dev_valid_name(p->name)) + goto failed; strlcpy(name, p->name, IFNAMSIZ); - else + } else { sprintf(name, "ip6_vti%%d"); + } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (!dev) From 58b35f27689b5eb514fc293c332966c226b1b6e4 Mon Sep 17 00:00:00 2001 From: Miguel Fadon Perlines Date: Thu, 5 Apr 2018 10:25:38 +0200 Subject: [PATCH 11/46] arp: fix arp_filter on l3slave devices arp_filter performs an ip_route_output search for arp source address and checks if output device is the same where the arp request was received, if it is not, the arp request is not answered. This route lookup is always done on main route table so l3slave devices never find the proper route and arp is not answered. Passing l3mdev_master_ifindex_rcu(dev) return value as oif fixes the lookup for l3slave devices while maintaining same behavior for non l3slave devices as this function returns 0 in that case. Fixes: 613d09b30f8b ("net: Use VRF device index for lookups on TX") Signed-off-by: Miguel Fadon Perlines Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/arp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index be4c595edccb..bf6c2d4d4fdc 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -437,7 +437,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) /*unsigned long now; */ struct net *net = dev_net(dev); - rt = ip_route_output(net, sip, tip, 0, 0); + rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev)); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { From 3d92f0b582062127026af1fb5e86eda4a3b01783 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 5 Apr 2018 11:55:48 +0200 Subject: [PATCH 12/46] net: mvpp2: Fix parser entry init boundary check Boundary check in mvpp2_prs_init_from_hw must be done according to the passed "tid" parameter, not the mvpp2_prs_entry index, which is not yet initialized at the time of the check. Fixes: 47e0e14eb1a6 ("net: mvpp2: Make mvpp2_prs_hw_read a parser entry init function") Signed-off-by: Maxime Chevallier Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c index 7fc1bbf51c44..54a038943c06 100644 --- a/drivers/net/ethernet/marvell/mvpp2.c +++ b/drivers/net/ethernet/marvell/mvpp2.c @@ -1604,7 +1604,7 @@ static int mvpp2_prs_init_from_hw(struct mvpp2 *priv, { int i; - if (pe->index > MVPP2_PRS_TCAM_SRAM_SIZE - 1) + if (tid > MVPP2_PRS_TCAM_SRAM_SIZE - 1) return -EINVAL; memset(pe, 0, sizeof(*pe)); From 2afc5d61a7197de25a61f54ea4ecfb4cb62b1d42 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Apr 2018 21:09:18 +0200 Subject: [PATCH 13/46] hv_netvsc: Use Windows version instead of NVSP version on GPAD teardown When changing network interface settings, Windows guests older than WS2016 can no longer shutdown. This was addressed by commit 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order on older versions"), however the issue also occurs on WS2012 guests that share NVSP protocol versions with WS2016 guests. Hence we use Windows version directly to differentiate them. Fixes: 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order on older versions") Signed-off-by: Mohammed Gamal Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index c9910c33e671..d65b7fc6c4a4 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -590,13 +590,13 @@ void netvsc_device_remove(struct hv_device *device) netdev_dbg(ndev, "net device safe to remove\n"); /* older versions require that buffer be revoked before close */ - if (net_device->nvsp_version < NVSP_PROTOCOL_VERSION_4) + if (vmbus_proto_version < VERSION_WIN10) netvsc_teardown_gpadl(device, net_device); /* Now, we can close the channel safely */ vmbus_close(device->channel); - if (net_device->nvsp_version >= NVSP_PROTOCOL_VERSION_4) + if (vmbus_proto_version >= VERSION_WIN10) netvsc_teardown_gpadl(device, net_device); /* Release all resources */ From 7992894c305eaf504d005529637ff8283d0a849d Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Apr 2018 21:09:19 +0200 Subject: [PATCH 14/46] hv_netvsc: Split netvsc_revoke_buf() and netvsc_teardown_gpadl() Split each of the functions into two for each of send/recv buffers. This will be needed in order to implement a fine-grained messaging sequence to the host so that we accommodate the requirements of different Windows versions Fixes: 0ef58b0a05c12 ("hv_netvsc: change GPAD teardown order on older versions") Signed-off-by: Mohammed Gamal Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 46 +++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index d65b7fc6c4a4..f4df5def8f62 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -109,11 +109,11 @@ static void free_netvsc_device_rcu(struct netvsc_device *nvdev) call_rcu(&nvdev->rcu, free_netvsc_device); } -static void netvsc_revoke_buf(struct hv_device *device, - struct netvsc_device *net_device) +static void netvsc_revoke_recv_buf(struct hv_device *device, + struct netvsc_device *net_device) { - struct nvsp_message *revoke_packet; struct net_device *ndev = hv_get_drvdata(device); + struct nvsp_message *revoke_packet; int ret; /* @@ -157,6 +157,14 @@ static void netvsc_revoke_buf(struct hv_device *device, } net_device->recv_section_cnt = 0; } +} + +static void netvsc_revoke_send_buf(struct hv_device *device, + struct netvsc_device *net_device) +{ + struct net_device *ndev = hv_get_drvdata(device); + struct nvsp_message *revoke_packet; + int ret; /* Deal with the send buffer we may have setup. * If we got a send section size, it means we received a @@ -202,8 +210,8 @@ static void netvsc_revoke_buf(struct hv_device *device, } } -static void netvsc_teardown_gpadl(struct hv_device *device, - struct netvsc_device *net_device) +static void netvsc_teardown_recv_gpadl(struct hv_device *device, + struct netvsc_device *net_device) { struct net_device *ndev = hv_get_drvdata(device); int ret; @@ -222,6 +230,13 @@ static void netvsc_teardown_gpadl(struct hv_device *device, } net_device->recv_buf_gpadl_handle = 0; } +} + +static void netvsc_teardown_send_gpadl(struct hv_device *device, + struct netvsc_device *net_device) +{ + struct net_device *ndev = hv_get_drvdata(device); + int ret; if (net_device->send_buf_gpadl_handle) { ret = vmbus_teardown_gpadl(device->channel, @@ -437,8 +452,10 @@ static int netvsc_init_buf(struct hv_device *device, goto exit; cleanup: - netvsc_revoke_buf(device, net_device); - netvsc_teardown_gpadl(device, net_device); + netvsc_revoke_recv_buf(device, net_device); + netvsc_revoke_send_buf(device, net_device); + netvsc_teardown_recv_gpadl(device, net_device); + netvsc_teardown_send_gpadl(device, net_device); exit: return ret; @@ -575,7 +592,8 @@ void netvsc_device_remove(struct hv_device *device) = rtnl_dereference(net_device_ctx->nvdev); int i; - netvsc_revoke_buf(device, net_device); + netvsc_revoke_recv_buf(device, net_device); + netvsc_revoke_send_buf(device, net_device); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); @@ -590,14 +608,18 @@ void netvsc_device_remove(struct hv_device *device) netdev_dbg(ndev, "net device safe to remove\n"); /* older versions require that buffer be revoked before close */ - if (vmbus_proto_version < VERSION_WIN10) - netvsc_teardown_gpadl(device, net_device); + if (vmbus_proto_version < VERSION_WIN10) { + netvsc_teardown_recv_gpadl(device, net_device); + netvsc_teardown_send_gpadl(device, net_device); + } /* Now, we can close the channel safely */ vmbus_close(device->channel); - if (vmbus_proto_version >= VERSION_WIN10) - netvsc_teardown_gpadl(device, net_device); + if (vmbus_proto_version >= VERSION_WIN10) { + netvsc_teardown_recv_gpadl(device, net_device); + netvsc_teardown_send_gpadl(device, net_device); + } /* Release all resources */ free_netvsc_device_rcu(net_device); From a56d99d714665591fed8527b90eef21530ea61e0 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Apr 2018 21:09:20 +0200 Subject: [PATCH 15/46] hv_netvsc: Ensure correct teardown message sequence order Prior to commit 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split") the call sequence in netvsc_device_remove() was as follows (as implemented in netvsc_destroy_buf()): 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message 2- Teardown receive buffer GPADL 3- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message 4- Teardown send buffer GPADL 5- Close vmbus This didn't work for WS2016 hosts. Commit 0cf737808ae7 ("hv_netvsc: netvsc_teardown_gpadl() split") rearranged the teardown sequence as follows: 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message 2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message 3- Close vmbus 4- Teardown receive buffer GPADL 5- Teardown send buffer GPADL That worked well for WS2016 hosts, but it prevented guests on older hosts from shutting down after changing network settings. Commit 0ef58b0a05c1 ("hv_netvsc: change GPAD teardown order on older versions") ensured the following message sequence for older hosts 1- Send NVSP_MSG1_TYPE_REVOKE_RECV_BUF message 2- Send NVSP_MSG1_TYPE_REVOKE_SEND_BUF message 3- Teardown receive buffer GPADL 4- Teardown send buffer GPADL 5- Close vmbus However, with this sequence calling `ip link set eth0 mtu 1000` hangs and the process becomes uninterruptible. On futher analysis it turns out that on tearing down the receive buffer GPADL the kernel is waiting indefinitely in vmbus_teardown_gpadl() for a completion to be signaled. Here is a snippet of where this occurs: int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle) { struct vmbus_channel_gpadl_teardown *msg; struct vmbus_channel_msginfo *info; unsigned long flags; int ret; info = kmalloc(sizeof(*info) + sizeof(struct vmbus_channel_gpadl_teardown), GFP_KERNEL); if (!info) return -ENOMEM; init_completion(&info->waitevent); info->waiting_channel = channel; [....] ret = vmbus_post_msg(msg, sizeof(struct vmbus_channel_gpadl_teardown), true); if (ret) goto post_msg_err; wait_for_completion(&info->waitevent); [....] } The completion is signaled from vmbus_ongpadl_torndown(), which gets called when the corresponding message is received from the host, which apparently never happens in that case. This patch works around the issue by restoring the first mentioned message sequence for older hosts Fixes: 0ef58b0a05c1 ("hv_netvsc: change GPAD teardown order on older versions") Signed-off-by: Mohammed Gamal Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index f4df5def8f62..df92c2fa7a87 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -592,8 +592,17 @@ void netvsc_device_remove(struct hv_device *device) = rtnl_dereference(net_device_ctx->nvdev); int i; + /* + * Revoke receive buffer. If host is pre-Win2016 then tear down + * receive buffer GPADL. Do the same for send buffer. + */ netvsc_revoke_recv_buf(device, net_device); + if (vmbus_proto_version < VERSION_WIN10) + netvsc_teardown_recv_gpadl(device, net_device); + netvsc_revoke_send_buf(device, net_device); + if (vmbus_proto_version < VERSION_WIN10) + netvsc_teardown_send_gpadl(device, net_device); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); @@ -607,15 +616,13 @@ void netvsc_device_remove(struct hv_device *device) */ netdev_dbg(ndev, "net device safe to remove\n"); - /* older versions require that buffer be revoked before close */ - if (vmbus_proto_version < VERSION_WIN10) { - netvsc_teardown_recv_gpadl(device, net_device); - netvsc_teardown_send_gpadl(device, net_device); - } - /* Now, we can close the channel safely */ vmbus_close(device->channel); + /* + * If host is Win2016 or higher then we do the GPADL tear down + * here after VMBus is closed. + */ if (vmbus_proto_version >= VERSION_WIN10) { netvsc_teardown_recv_gpadl(device, net_device); netvsc_teardown_send_gpadl(device, net_device); From 3f076effb9adad6a16fafd3cfe33fe530c5d428d Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Thu, 5 Apr 2018 21:09:21 +0200 Subject: [PATCH 16/46] hv_netvsc: Pass net_device parameter to revoke and teardown functions The callers to netvsc_revoke_*_buf() and netvsc_teardown_*_gpadl() already have their net_device instances. Pass them as a paramaeter to the function instead of obtaining them from netvsc_device struct everytime Signed-off-by: Mohammed Gamal Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index df92c2fa7a87..04f611e6f678 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -110,9 +110,9 @@ static void free_netvsc_device_rcu(struct netvsc_device *nvdev) } static void netvsc_revoke_recv_buf(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + struct net_device *ndev) { - struct net_device *ndev = hv_get_drvdata(device); struct nvsp_message *revoke_packet; int ret; @@ -160,9 +160,9 @@ static void netvsc_revoke_recv_buf(struct hv_device *device, } static void netvsc_revoke_send_buf(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + struct net_device *ndev) { - struct net_device *ndev = hv_get_drvdata(device); struct nvsp_message *revoke_packet; int ret; @@ -211,9 +211,9 @@ static void netvsc_revoke_send_buf(struct hv_device *device, } static void netvsc_teardown_recv_gpadl(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + struct net_device *ndev) { - struct net_device *ndev = hv_get_drvdata(device); int ret; if (net_device->recv_buf_gpadl_handle) { @@ -233,9 +233,9 @@ static void netvsc_teardown_recv_gpadl(struct hv_device *device, } static void netvsc_teardown_send_gpadl(struct hv_device *device, - struct netvsc_device *net_device) + struct netvsc_device *net_device, + struct net_device *ndev) { - struct net_device *ndev = hv_get_drvdata(device); int ret; if (net_device->send_buf_gpadl_handle) { @@ -452,10 +452,10 @@ static int netvsc_init_buf(struct hv_device *device, goto exit; cleanup: - netvsc_revoke_recv_buf(device, net_device); - netvsc_revoke_send_buf(device, net_device); - netvsc_teardown_recv_gpadl(device, net_device); - netvsc_teardown_send_gpadl(device, net_device); + netvsc_revoke_recv_buf(device, net_device, ndev); + netvsc_revoke_send_buf(device, net_device, ndev); + netvsc_teardown_recv_gpadl(device, net_device, ndev); + netvsc_teardown_send_gpadl(device, net_device, ndev); exit: return ret; @@ -474,7 +474,6 @@ static int negotiate_nvsp_ver(struct hv_device *device, init_packet->hdr.msg_type = NVSP_MSG_TYPE_INIT; init_packet->msg.init_msg.init.min_protocol_ver = nvsp_ver; init_packet->msg.init_msg.init.max_protocol_ver = nvsp_ver; - trace_nvsp_send(ndev, init_packet); /* Send the init request */ @@ -596,13 +595,13 @@ void netvsc_device_remove(struct hv_device *device) * Revoke receive buffer. If host is pre-Win2016 then tear down * receive buffer GPADL. Do the same for send buffer. */ - netvsc_revoke_recv_buf(device, net_device); + netvsc_revoke_recv_buf(device, net_device, ndev); if (vmbus_proto_version < VERSION_WIN10) - netvsc_teardown_recv_gpadl(device, net_device); + netvsc_teardown_recv_gpadl(device, net_device, ndev); - netvsc_revoke_send_buf(device, net_device); + netvsc_revoke_send_buf(device, net_device, ndev); if (vmbus_proto_version < VERSION_WIN10) - netvsc_teardown_send_gpadl(device, net_device); + netvsc_teardown_send_gpadl(device, net_device, ndev); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); @@ -624,8 +623,8 @@ void netvsc_device_remove(struct hv_device *device) * here after VMBus is closed. */ if (vmbus_proto_version >= VERSION_WIN10) { - netvsc_teardown_recv_gpadl(device, net_device); - netvsc_teardown_send_gpadl(device, net_device); + netvsc_teardown_recv_gpadl(device, net_device, ndev); + netvsc_teardown_send_gpadl(device, net_device, ndev); } /* Release all resources */ From 71a1c915238c970cd9bdd5bf158b1279d6b6d55b Mon Sep 17 00:00:00 2001 From: Jeff Barnhill <0xeffeff@gmail.com> Date: Thu, 5 Apr 2018 21:29:47 +0000 Subject: [PATCH 17/46] net/ipv6: Increment OUTxxx counters after netfilter hook At the end of ip6_forward(), IPSTATS_MIB_OUTFORWDATAGRAMS and IPSTATS_MIB_OUTOCTETS are incremented immediately before the NF_HOOK call for NFPROTO_IPV6 / NF_INET_FORWARD. As a result, these counters get incremented regardless of whether or not the netfilter hook allows the packet to continue being processed. This change increments the counters in ip6_forward_finish() so that it will not happen if the netfilter hook chooses to terminate the packet, which is similar to how IPv4 works. Signed-off-by: Jeff Barnhill <0xeffeff@gmail.com> Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b8ee50e94af3..2e891d2c30ef 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -375,6 +375,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + return dst_output(net, sk, skb); } @@ -569,8 +574,6 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; - __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); From 3239534a79ee6f20cffd974173a1e62e0730e8ac Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 6 Apr 2018 01:19:37 +0200 Subject: [PATCH 18/46] net/sched: fix NULL dereference in the error path of tcf_bpf_init() when tcf_bpf_init_from_ops() fails (e.g. because of program having invalid number of instructions), tcf_bpf_cfg_cleanup() calls bpf_prog_put(NULL) or bpf_prog_destroy(NULL). Unless CONFIG_BPF_SYSCALL is unset, this causes the following error: BUG: unable to handle kernel NULL pointer dereference at 0000000000000020 PGD 800000007345a067 P4D 800000007345a067 PUD 340e1067 PMD 0 Oops: 0000 [#1] SMP PTI Modules linked in: act_bpf(E) ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache jbd2 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_codec_generic pcbc snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm aesni_intel crypto_simd glue_helper cryptd joydev snd_timer snd virtio_balloon pcspkr soundcore i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_blk drm virtio_net virtio_console i2c_core crc32c_intel serio_raw virtio_pci ata_piix libata virtio_ring floppy virtio dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_bpf] CPU: 3 PID: 5654 Comm: tc Tainted: G E 4.16.0.bpf_test+ #408 Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 RIP: 0010:__bpf_prog_put+0xc/0xc0 RSP: 0018:ffff9594003ef728 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff9594003ef758 RCX: 0000000000000024 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000044 R10: 0000000000000220 R11: ffff8a7ab9f17131 R12: 0000000000000000 R13: ffff8a7ab7c3c8e0 R14: 0000000000000001 R15: ffff8a7ab88f1054 FS: 00007fcb2f17c740(0000) GS:ffff8a7abfd80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000020 CR3: 000000007c888006 CR4: 00000000001606e0 Call Trace: tcf_bpf_cfg_cleanup+0x2f/0x40 [act_bpf] tcf_bpf_cleanup+0x4c/0x70 [act_bpf] __tcf_idr_release+0x79/0x140 tcf_bpf_init+0x125/0x330 [act_bpf] tcf_action_init_1+0x2cc/0x430 ? get_page_from_freelist+0x3f0/0x11b0 tcf_action_init+0xd3/0x1b0 tc_ctl_action+0x18b/0x240 rtnetlink_rcv_msg+0x29c/0x310 ? _cond_resched+0x15/0x30 ? __kmalloc_node_track_caller+0x1b9/0x270 ? rtnl_calcit.isra.29+0x100/0x100 netlink_rcv_skb+0xd2/0x110 netlink_unicast+0x17c/0x230 netlink_sendmsg+0x2cd/0x3c0 sock_sendmsg+0x30/0x40 ___sys_sendmsg+0x27a/0x290 ? mem_cgroup_commit_charge+0x80/0x130 ? page_add_new_anon_rmap+0x73/0xc0 ? do_anonymous_page+0x2a2/0x560 ? __handle_mm_fault+0xc75/0xe20 __sys_sendmsg+0x58/0xa0 do_syscall_64+0x6e/0x1a0 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x7fcb2e58eba0 RSP: 002b:00007ffc93c496c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007ffc93c497f0 RCX: 00007fcb2e58eba0 RDX: 0000000000000000 RSI: 00007ffc93c49740 RDI: 0000000000000003 RBP: 000000005ac6a646 R08: 0000000000000002 R09: 0000000000000000 R10: 00007ffc93c49120 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffc93c49804 R14: 0000000000000001 R15: 000000000066afa0 Code: 5f 00 48 8b 43 20 48 c7 c7 70 2f 7c b8 c7 40 10 00 00 00 00 5b e9 a5 8b 61 00 0f 1f 44 00 00 0f 1f 44 00 00 41 54 55 48 89 fd 53 <48> 8b 47 20 f0 ff 08 74 05 5b 5d 41 5c c3 41 89 f4 0f 1f 44 00 RIP: __bpf_prog_put+0xc/0xc0 RSP: ffff9594003ef728 CR2: 0000000000000020 Fix it in tcf_bpf_cfg_cleanup(), ensuring that bpf_prog_{put,destroy}(f) is called only when f is not NULL. Fixes: bbc09e7842a5 ("net/sched: fix idr leak on the error path of tcf_bpf_init()") Reported-by: Lucas Bates Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 9092531d45d8..18089c02e557 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -248,10 +248,14 @@ static int tcf_bpf_init_from_efd(struct nlattr **tb, struct tcf_bpf_cfg *cfg) static void tcf_bpf_cfg_cleanup(const struct tcf_bpf_cfg *cfg) { - if (cfg->is_ebpf) - bpf_prog_put(cfg->filter); - else - bpf_prog_destroy(cfg->filter); + struct bpf_prog *filter = cfg->filter; + + if (filter) { + if (cfg->is_ebpf) + bpf_prog_put(filter); + else + bpf_prog_destroy(filter); + } kfree(cfg->bpf_ops); kfree(cfg->bpf_name); From 63bb4e1ebd9876d250eea24d27a2caf3e0e3302c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 28 Mar 2018 12:50:45 +0000 Subject: [PATCH 19/46] ice: Fix error return code in ice_init_hw() Fix to return error code ICE_ERR_NO_MEMORY from the alloc error handling case instead of 0, as done elsewhere in this function. Fixes: dc49c7723676 ("ice: Get MAC/PHY/link info and scheduler topology") Signed-off-by: Wei Yongjun Acked-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_common.c b/drivers/net/ethernet/intel/ice/ice_common.c index 385f5d425d19..21977ec984c4 100644 --- a/drivers/net/ethernet/intel/ice/ice_common.c +++ b/drivers/net/ethernet/intel/ice/ice_common.c @@ -468,8 +468,10 @@ enum ice_status ice_init_hw(struct ice_hw *hw) mac_buf_len = sizeof(struct ice_aqc_manage_mac_read_resp); mac_buf = devm_kzalloc(ice_hw_to_dev(hw), mac_buf_len, GFP_KERNEL); - if (!mac_buf) + if (!mac_buf) { + status = ICE_ERR_NO_MEMORY; goto err_unroll_fltr_mgmt_struct; + } status = ice_aq_manage_mac_read(hw, mac_buf, mac_buf_len, NULL); devm_kfree(ice_hw_to_dev(hw), mac_buf); From cba5957d7e08fffd6237e4bd8f42adad057ab8bb Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan Date: Mon, 2 Apr 2018 09:26:06 -0700 Subject: [PATCH 20/46] ice: Bug fixes in ethtool code 1) Return correct size from ice_get_regs_len. 2) Fix incorrect use of ARRAY_SIZE in ice_get_regs. Fixes: fcea6f3da546 (ice: Add stats and ethtool support) Signed-off-by: Anirudh Venkataramanan Tested-by: Tony Brelinski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 186764a5c263..1db304c01d10 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -156,7 +156,7 @@ ice_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) static int ice_get_regs_len(struct net_device __always_unused *netdev) { - return ARRAY_SIZE(ice_regs_dump_list); + return sizeof(ice_regs_dump_list); } static void @@ -170,7 +170,7 @@ ice_get_regs(struct net_device *netdev, struct ethtool_regs *regs, void *p) regs->version = 1; - for (i = 0; i < ARRAY_SIZE(ice_regs_dump_list) / sizeof(u32); ++i) + for (i = 0; i < ARRAY_SIZE(ice_regs_dump_list); ++i) regs_buf[i] = rd32(hw, ice_regs_dump_list[i]); } From dd9a122ae99ae471beed4d4f8073d71e8d31ffa6 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Thu, 5 Apr 2018 22:40:29 +0200 Subject: [PATCH 21/46] net: phy: marvell: Enable interrupt function on LED2 pin The LED2[2]/INTn pin on Marvell 88E1318S as well as 88E1510/12/14/18 needs to be configured to be usable as interrupt not only when WOL is enabled, but whenever we rely on interrupts from the PHY. Signed-off-by: Esben Haabendal Cc: Rasmus Villemoes Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index a75c511950c3..c22e8e383247 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -828,6 +828,22 @@ static int m88e1121_config_init(struct phy_device *phydev) return marvell_config_init(phydev); } +static int m88e1318_config_init(struct phy_device *phydev) +{ + if (phy_interrupt_is_valid(phydev)) { + int err = phy_modify_paged( + phydev, MII_MARVELL_LED_PAGE, + MII_88E1318S_PHY_LED_TCR, + MII_88E1318S_PHY_LED_TCR_FORCE_INT, + MII_88E1318S_PHY_LED_TCR_INTn_ENABLE | + MII_88E1318S_PHY_LED_TCR_INT_ACTIVE_LOW); + if (err < 0) + return err; + } + + return m88e1121_config_init(phydev); +} + static int m88e1510_config_init(struct phy_device *phydev) { int err; @@ -870,7 +886,7 @@ static int m88e1510_config_init(struct phy_device *phydev) phydev->advertising &= ~pause; } - return m88e1121_config_init(phydev); + return m88e1318_config_init(phydev); } static int m88e1118_config_aneg(struct phy_device *phydev) @@ -2086,7 +2102,7 @@ static struct phy_driver marvell_drivers[] = { .features = PHY_GBIT_FEATURES, .flags = PHY_HAS_INTERRUPT, .probe = marvell_probe, - .config_init = &m88e1121_config_init, + .config_init = &m88e1318_config_init, .config_aneg = &m88e1318_config_aneg, .read_status = &marvell_read_status, .ack_interrupt = &marvell_ack_interrupt, From f12c643209db0626f2f54780d86bb93bfa7a9c2d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 6 Apr 2018 17:19:41 -0700 Subject: [PATCH 22/46] net_sched: fix a missing idr_remove() in u32_delete_key() When we delete a u32 key via u32_delete_key(), we forget to call idr_remove() to remove its handle from IDR. Fixes: e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles") Reported-by: Marcin Kabiesz Tested-by: Marcin Kabiesz Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ed8b6a24b9e9..bac47b5d18fd 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -489,6 +489,7 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key) RCU_INIT_POINTER(*kp, key->next); tcf_unbind_filter(tp, &key->res); + idr_remove(&ht->handle_idr, key->handle); tcf_exts_get_net(&key->exts); call_rcu(&key->rcu, u32_delete_key_freepf_rcu); return 0; From a466856e0b7ab269cdf9461886d007e88ff575b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:36 -0700 Subject: [PATCH 23/46] crypto: af_alg - fix possible uninit-value in alg_bind() syzbot reported : BUG: KMSAN: uninit-value in alg_bind+0xe3/0xd90 crypto/af_alg.c:162 We need to check addr_len before dereferencing sa (or uaddr) Fixes: bb30b8848c85 ("crypto: af_alg - whitelist mask and type") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Stephan Mueller Cc: Herbert Xu Signed-off-by: David S. Miller --- crypto/af_alg.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index c49766b03165..7846c0c20cfe 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -158,16 +158,16 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) void *private; int err; - /* If caller uses non-allowed flag, return error. */ - if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed)) - return -EINVAL; - if (sock->state == SS_CONNECTED) return -EINVAL; if (addr_len < sizeof(*sa)) return -EINVAL; + /* If caller uses non-allowed flag, return error. */ + if ((sa->salg_feat & ~allowed) || (sa->salg_mask & ~allowed)) + return -EINVAL; + sa->salg_type[sizeof(sa->salg_type) - 1] = 0; sa->salg_name[sizeof(sa->salg_name) + addr_len - sizeof(*sa) - 1] = 0; From 6091f09c2f79730d895149bcfe3d66140288cd0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:37 -0700 Subject: [PATCH 24/46] netlink: fix uninit-value in netlink_sendmsg syzbot reported : BUG: KMSAN: uninit-value in ffs arch/x86/include/asm/bitops.h:432 [inline] BUG: KMSAN: uninit-value in netlink_sendmsg+0xb26/0x1310 net/netlink/af_netlink.c:1851 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fa556fdef57d..55342c4d5cec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1844,6 +1844,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_namelen) { err = -EINVAL; + if (msg->msg_namelen < sizeof(struct sockaddr_nl)) + goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; From b1993a2de12c9e75c35729e2ffbc3a92d50c0d31 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:38 -0700 Subject: [PATCH 25/46] net: fix rtnh_ok() syzbot reported : BUG: KMSAN: uninit-value in rtnh_ok include/net/nexthop.h:11 [inline] BUG: KMSAN: uninit-value in fib_count_nexthops net/ipv4/fib_semantics.c:469 [inline] BUG: KMSAN: uninit-value in fib_create_info+0x554/0x8d20 net/ipv4/fib_semantics.c:1091 @remaining is an integer, coming from user space. If it is negative we want rtnh_ok() to return false. Fixes: 4e902c57417c ("[IPv4]: FIB configuration using struct fib_config") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/nexthop.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/nexthop.h b/include/net/nexthop.h index 36bb794f5cd6..902ff382a6dc 100644 --- a/include/net/nexthop.h +++ b/include/net/nexthop.h @@ -7,7 +7,7 @@ static inline int rtnh_ok(const struct rtnexthop *rtnh, int remaining) { - return remaining >= sizeof(*rtnh) && + return remaining >= (int)sizeof(*rtnh) && rtnh->rtnh_len >= sizeof(*rtnh) && rtnh->rtnh_len <= remaining; } From b13dda9f9aa7caceeee61c080c2e544d5f5d85e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:39 -0700 Subject: [PATCH 26/46] net: initialize skb->peeked when cloning syzbot reported __skb_try_recv_from_queue() was using skb->peeked while it was potentially unitialized. We need to clear it in __skb_clone() Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/skbuff.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bca1e0fc8f7..345b51837ca8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -857,6 +857,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb) n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len; n->cloned = 1; n->nohdr = 0; + n->peeked = 0; n->destructor = NULL; C(tail); C(end); From 77d36398d99f2565c0a8d43a86fd520a82e64bb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:40 -0700 Subject: [PATCH 27/46] net: fix uninit-value in __hw_addr_add_ex() syzbot complained : BUG: KMSAN: uninit-value in memcmp+0x119/0x180 lib/string.c:861 CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 memcmp+0x119/0x180 lib/string.c:861 __hw_addr_add_ex net/core/dev_addr_lists.c:60 [inline] __dev_mc_add+0x1c2/0x8e0 net/core/dev_addr_lists.c:670 dev_mc_add+0x6d/0x80 net/core/dev_addr_lists.c:687 igmp6_group_added+0x2db/0xa00 net/ipv6/mcast.c:662 ipv6_dev_mc_inc+0xe9e/0x1130 net/ipv6/mcast.c:914 addrconf_join_solict net/ipv6/addrconf.c:2078 [inline] addrconf_dad_begin net/ipv6/addrconf.c:3828 [inline] addrconf_dad_work+0x427/0x2150 net/ipv6/addrconf.c:3954 process_one_work+0x12c6/0x1f60 kernel/workqueue.c:2113 worker_thread+0x113c/0x24f0 kernel/workqueue.c:2247 kthread+0x539/0x720 kernel/kthread.c:239 Fixes: f001fde5eadd ("net: introduce a list of device addresses dev_addr_list (v6)") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index c0548d268e1a..e3e6a3e2ca22 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -57,8 +57,8 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, return -EINVAL; list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - ha->type == addr_type) { + if (ha->type == addr_type && + !memcmp(ha->addr, addr, addr_len)) { if (global) { /* check if addr is already used as global */ if (ha->global_use) From b855ff827476adbdc2259e9895681d82b7b26065 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:41 -0700 Subject: [PATCH 28/46] dccp: initialize ireq->ir_mark syzbot reported an uninit-value read of skb->mark in iptable_mangle_hook() Thanks to the nice report, I tracked the problem to dccp not caring of ireq->ir_mark for passive sessions. BUG: KMSAN: uninit-value in ipt_mangle_out net/ipv4/netfilter/iptable_mangle.c:66 [inline] BUG: KMSAN: uninit-value in iptable_mangle_hook+0x5e5/0x720 net/ipv4/netfilter/iptable_mangle.c:84 CPU: 0 PID: 5300 Comm: syz-executor3 Not tainted 4.16.0+ #81 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 ipt_mangle_out net/ipv4/netfilter/iptable_mangle.c:66 [inline] iptable_mangle_hook+0x5e5/0x720 net/ipv4/netfilter/iptable_mangle.c:84 nf_hook_entry_hookfn include/linux/netfilter.h:120 [inline] nf_hook_slow+0x158/0x3d0 net/netfilter/core.c:483 nf_hook include/linux/netfilter.h:243 [inline] __ip_local_out net/ipv4/ip_output.c:113 [inline] ip_local_out net/ipv4/ip_output.c:122 [inline] ip_queue_xmit+0x1d21/0x21c0 net/ipv4/ip_output.c:504 dccp_transmit_skb+0x15eb/0x1900 net/dccp/output.c:142 dccp_xmit_packet+0x814/0x9e0 net/dccp/output.c:281 dccp_write_xmit+0x20f/0x480 net/dccp/output.c:363 dccp_sendmsg+0x12ca/0x12d0 net/dccp/proto.c:818 inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x455259 RSP: 002b:00007f1a4473dc68 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f1a4473e6d4 RCX: 0000000000455259 RDX: 0000000000000000 RSI: 0000000020b76fc8 RDI: 0000000000000015 RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000000004f0 R14: 00000000006fa720 R15: 0000000000000000 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 ip_queue_xmit+0x1e35/0x21c0 net/ipv4/ip_output.c:502 dccp_transmit_skb+0x15eb/0x1900 net/dccp/output.c:142 dccp_xmit_packet+0x814/0x9e0 net/dccp/output.c:281 dccp_write_xmit+0x20f/0x480 net/dccp/output.c:363 dccp_sendmsg+0x12ca/0x12d0 net/dccp/proto.c:818 inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] ___sys_sendmsg+0xec0/0x1310 net/socket.c:2046 __sys_sendmsg net/socket.c:2080 [inline] SYSC_sendmsg+0x2a3/0x3d0 net/socket.c:2091 SyS_sendmsg+0x54/0x80 net/socket.c:2087 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 inet_csk_clone_lock+0x503/0x580 net/ipv4/inet_connection_sock.c:797 dccp_create_openreq_child+0x7f/0x890 net/dccp/minisocks.c:92 dccp_v4_request_recv_sock+0x22c/0xe90 net/dccp/ipv4.c:408 dccp_v6_request_recv_sock+0x290/0x2000 net/dccp/ipv6.c:414 dccp_check_req+0x7b9/0x8f0 net/dccp/minisocks.c:197 dccp_v4_rcv+0x12e4/0x2630 net/dccp/ipv4.c:840 ip_local_deliver_finish+0x6ed/0xd40 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:288 [inline] ip_local_deliver+0x43c/0x4e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:449 [inline] ip_rcv_finish+0x1253/0x16d0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:288 [inline] ip_rcv+0x119d/0x16f0 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x47cf/0x4a80 net/core/dev.c:4562 __netif_receive_skb net/core/dev.c:4627 [inline] process_backlog+0x62d/0xe20 net/core/dev.c:5307 napi_poll net/core/dev.c:5705 [inline] net_rx_action+0x7c1/0x1a70 net/core/dev.c:5771 __do_softirq+0x56d/0x93d kernel/softirq.c:285 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmem_cache_alloc+0xaab/0xb90 mm/slub.c:2756 reqsk_alloc include/net/request_sock.h:88 [inline] inet_reqsk_alloc+0xc4/0x7f0 net/ipv4/tcp_input.c:6145 dccp_v4_conn_request+0x5cc/0x1770 net/dccp/ipv4.c:600 dccp_v6_conn_request+0x299/0x1880 net/dccp/ipv6.c:317 dccp_rcv_state_process+0x2ea/0x2410 net/dccp/input.c:612 dccp_v4_do_rcv+0x229/0x340 net/dccp/ipv4.c:682 dccp_v6_do_rcv+0x16d/0x1220 net/dccp/ipv6.c:578 sk_backlog_rcv include/net/sock.h:908 [inline] __sk_receive_skb+0x60e/0xf20 net/core/sock.c:513 dccp_v4_rcv+0x24d4/0x2630 net/dccp/ipv4.c:874 ip_local_deliver_finish+0x6ed/0xd40 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:288 [inline] ip_local_deliver+0x43c/0x4e0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:449 [inline] ip_rcv_finish+0x1253/0x16d0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:288 [inline] ip_rcv+0x119d/0x16f0 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x47cf/0x4a80 net/core/dev.c:4562 __netif_receive_skb net/core/dev.c:4627 [inline] process_backlog+0x62d/0xe20 net/core/dev.c:5307 napi_poll net/core/dev.c:5705 [inline] net_rx_action+0x7c1/0x1a70 net/core/dev.c:5771 __do_softirq+0x56d/0x93d kernel/softirq.c:285 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 1 + net/dccp/ipv6.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e65fcb45c3f6..b08feb219b44 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -614,6 +614,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->ir_mark = inet_request_mark(sk, skb); ireq->ireq_family = AF_INET; ireq->ir_iif = sk->sk_bound_dev_if; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5df7857fc0f3..6344f1b18a6a 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -351,6 +351,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ireq_family = AF_INET6; + ireq->ir_mark = inet_request_mark(sk, skb); if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || From d0ea2b12500543535be3f54e17920fffc9bb45f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:42 -0700 Subject: [PATCH 29/46] ipv4: fix uninit-value in ip_route_output_key_hash_rcu() syzbot complained that res.type could be used while not initialized. Using RTN_UNSPEC as initial value seems better than using garbage. BUG: KMSAN: uninit-value in __mkroute_output net/ipv4/route.c:2200 [inline] BUG: KMSAN: uninit-value in ip_route_output_key_hash_rcu+0x31f0/0x3940 net/ipv4/route.c:2493 CPU: 1 PID: 12207 Comm: syz-executor0 Not tainted 4.16.0+ #81 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 __mkroute_output net/ipv4/route.c:2200 [inline] ip_route_output_key_hash_rcu+0x31f0/0x3940 net/ipv4/route.c:2493 ip_route_output_key_hash net/ipv4/route.c:2322 [inline] __ip_route_output_key include/net/route.h:126 [inline] ip_route_output_flow+0x1eb/0x3c0 net/ipv4/route.c:2577 raw_sendmsg+0x1861/0x3ed0 net/ipv4/raw.c:653 inet_sendmsg+0x48d/0x740 net/ipv4/af_inet.c:764 sock_sendmsg_nosec net/socket.c:630 [inline] sock_sendmsg net/socket.c:640 [inline] SYSC_sendto+0x6c3/0x7e0 net/socket.c:1747 SyS_sendto+0x8a/0xb0 net/socket.c:1715 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x455259 RSP: 002b:00007fdc0625dc68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 00007fdc0625e6d4 RCX: 0000000000455259 RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000013 RBP: 000000000072bea0 R08: 0000000020000080 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000000004f7 R14: 00000000006fa7c8 R15: 0000000000000000 Local variable description: ----res.i.i@ip_route_output_flow Variable was created at: ip_route_output_flow+0x75/0x3c0 net/ipv4/route.c:2576 raw_sendmsg+0x1861/0x3ed0 net/ipv4/raw.c:653 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8322e479f299..59bc6ab1a4eb 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2297,13 +2297,14 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { __u8 tos = RT_FL_TOS(fl4); - struct fib_result res; + struct fib_result res = { + .type = RTN_UNSPEC, + .fi = NULL, + .table = NULL, + .tclassid = 0, + }; struct rtable *rth; - res.tclassid = 0; - res.fi = NULL; - res.table = NULL; - fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? From 3099a52918937ab86ec47038ad80d377ba16c531 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 13:42:43 -0700 Subject: [PATCH 30/46] soreuseport: initialise timewait reuseport field syzbot reported an uninit-value in inet_csk_bind_conflict() [1] It turns out we never propagated sk->sk_reuseport into timewait socket. [1] BUG: KMSAN: uninit-value in inet_csk_bind_conflict+0x5f9/0x990 net/ipv4/inet_connection_sock.c:151 CPU: 1 PID: 3589 Comm: syzkaller008242 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 inet_csk_bind_conflict+0x5f9/0x990 net/ipv4/inet_connection_sock.c:151 inet_csk_get_port+0x1d28/0x1e40 net/ipv4/inet_connection_sock.c:320 inet6_bind+0x121c/0x1820 net/ipv6/af_inet6.c:399 SYSC_bind+0x3f2/0x4b0 net/socket.c:1474 SyS_bind+0x54/0x80 net/socket.c:1460 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x4416e9 RSP: 002b:00007ffce6d15c88 EFLAGS: 00000217 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 0100000000000000 RCX: 00000000004416e9 RDX: 000000000000001c RSI: 0000000020402000 RDI: 0000000000000004 RBP: 0000000000000000 R08: 00000000e6d15e08 R09: 00000000e6d15e08 R10: 0000000000000004 R11: 0000000000000217 R12: 0000000000009478 R13: 00000000006cd448 R14: 0000000000000000 R15: 0000000000000000 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 tcp_time_wait+0xf17/0xf50 net/ipv4/tcp_minisocks.c:283 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Uninit was stored to memory at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_save_stack mm/kmsan/kmsan.c:293 [inline] kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684 __msan_chain_origin+0x69/0xc0 mm/kmsan/kmsan_instr.c:521 inet_twsk_alloc+0xaef/0xc00 net/ipv4/inet_timewait_sock.c:182 tcp_time_wait+0xd9/0xf50 net/ipv4/tcp_minisocks.c:258 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmem_cache_alloc+0xaab/0xb90 mm/slub.c:2756 inet_twsk_alloc+0x13b/0xc00 net/ipv4/inet_timewait_sock.c:163 tcp_time_wait+0xd9/0xf50 net/ipv4/tcp_minisocks.c:258 tcp_rcv_state_process+0xebe/0x6490 net/ipv4/tcp_input.c:6003 tcp_v6_do_rcv+0x11dd/0x1d90 net/ipv6/tcp_ipv6.c:1331 sk_backlog_rcv include/net/sock.h:908 [inline] __release_sock+0x2d6/0x680 net/core/sock.c:2271 release_sock+0x97/0x2a0 net/core/sock.c:2786 tcp_close+0x277/0x18f0 net/ipv4/tcp.c:2269 inet_release+0x240/0x2a0 net/ipv4/af_inet.c:427 inet6_release+0xaf/0x100 net/ipv6/af_inet6.c:435 sock_release net/socket.c:595 [inline] sock_close+0xe0/0x300 net/socket.c:1149 __fput+0x49e/0xa10 fs/file_table.c:209 ____fput+0x37/0x40 fs/file_table.c:243 task_work_run+0x243/0x2c0 kernel/task_work.c:113 exit_task_work include/linux/task_work.h:22 [inline] do_exit+0x10e1/0x38d0 kernel/exit.c:867 do_group_exit+0x1a0/0x360 kernel/exit.c:970 SYSC_exit_group+0x21/0x30 kernel/exit.c:981 SyS_exit_group+0x25/0x30 kernel/exit.c:979 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: da5e36308d9f ("soreuseport: TCP/IPv4 implementation") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 1 + net/ipv4/inet_timewait_sock.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 899495589a7e..c7be1ca8e562 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -43,6 +43,7 @@ struct inet_timewait_sock { #define tw_family __tw_common.skc_family #define tw_state __tw_common.skc_state #define tw_reuse __tw_common.skc_reuse +#define tw_reuseport __tw_common.skc_reuseport #define tw_ipv6only __tw_common.skc_ipv6only #define tw_bound_dev_if __tw_common.skc_bound_dev_if #define tw_node __tw_common.skc_nulls_node diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c3ea4906d237..88c5069b5d20 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -178,6 +178,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; + tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet->transparent; From 6780db244d6b1537d139dea0ec8aad10cf9e4adb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 7 Apr 2018 17:15:22 -0700 Subject: [PATCH 31/46] sctp: do not leak kernel memory to user space syzbot produced a nice report [1] Issue here is that a recvmmsg() managed to leak 8 bytes of kernel memory to user space, because sin_zero (padding field) was not properly cleared. [1] BUG: KMSAN: uninit-value in copy_to_user include/linux/uaccess.h:184 [inline] BUG: KMSAN: uninit-value in move_addr_to_user+0x32e/0x530 net/socket.c:227 CPU: 1 PID: 3586 Comm: syzkaller481044 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 kmsan_internal_check_memory+0x164/0x1d0 mm/kmsan/kmsan.c:1176 kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199 copy_to_user include/linux/uaccess.h:184 [inline] move_addr_to_user+0x32e/0x530 net/socket.c:227 ___sys_recvmsg+0x4e2/0x810 net/socket.c:2211 __sys_recvmmsg+0x54e/0xdb0 net/socket.c:2313 SYSC_recvmmsg+0x29b/0x3e0 net/socket.c:2394 SyS_recvmmsg+0x76/0xa0 net/socket.c:2378 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x4401c9 RSP: 002b:00007ffc56f73098 EFLAGS: 00000217 ORIG_RAX: 000000000000012b RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004401c9 RDX: 0000000000000001 RSI: 0000000020003ac0 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000020003bc0 R09: 0000000000000010 R10: 0000000000000000 R11: 0000000000000217 R12: 0000000000401af0 R13: 0000000000401b80 R14: 0000000000000000 R15: 0000000000000000 Local variable description: ----addr@___sys_recvmsg Variable was created at: ___sys_recvmsg+0xd5/0x810 net/socket.c:2172 __sys_recvmmsg+0x54e/0xdb0 net/socket.c:2313 Bytes 8-15 of 16 are uninitialized ================================================================== Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 3586 Comm: syzkaller481044 Tainted: G B 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 panic+0x39d/0x940 kernel/panic.c:183 kmsan_report+0x238/0x240 mm/kmsan/kmsan.c:1083 kmsan_internal_check_memory+0x164/0x1d0 mm/kmsan/kmsan.c:1176 kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199 copy_to_user include/linux/uaccess.h:184 [inline] move_addr_to_user+0x32e/0x530 net/socket.c:227 ___sys_recvmsg+0x4e2/0x810 net/socket.c:2211 __sys_recvmmsg+0x54e/0xdb0 net/socket.c:2313 SYSC_recvmmsg+0x29b/0x3e0 net/socket.c:2394 SyS_recvmmsg+0x76/0xa0 net/socket.c:2378 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Cc: Neil Horman Reported-by: syzbot Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 0d873c58e516..f1fc48e9689c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -728,8 +728,10 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) sctp_v6_map_v4(addr); } - if (addr->sa.sa_family == AF_INET) + if (addr->sa.sa_family == AF_INET) { + memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); + } return sizeof(struct sockaddr_in6); } From fc5f33768cca7144f8d793205b229d46740d183b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 7 Apr 2018 20:37:40 +0200 Subject: [PATCH 32/46] net: dsa: Discard frames from unused ports The Marvell switches under some conditions will pass a frame to the host with the port being the CPU port. Such frames are invalid, and should be dropped. Not dropping them can result in a crash when incrementing the receive statistics for an invalid port. Reported-by: Chris Healy Fixes: 91da11f870f0 ("net: Distributed Switch Architecture protocol support") Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 70de7895e5b8..053731473c99 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -126,6 +126,7 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, struct dsa_port *cpu_dp = dev->dsa_ptr; struct dsa_switch_tree *dst = cpu_dp->dst; struct dsa_switch *ds; + struct dsa_port *slave_port; if (device < 0 || device >= DSA_MAX_SWITCHES) return NULL; @@ -137,7 +138,12 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, if (port < 0 || port >= ds->num_ports) return NULL; - return ds->ports[port].slave; + slave_port = &ds->ports[port]; + + if (unlikely(slave_port->type != DSA_PORT_TYPE_USER)) + return NULL; + + return slave_port->slave; } /* port.c */ From 81e98370293afcb58340ce8bd71af7b97f925c26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 8 Apr 2018 07:52:08 -0700 Subject: [PATCH 33/46] sctp: sctp_sockaddr_af must check minimal addr length for AF_INET6 Check must happen before call to ipv6_addr_v4mapped() syzbot report was : BUG: KMSAN: uninit-value in sctp_sockaddr_af net/sctp/socket.c:359 [inline] BUG: KMSAN: uninit-value in sctp_do_bind+0x60f/0xdc0 net/sctp/socket.c:384 CPU: 0 PID: 3576 Comm: syzkaller968804 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 sctp_sockaddr_af net/sctp/socket.c:359 [inline] sctp_do_bind+0x60f/0xdc0 net/sctp/socket.c:384 sctp_bind+0x149/0x190 net/sctp/socket.c:332 inet6_bind+0x1fd/0x1820 net/ipv6/af_inet6.c:293 SYSC_bind+0x3f2/0x4b0 net/socket.c:1474 SyS_bind+0x54/0x80 net/socket.c:1460 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x43fd49 RSP: 002b:00007ffe99df3d28 EFLAGS: 00000213 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fd49 RDX: 0000000000000010 RSI: 0000000020000000 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401670 R13: 0000000000401700 R14: 0000000000000000 R15: 0000000000000000 Local variable description: ----address@SYSC_bind Variable was created at: SYSC_bind+0x6f/0x4b0 net/socket.c:1461 SyS_bind+0x54/0x80 net/socket.c:1460 Signed-off-by: Eric Dumazet Cc: Vlad Yasevich Cc: Neil Horman Reported-by: syzbot Signed-off-by: David S. Miller --- net/sctp/socket.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 7a10ae3c3d82..eb712df7156e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -357,11 +357,14 @@ static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, if (!opt->pf->af_supported(addr->sa.sa_family, opt)) return NULL; - /* V4 mapped address are really of AF_INET family */ - if (addr->sa.sa_family == AF_INET6 && - ipv6_addr_v4mapped(&addr->v6.sin6_addr) && - !opt->pf->af_supported(AF_INET, opt)) - return NULL; + if (addr->sa.sa_family == AF_INET6) { + if (len < SIN6_LEN_RFC2133) + return NULL; + /* V4 mapped address are really of AF_INET family */ + if (ipv6_addr_v4mapped(&addr->v6.sin6_addr) && + !opt->pf->af_supported(AF_INET, opt)) + return NULL; + } /* If we get this far, af is valid. */ af = sctp_get_af_specific(addr->sa.sa_family); From e41f0548473eb7b6499bd8482474e30ae6d31220 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 6 Apr 2018 18:54:52 -0700 Subject: [PATCH 34/46] tipc: use the right skb in tipc_sk_fill_sock_diag() Commit 4b2e6877b879 ("tipc: Fix namespace violation in tipc_sk_fill_sock_diag") tried to fix the crash but failed, the crash is still 100% reproducible with it. In tipc_sk_fill_sock_diag(), skb is the diag dump we are filling, it is not correct to retrieve its NETLINK_CB(), instead, like other protocol diag, we should use NETLINK_CB(cb->skb).sk here. Reported-by: Fixes: 4b2e6877b879 ("tipc: Fix namespace violation in tipc_sk_fill_sock_diag") Fixes: c30b70deb5f4 (tipc: implement socket diagnostics for AF_TIPC) Cc: GhantaKrishnamurthy MohanKrishna Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/tipc/diag.c | 2 +- net/tipc/socket.c | 6 +++--- net/tipc/socket.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/tipc/diag.c b/net/tipc/diag.c index 46d9cd62f781..aaabb0b776dd 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -59,7 +59,7 @@ static int __tipc_add_sock_diag(struct sk_buff *skb, if (!nlh) return -EMSGSIZE; - err = tipc_sk_fill_sock_diag(skb, tsk, req->tidiag_states, + err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index cee6674a3bf4..1fd1c8b5ce03 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3257,8 +3257,8 @@ out: } EXPORT_SYMBOL(tipc_nl_sk_walk); -int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, - u32 sk_filter_state, +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, + struct tipc_sock *tsk, u32 sk_filter_state, u64 (*tipc_diag_gen_cookie)(struct sock *sk)) { struct sock *sk = &tsk->sk; @@ -3280,7 +3280,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, nla_put_u32(skb, TIPC_NLA_SOCK_TIPC_STATE, (u32)sk->sk_state) || nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || nla_put_u32(skb, TIPC_NLA_SOCK_UID, - from_kuid_munged(sk_user_ns(NETLINK_CB(skb).sk), + from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk), sock_i_uid(sk))) || nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, tipc_diag_gen_cookie(sk), diff --git a/net/tipc/socket.h b/net/tipc/socket.h index aae3fd4cd06c..aff9b2ae5a1f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -61,8 +61,8 @@ int tipc_sk_rht_init(struct net *net); void tipc_sk_rht_destroy(struct net *net); int tipc_nl_sk_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb); -int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct tipc_sock *tsk, - u32 sk_filter_state, +int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, + struct tipc_sock *tsk, u32 sk_filter_state, u64 (*tipc_diag_gen_cookie)(struct sock *sk)); int tipc_nl_sk_walk(struct sk_buff *skb, struct netlink_callback *cb, int (*skb_handler)(struct sk_buff *skb, From 37e40fa8f62ba33021cb0050e38f87d7519ee447 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 6 Apr 2018 18:37:02 -0500 Subject: [PATCH 35/46] ibmvnic: Fix DMA mapping mistakes Fix some mistakes caught by the DMA debugger. The first change fixes a unnecessary unmap that should have been removed in an earlier update. The next hunk fixes another bad unmap by zeroing the bit checked to determine that an unmap is needed. The final change fixes some buffers that are unmapped with the wrong direction specified. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index b492af6affc3..58e01432c2aa 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -320,9 +320,6 @@ failure: dev_info(dev, "replenish pools failure\n"); pool->free_map[pool->next_free] = index; pool->rx_buff[index].skb = NULL; - if (!dma_mapping_error(dev, dma_addr)) - dma_unmap_single(dev, dma_addr, pool->buff_size, - DMA_FROM_DEVICE); dev_kfree_skb_any(skb); adapter->replenish_add_buff_failure++; @@ -2574,7 +2571,7 @@ static int ibmvnic_complete_tx(struct ibmvnic_adapter *adapter, union sub_crq *next; int index; int i, j; - u8 first; + u8 *first; restart_loop: while (pending_scrq(adapter, scrq)) { @@ -2605,11 +2602,12 @@ restart_loop: txbuff->data_dma[j] = 0; } /* if sub_crq was sent indirectly */ - first = txbuff->indir_arr[0].generic.first; - if (first == IBMVNIC_CRQ_CMD) { + first = &txbuff->indir_arr[0].generic.first; + if (*first == IBMVNIC_CRQ_CMD) { dma_unmap_single(dev, txbuff->indir_dma, sizeof(txbuff->indir_arr), DMA_TO_DEVICE); + *first = 0; } if (txbuff->last_frag) { @@ -3882,9 +3880,9 @@ static int handle_login_rsp(union ibmvnic_crq *login_rsp_crq, int i; dma_unmap_single(dev, adapter->login_buf_token, adapter->login_buf_sz, - DMA_BIDIRECTIONAL); + DMA_TO_DEVICE); dma_unmap_single(dev, adapter->login_rsp_buf_token, - adapter->login_rsp_buf_sz, DMA_BIDIRECTIONAL); + adapter->login_rsp_buf_sz, DMA_FROM_DEVICE); /* If the number of queues requested can't be allocated by the * server, the login response will return with code 1. We will need From 41f714672f93608751dbd2fa2291d476a8ff0150 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 6 Apr 2018 18:37:03 -0500 Subject: [PATCH 36/46] ibmvnic: Zero used TX descriptor counter on reset The counter that tracks used TX descriptors pending completion needs to be zeroed as part of a device reset. This change fixes a bug causing transmit queues to be stopped unnecessarily and in some cases a transmit queue stall and timeout reset. If the counter is not reset, the remaining descriptors will not be "removed", effectively reducing queue capacity. If the queue is over half full, it will cause the queue to stall if stopped. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 58e01432c2aa..153a868c5135 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -2361,6 +2361,7 @@ static int reset_one_sub_crq_queue(struct ibmvnic_adapter *adapter, } memset(scrq->msgs, 0, 4 * PAGE_SIZE); + atomic_set(&scrq->used, 0); scrq->cur = 0; rc = h_reg_sub_crq(adapter->vdev->unit_address, scrq->msg_token, From af894d239840908dfebb2215e13a713e63d2ffb0 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 6 Apr 2018 18:37:04 -0500 Subject: [PATCH 37/46] ibmvnic: Fix reset scheduler error handling In some cases, if the driver is waiting for a reset following a device parameter change, failure to schedule a reset can result in a hang since a completion signal is never sent. If the device configuration is being altered by a tool such as ethtool or ifconfig, it could cause the console to hang if the reset request does not get scheduled. Add some additional error handling code to exit the wait_for_completion if there is one in progress. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 39 ++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 153a868c5135..bbcd07a02694 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1875,23 +1875,25 @@ static void __ibmvnic_reset(struct work_struct *work) mutex_unlock(&adapter->reset_lock); } -static void ibmvnic_reset(struct ibmvnic_adapter *adapter, - enum ibmvnic_reset_reason reason) +static int ibmvnic_reset(struct ibmvnic_adapter *adapter, + enum ibmvnic_reset_reason reason) { struct ibmvnic_rwi *rwi, *tmp; struct net_device *netdev = adapter->netdev; struct list_head *entry; + int ret; if (adapter->state == VNIC_REMOVING || adapter->state == VNIC_REMOVED) { + ret = EBUSY; netdev_dbg(netdev, "Adapter removing, skipping reset\n"); - return; + goto err; } if (adapter->state == VNIC_PROBING) { netdev_warn(netdev, "Adapter reset during probe\n"); - adapter->init_done_rc = EAGAIN; - return; + ret = adapter->init_done_rc = EAGAIN; + goto err; } mutex_lock(&adapter->rwi_lock); @@ -1901,7 +1903,8 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter, if (tmp->reset_reason == reason) { netdev_dbg(netdev, "Skipping matching reset\n"); mutex_unlock(&adapter->rwi_lock); - return; + ret = EBUSY; + goto err; } } @@ -1909,7 +1912,8 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter, if (!rwi) { mutex_unlock(&adapter->rwi_lock); ibmvnic_close(netdev); - return; + ret = ENOMEM; + goto err; } rwi->reset_reason = reason; @@ -1918,6 +1922,12 @@ static void ibmvnic_reset(struct ibmvnic_adapter *adapter, netdev_dbg(adapter->netdev, "Scheduling reset (reason %d)\n", reason); schedule_work(&adapter->ibmvnic_reset); + + return 0; +err: + if (adapter->wait_for_reset) + adapter->wait_for_reset = false; + return -ret; } static void ibmvnic_tx_timeout(struct net_device *dev) @@ -2052,6 +2062,8 @@ static void ibmvnic_netpoll_controller(struct net_device *dev) static int wait_for_reset(struct ibmvnic_adapter *adapter) { + int rc, ret; + adapter->fallback.mtu = adapter->req_mtu; adapter->fallback.rx_queues = adapter->req_rx_queues; adapter->fallback.tx_queues = adapter->req_tx_queues; @@ -2059,11 +2071,15 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter) adapter->fallback.tx_entries = adapter->req_tx_entries_per_subcrq; init_completion(&adapter->reset_done); - ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); adapter->wait_for_reset = true; + rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); + if (rc) + return rc; wait_for_completion(&adapter->reset_done); + ret = 0; if (adapter->reset_done_rc) { + ret = -EIO; adapter->desired.mtu = adapter->fallback.mtu; adapter->desired.rx_queues = adapter->fallback.rx_queues; adapter->desired.tx_queues = adapter->fallback.tx_queues; @@ -2071,12 +2087,15 @@ static int wait_for_reset(struct ibmvnic_adapter *adapter) adapter->desired.tx_entries = adapter->fallback.tx_entries; init_completion(&adapter->reset_done); - ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); + adapter->wait_for_reset = true; + rc = ibmvnic_reset(adapter, VNIC_RESET_CHANGE_PARAM); + if (rc) + return ret; wait_for_completion(&adapter->reset_done); } adapter->wait_for_reset = false; - return adapter->reset_done_rc; + return ret; } static int ibmvnic_change_mtu(struct net_device *netdev, int new_mtu) From 5a18e1e0c193b2f6a8d4651f38aaabee58080647 Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Fri, 6 Apr 2018 18:37:05 -0500 Subject: [PATCH 38/46] ibmvnic: Fix failover case for non-redundant configuration There is a failover case for a non-redundant pseries VNIC configuration that was not being handled properly. The current implementation assumes that the driver will always have a redandant device to communicate with following a failover notification. There are cases, however, when a non-redundant configuration can receive a failover request. If that happens, the driver should wait until it receives a signal that the device is ready for operation. The driver is agnostic of its backing hardware configuration, so this fix necessarily affects all device failover management. The driver needs to wait until it receives a signal that the device is ready for resetting. A flag is introduced to track this intermediary state where the driver is waiting for an active device. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 37 +++++++++++++++++++++++------- drivers/net/ethernet/ibm/ibmvnic.h | 1 + 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index bbcd07a02694..151542e79884 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -325,10 +325,11 @@ failure: adapter->replenish_add_buff_failure++; atomic_add(buffers_added, &pool->available); - if (lpar_rc == H_CLOSED) { + if (lpar_rc == H_CLOSED || adapter->failover_pending) { /* Disable buffer pool replenishment and report carrier off if - * queue is closed. Firmware guarantees that a signal will - * be sent to the driver, triggering a reset. + * queue is closed or pending failover. + * Firmware guarantees that a signal will be sent to the + * driver, triggering a reset. */ deactivate_rx_pools(adapter); netif_carrier_off(adapter->netdev); @@ -1068,6 +1069,14 @@ static int ibmvnic_open(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; + /* If device failover is pending, just set device state and return. + * Device operation will be handled by reset routine. + */ + if (adapter->failover_pending) { + adapter->state = VNIC_OPEN; + return 0; + } + mutex_lock(&adapter->reset_lock); if (adapter->state != VNIC_CLOSED) { @@ -1225,6 +1234,14 @@ static int ibmvnic_close(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc; + /* If device failover is pending, just set device state and return. + * Device operation will be handled by reset routine. + */ + if (adapter->failover_pending) { + adapter->state = VNIC_CLOSED; + return 0; + } + mutex_lock(&adapter->reset_lock); rc = __ibmvnic_close(netdev); mutex_unlock(&adapter->reset_lock); @@ -1559,8 +1576,9 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) dev_kfree_skb_any(skb); tx_buff->skb = NULL; - if (lpar_rc == H_CLOSED) { - /* Disable TX and report carrier off if queue is closed. + if (lpar_rc == H_CLOSED || adapter->failover_pending) { + /* Disable TX and report carrier off if queue is closed + * or pending failover. * Firmware guarantees that a signal will be sent to the * driver, triggering a reset or some other action. */ @@ -1884,9 +1902,10 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter, int ret; if (adapter->state == VNIC_REMOVING || - adapter->state == VNIC_REMOVED) { + adapter->state == VNIC_REMOVED || + adapter->failover_pending) { ret = EBUSY; - netdev_dbg(netdev, "Adapter removing, skipping reset\n"); + netdev_dbg(netdev, "Adapter removing or pending failover, skipping reset\n"); goto err; } @@ -4162,7 +4181,9 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, case IBMVNIC_CRQ_INIT: dev_info(dev, "Partner initialized\n"); adapter->from_passive_init = true; + adapter->failover_pending = false; complete(&adapter->init_done); + ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); @@ -4179,7 +4200,7 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, ibmvnic_reset(adapter, VNIC_RESET_MOBILITY); } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) { dev_info(dev, "Backing device failover detected\n"); - ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); + adapter->failover_pending = true; } else { /* The adapter lost the connection */ dev_err(dev, "Virtual Adapter failed (rc=%d)\n", diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index 89efe700eafe..99c0b58c2c39 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -1108,6 +1108,7 @@ struct ibmvnic_adapter { bool napi_enabled, from_passive_init; bool mac_change_pending; + bool failover_pending; struct ibmvnic_tunables desired; struct ibmvnic_tunables fallback; From 30f796258c49baa313222456bcf5b0246da55ff1 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Fri, 6 Apr 2018 18:37:06 -0500 Subject: [PATCH 39/46] ibmvnic: Do not reset CRQ for Mobility driver resets When resetting the ibmvnic driver after a partition migration occurs there is no requirement to do a reset of the main CRQ. The current driver code does the required re-enable of the main CRQ, then does a reset of the main CRQ later. What we should be doing for a driver reset after a migration is to re-enable the main CRQ, release all the sub-CRQs, and then allocate new sub-CRQs after capability negotiation. This patch updates the handling of mobility resets to do the proper work and not reset the main CRQ. To do this the initialization/reset of the main CRQ had to be moved out of the ibmvnic_init routine and in to the ibmvnic_probe and do_reset routines. Signed-off-by: Nathan Fontenot Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 55 +++++++++++++++++------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 151542e79884..aad5658d79d5 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -118,6 +118,7 @@ static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); static int ibmvnic_init(struct ibmvnic_adapter *); static void release_crq_queue(struct ibmvnic_adapter *); static int __ibmvnic_set_mac(struct net_device *netdev, struct sockaddr *p); +static int init_crq_queue(struct ibmvnic_adapter *adapter); struct ibmvnic_stat { char name[ETH_GSTRING_LEN]; @@ -1224,7 +1225,6 @@ static int __ibmvnic_close(struct net_device *netdev) rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); if (rc) return rc; - ibmvnic_cleanup(netdev); adapter->state = VNIC_CLOSED; return 0; } @@ -1244,6 +1244,7 @@ static int ibmvnic_close(struct net_device *netdev) mutex_lock(&adapter->reset_lock); rc = __ibmvnic_close(netdev); + ibmvnic_cleanup(netdev); mutex_unlock(&adapter->reset_lock); return rc; @@ -1726,14 +1727,10 @@ static int do_reset(struct ibmvnic_adapter *adapter, old_num_rx_queues = adapter->req_rx_queues; old_num_tx_queues = adapter->req_tx_queues; - if (rwi->reset_reason == VNIC_RESET_MOBILITY) { - rc = ibmvnic_reenable_crq_queue(adapter); - if (rc) - return 0; - ibmvnic_cleanup(netdev); - } else if (rwi->reset_reason == VNIC_RESET_FAILOVER) { - ibmvnic_cleanup(netdev); - } else { + ibmvnic_cleanup(netdev); + + if (adapter->reset_reason != VNIC_RESET_MOBILITY && + adapter->reset_reason != VNIC_RESET_FAILOVER) { rc = __ibmvnic_close(netdev); if (rc) return rc; @@ -1752,6 +1749,23 @@ static int do_reset(struct ibmvnic_adapter *adapter, */ adapter->state = VNIC_PROBED; + if (adapter->wait_for_reset) { + rc = init_crq_queue(adapter); + } else if (adapter->reset_reason == VNIC_RESET_MOBILITY) { + rc = ibmvnic_reenable_crq_queue(adapter); + release_sub_crqs(adapter, 1); + } else { + rc = ibmvnic_reset_crq(adapter); + if (!rc) + rc = vio_enable_interrupts(adapter->vdev); + } + + if (rc) { + netdev_err(adapter->netdev, + "Couldn't initialize crq. rc=%d\n", rc); + return rc; + } + rc = ibmvnic_init(adapter); if (rc) return IBMVNIC_INIT_FAILED; @@ -4500,19 +4514,6 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter) u64 old_num_rx_queues, old_num_tx_queues; int rc; - if (adapter->resetting && !adapter->wait_for_reset) { - rc = ibmvnic_reset_crq(adapter); - if (!rc) - rc = vio_enable_interrupts(adapter->vdev); - } else { - rc = init_crq_queue(adapter); - } - - if (rc) { - dev_err(dev, "Couldn't initialize crq. rc=%d\n", rc); - return rc; - } - adapter->from_passive_init = false; old_num_rx_queues = adapter->req_rx_queues; @@ -4537,7 +4538,8 @@ static int ibmvnic_init(struct ibmvnic_adapter *adapter) return -1; } - if (adapter->resetting && !adapter->wait_for_reset) { + if (adapter->resetting && !adapter->wait_for_reset && + adapter->reset_reason != VNIC_RESET_MOBILITY) { if (adapter->req_rx_queues != old_num_rx_queues || adapter->req_tx_queues != old_num_tx_queues) { release_sub_crqs(adapter, 0); @@ -4625,6 +4627,13 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) adapter->mac_change_pending = false; do { + rc = init_crq_queue(adapter); + if (rc) { + dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n", + rc); + goto ibmvnic_init_fail; + } + rc = ibmvnic_init(adapter); if (rc && rc != EAGAIN) goto ibmvnic_init_fail; From 21481189e8ffa4016e398d46ac6d66fb0f23acc3 Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 6 Apr 2018 14:38:34 +0200 Subject: [PATCH 40/46] net/fsl_pq_mdio: Allow explicit speficition of TBIPA address This introduces a simpler and generic method for for finding (and mapping) the TBIPA register. Instead of relying of complicated logic for finding the TBIPA register address based on the MDIO or MII register block base address, which even in some cases relies on undocumented shadow registers, a second "reg" entry for the mdio bus devicetree node specifies the TBIPA register. Backwards compatibility is kept, as the existing logic is applied when only a single "reg" mapping is specified. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- .../devicetree/bindings/net/fsl-tsec-phy.txt | 6 ++- drivers/net/ethernet/freescale/fsl_pq_mdio.c | 50 +++++++++++++------ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt index 594982c6b9f9..79bf352e659c 100644 --- a/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt +++ b/Documentation/devicetree/bindings/net/fsl-tsec-phy.txt @@ -6,7 +6,11 @@ the definition of the PHY node in booting-without-of.txt for an example of how to define a PHY. Required properties: - - reg : Offset and length of the register set for the device + - reg : Offset and length of the register set for the device, and optionally + the offset and length of the TBIPA register (TBI PHY address + register). If TBIPA register is not specified, the driver will + attempt to infer it from the register set specified (your mileage may + vary). - compatible : Should define the compatible device type for the mdio. Currently supported strings/devices are: - "fsl,gianfar-tbi" diff --git a/drivers/net/ethernet/freescale/fsl_pq_mdio.c b/drivers/net/ethernet/freescale/fsl_pq_mdio.c index 80ad16acf0f1..ac2c3f6a12bc 100644 --- a/drivers/net/ethernet/freescale/fsl_pq_mdio.c +++ b/drivers/net/ethernet/freescale/fsl_pq_mdio.c @@ -377,6 +377,38 @@ static const struct of_device_id fsl_pq_mdio_match[] = { }; MODULE_DEVICE_TABLE(of, fsl_pq_mdio_match); +static void set_tbipa(const u32 tbipa_val, struct platform_device *pdev, + uint32_t __iomem * (*get_tbipa)(void __iomem *), + void __iomem *reg_map, struct resource *reg_res) +{ + struct device_node *np = pdev->dev.of_node; + uint32_t __iomem *tbipa; + bool tbipa_mapped; + + tbipa = of_iomap(np, 1); + if (tbipa) { + tbipa_mapped = true; + } else { + tbipa_mapped = false; + tbipa = (*get_tbipa)(reg_map); + + /* + * Add consistency check to make sure TBI is contained within + * the mapped range (not because we would get a segfault, + * rather to catch bugs in computing TBI address). Print error + * message but continue anyway. + */ + if ((void *)tbipa > reg_map + resource_size(reg_res) - 4) + dev_err(&pdev->dev, "invalid register map (should be at least 0x%04zx to contain TBI address)\n", + ((void *)tbipa - reg_map) + 4); + } + + iowrite32be(be32_to_cpu(tbipa_val), tbipa); + + if (tbipa_mapped) + iounmap(tbipa); +} + static int fsl_pq_mdio_probe(struct platform_device *pdev) { const struct of_device_id *id = @@ -450,8 +482,6 @@ static int fsl_pq_mdio_probe(struct platform_device *pdev) if (tbi) { const u32 *prop = of_get_property(tbi, "reg", NULL); - uint32_t __iomem *tbipa; - if (!prop) { dev_err(&pdev->dev, "missing 'reg' property in node %pOF\n", @@ -459,20 +489,8 @@ static int fsl_pq_mdio_probe(struct platform_device *pdev) err = -EBUSY; goto error; } - - tbipa = data->get_tbipa(priv->map); - - /* - * Add consistency check to make sure TBI is contained - * within the mapped range (not because we would get a - * segfault, rather to catch bugs in computing TBI - * address). Print error message but continue anyway. - */ - if ((void *)tbipa > priv->map + resource_size(&res) - 4) - dev_err(&pdev->dev, "invalid register map (should be at least 0x%04zx to contain TBI address)\n", - ((void *)tbipa - priv->map) + 4); - - iowrite32be(be32_to_cpup(prop), tbipa); + set_tbipa(*prop, pdev, + data->get_tbipa, priv->map, &res); } } From 5571196135abb6d51e01592812997403c136067c Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Fri, 6 Apr 2018 14:46:35 +0200 Subject: [PATCH 41/46] ARM: dts: ls1021a: Specify TBIPA register address The current (mildly evil) fsl_pq_mdio code uses an undocumented shadow of the TBIPA register on LS1021A, which happens to be read-only. Changing TBI PHY address therefore does not work on LS1021A. The real (and documented) address of the TBIPA registere lies in the eTSEC block and not in MDIO/MII, which is read/write, so using that fixes the problem. Signed-off-by: Esben Haabendal Signed-off-by: David S. Miller --- arch/arm/boot/dts/ls1021a.dtsi | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/ls1021a.dtsi b/arch/arm/boot/dts/ls1021a.dtsi index c31dad98f989..728e206009ea 100644 --- a/arch/arm/boot/dts/ls1021a.dtsi +++ b/arch/arm/boot/dts/ls1021a.dtsi @@ -587,7 +587,8 @@ device_type = "mdio"; #address-cells = <1>; #size-cells = <0>; - reg = <0x0 0x2d24000 0x0 0x4000>; + reg = <0x0 0x2d24000 0x0 0x4000>, + <0x0 0x2d10030 0x0 0x4>; }; ptp_clock@2d10e00 { From fc56be47da8cb111add373c36230b0139139898f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 5 Apr 2018 22:13:21 +0200 Subject: [PATCH 42/46] devlink: convert occ_get op to separate registration This resolves race during initialization where the resources with ops are registered before driver and the structures used by occ_get op is initialized. So keep occ_get callbacks registered only when all structs are initialized. The example flows, as it is in mlxsw: 1) driver load/asic probe: mlxsw_core -> mlxsw_sp_resources_register -> mlxsw_sp_kvdl_resources_register -> devlink_resource_register IDX mlxsw_spectrum -> mlxsw_sp_kvdl_init -> mlxsw_sp_kvdl_parts_init -> mlxsw_sp_kvdl_part_init -> devlink_resource_size_get IDX (to get the current setup size from devlink) -> devlink_resource_occ_get_register IDX (register current occupancy getter) 2) reload triggered by devlink command: -> mlxsw_devlink_core_bus_device_reload -> mlxsw_sp_fini -> mlxsw_sp_kvdl_fini -> devlink_resource_occ_get_unregister IDX (struct mlxsw_sp *mlxsw_sp is freed at this point, call to occ get which is using mlxsw_sp would cause use-after free) -> mlxsw_sp_init -> mlxsw_sp_kvdl_init -> mlxsw_sp_kvdl_parts_init -> mlxsw_sp_kvdl_part_init -> devlink_resource_size_get IDX (to get the current setup size from devlink) -> devlink_resource_occ_get_register IDX (register current occupancy getter) Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 24 +----- .../net/ethernet/mellanox/mlxsw/spectrum.h | 1 - .../ethernet/mellanox/mlxsw/spectrum_kvdl.c | 67 ++++++++++------- drivers/net/netdevsim/devlink.c | 65 ++++++++-------- include/net/devlink.h | 40 ++++++---- net/core/devlink.c | 74 ++++++++++++++++--- 6 files changed, 165 insertions(+), 106 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 53fffd09d133..ca38a30fbe91 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -3805,18 +3805,6 @@ static const struct mlxsw_config_profile mlxsw_sp_config_profile = { }, }; -static u64 mlxsw_sp_resource_kvd_linear_occ_get(struct devlink *devlink) -{ - struct mlxsw_core *mlxsw_core = devlink_priv(devlink); - struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); - - return mlxsw_sp_kvdl_occ_get(mlxsw_sp); -} - -static const struct devlink_resource_ops mlxsw_sp_resource_kvd_linear_ops = { - .occ_get = mlxsw_sp_resource_kvd_linear_occ_get, -}; - static void mlxsw_sp_resource_size_params_prepare(struct mlxsw_core *mlxsw_core, struct devlink_resource_size_params *kvd_size_params, @@ -3877,8 +3865,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) err = devlink_resource_register(devlink, MLXSW_SP_RESOURCE_NAME_KVD, kvd_size, MLXSW_SP_RESOURCE_KVD, DEVLINK_RESOURCE_ID_PARENT_TOP, - &kvd_size_params, - NULL); + &kvd_size_params); if (err) return err; @@ -3887,8 +3874,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) linear_size, MLXSW_SP_RESOURCE_KVD_LINEAR, MLXSW_SP_RESOURCE_KVD, - &linear_size_params, - &mlxsw_sp_resource_kvd_linear_ops); + &linear_size_params); if (err) return err; @@ -3905,8 +3891,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) double_size, MLXSW_SP_RESOURCE_KVD_HASH_DOUBLE, MLXSW_SP_RESOURCE_KVD, - &hash_double_size_params, - NULL); + &hash_double_size_params); if (err) return err; @@ -3915,8 +3900,7 @@ static int mlxsw_sp_resources_register(struct mlxsw_core *mlxsw_core) single_size, MLXSW_SP_RESOURCE_KVD_HASH_SINGLE, MLXSW_SP_RESOURCE_KVD, - &hash_single_size_params, - NULL); + &hash_single_size_params); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 82820ba43728..804d4d2c8031 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -442,7 +442,6 @@ void mlxsw_sp_kvdl_free(struct mlxsw_sp *mlxsw_sp, int entry_index); int mlxsw_sp_kvdl_alloc_size_query(struct mlxsw_sp *mlxsw_sp, unsigned int entry_count, unsigned int *p_alloc_size); -u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp); int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core); struct mlxsw_sp_acl_rule_info { diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c index 8796db44dcc3..fe4327f547d2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_kvdl.c @@ -315,8 +315,9 @@ static u64 mlxsw_sp_kvdl_part_occ(struct mlxsw_sp_kvdl_part *part) return occ; } -u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp) +static u64 mlxsw_sp_kvdl_occ_get(void *priv) { + const struct mlxsw_sp *mlxsw_sp = priv; u64 occ = 0; int i; @@ -326,48 +327,33 @@ u64 mlxsw_sp_kvdl_occ_get(const struct mlxsw_sp *mlxsw_sp) return occ; } -static u64 mlxsw_sp_kvdl_single_occ_get(struct devlink *devlink) +static u64 mlxsw_sp_kvdl_single_occ_get(void *priv) { - struct mlxsw_core *mlxsw_core = devlink_priv(devlink); - struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + const struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_kvdl_part *part; part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_SINGLE]; return mlxsw_sp_kvdl_part_occ(part); } -static u64 mlxsw_sp_kvdl_chunks_occ_get(struct devlink *devlink) +static u64 mlxsw_sp_kvdl_chunks_occ_get(void *priv) { - struct mlxsw_core *mlxsw_core = devlink_priv(devlink); - struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + const struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_kvdl_part *part; part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_CHUNKS]; return mlxsw_sp_kvdl_part_occ(part); } -static u64 mlxsw_sp_kvdl_large_chunks_occ_get(struct devlink *devlink) +static u64 mlxsw_sp_kvdl_large_chunks_occ_get(void *priv) { - struct mlxsw_core *mlxsw_core = devlink_priv(devlink); - struct mlxsw_sp *mlxsw_sp = mlxsw_core_driver_priv(mlxsw_core); + const struct mlxsw_sp *mlxsw_sp = priv; struct mlxsw_sp_kvdl_part *part; part = mlxsw_sp->kvdl->parts[MLXSW_SP_KVDL_PART_ID_LARGE_CHUNKS]; return mlxsw_sp_kvdl_part_occ(part); } -static const struct devlink_resource_ops mlxsw_sp_kvdl_single_ops = { - .occ_get = mlxsw_sp_kvdl_single_occ_get, -}; - -static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_ops = { - .occ_get = mlxsw_sp_kvdl_chunks_occ_get, -}; - -static const struct devlink_resource_ops mlxsw_sp_kvdl_chunks_large_ops = { - .occ_get = mlxsw_sp_kvdl_large_chunks_occ_get, -}; - int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core) { struct devlink *devlink = priv_to_devlink(mlxsw_core); @@ -386,8 +372,7 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core) MLXSW_SP_KVDL_SINGLE_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, MLXSW_SP_RESOURCE_KVD_LINEAR, - &size_params, - &mlxsw_sp_kvdl_single_ops); + &size_params); if (err) return err; @@ -398,8 +383,7 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core) MLXSW_SP_KVDL_CHUNKS_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, MLXSW_SP_RESOURCE_KVD_LINEAR, - &size_params, - &mlxsw_sp_kvdl_chunks_ops); + &size_params); if (err) return err; @@ -410,13 +394,13 @@ int mlxsw_sp_kvdl_resources_register(struct mlxsw_core *mlxsw_core) MLXSW_SP_KVDL_LARGE_CHUNKS_SIZE, MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, MLXSW_SP_RESOURCE_KVD_LINEAR, - &size_params, - &mlxsw_sp_kvdl_chunks_large_ops); + &size_params); return err; } int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp) { + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); struct mlxsw_sp_kvdl *kvdl; int err; @@ -429,6 +413,23 @@ int mlxsw_sp_kvdl_init(struct mlxsw_sp *mlxsw_sp) if (err) goto err_kvdl_parts_init; + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR, + mlxsw_sp_kvdl_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE, + mlxsw_sp_kvdl_single_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS, + mlxsw_sp_kvdl_chunks_occ_get, + mlxsw_sp); + devlink_resource_occ_get_register(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS, + mlxsw_sp_kvdl_large_chunks_occ_get, + mlxsw_sp); + return 0; err_kvdl_parts_init: @@ -438,6 +439,16 @@ err_kvdl_parts_init: void mlxsw_sp_kvdl_fini(struct mlxsw_sp *mlxsw_sp) { + struct devlink *devlink = priv_to_devlink(mlxsw_sp->core); + + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_LARGE_CHUNKS); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_CHUNKS); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR_SINGLE); + devlink_resource_occ_get_unregister(devlink, + MLXSW_SP_RESOURCE_KVD_LINEAR); mlxsw_sp_kvdl_parts_fini(mlxsw_sp); kfree(mlxsw_sp->kvdl); } diff --git a/drivers/net/netdevsim/devlink.c b/drivers/net/netdevsim/devlink.c index 1dba47936456..bef7db5d129a 100644 --- a/drivers/net/netdevsim/devlink.c +++ b/drivers/net/netdevsim/devlink.c @@ -30,52 +30,36 @@ static struct net *nsim_devlink_net(struct devlink *devlink) /* IPv4 */ -static u64 nsim_ipv4_fib_resource_occ_get(struct devlink *devlink) +static u64 nsim_ipv4_fib_resource_occ_get(void *priv) { - struct net *net = nsim_devlink_net(devlink); + struct net *net = priv; return nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB, false); } -static struct devlink_resource_ops nsim_ipv4_fib_res_ops = { - .occ_get = nsim_ipv4_fib_resource_occ_get, -}; - -static u64 nsim_ipv4_fib_rules_res_occ_get(struct devlink *devlink) +static u64 nsim_ipv4_fib_rules_res_occ_get(void *priv) { - struct net *net = nsim_devlink_net(devlink); + struct net *net = priv; return nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB_RULES, false); } -static struct devlink_resource_ops nsim_ipv4_fib_rules_res_ops = { - .occ_get = nsim_ipv4_fib_rules_res_occ_get, -}; - /* IPv6 */ -static u64 nsim_ipv6_fib_resource_occ_get(struct devlink *devlink) +static u64 nsim_ipv6_fib_resource_occ_get(void *priv) { - struct net *net = nsim_devlink_net(devlink); + struct net *net = priv; return nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB, false); } -static struct devlink_resource_ops nsim_ipv6_fib_res_ops = { - .occ_get = nsim_ipv6_fib_resource_occ_get, -}; - -static u64 nsim_ipv6_fib_rules_res_occ_get(struct devlink *devlink) +static u64 nsim_ipv6_fib_rules_res_occ_get(void *priv) { - struct net *net = nsim_devlink_net(devlink); + struct net *net = priv; return nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB_RULES, false); } -static struct devlink_resource_ops nsim_ipv6_fib_rules_res_ops = { - .occ_get = nsim_ipv6_fib_rules_res_occ_get, -}; - static int devlink_resources_register(struct devlink *devlink) { struct devlink_resource_size_params params = { @@ -91,7 +75,7 @@ static int devlink_resources_register(struct devlink *devlink) err = devlink_resource_register(devlink, "IPv4", (u64)-1, NSIM_RESOURCE_IPV4, DEVLINK_RESOURCE_ID_PARENT_TOP, - ¶ms, NULL); + ¶ms); if (err) { pr_err("Failed to register IPv4 top resource\n"); goto out; @@ -100,8 +84,7 @@ static int devlink_resources_register(struct devlink *devlink) n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB, true); err = devlink_resource_register(devlink, "fib", n, NSIM_RESOURCE_IPV4_FIB, - NSIM_RESOURCE_IPV4, - ¶ms, &nsim_ipv4_fib_res_ops); + NSIM_RESOURCE_IPV4, ¶ms); if (err) { pr_err("Failed to register IPv4 FIB resource\n"); return err; @@ -110,8 +93,7 @@ static int devlink_resources_register(struct devlink *devlink) n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV4_FIB_RULES, true); err = devlink_resource_register(devlink, "fib-rules", n, NSIM_RESOURCE_IPV4_FIB_RULES, - NSIM_RESOURCE_IPV4, - ¶ms, &nsim_ipv4_fib_rules_res_ops); + NSIM_RESOURCE_IPV4, ¶ms); if (err) { pr_err("Failed to register IPv4 FIB rules resource\n"); return err; @@ -121,7 +103,7 @@ static int devlink_resources_register(struct devlink *devlink) err = devlink_resource_register(devlink, "IPv6", (u64)-1, NSIM_RESOURCE_IPV6, DEVLINK_RESOURCE_ID_PARENT_TOP, - ¶ms, NULL); + ¶ms); if (err) { pr_err("Failed to register IPv6 top resource\n"); goto out; @@ -130,8 +112,7 @@ static int devlink_resources_register(struct devlink *devlink) n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB, true); err = devlink_resource_register(devlink, "fib", n, NSIM_RESOURCE_IPV6_FIB, - NSIM_RESOURCE_IPV6, - ¶ms, &nsim_ipv6_fib_res_ops); + NSIM_RESOURCE_IPV6, ¶ms); if (err) { pr_err("Failed to register IPv6 FIB resource\n"); return err; @@ -140,12 +121,28 @@ static int devlink_resources_register(struct devlink *devlink) n = nsim_fib_get_val(net, NSIM_RESOURCE_IPV6_FIB_RULES, true); err = devlink_resource_register(devlink, "fib-rules", n, NSIM_RESOURCE_IPV6_FIB_RULES, - NSIM_RESOURCE_IPV6, - ¶ms, &nsim_ipv6_fib_rules_res_ops); + NSIM_RESOURCE_IPV6, ¶ms); if (err) { pr_err("Failed to register IPv6 FIB rules resource\n"); return err; } + + devlink_resource_occ_get_register(devlink, + NSIM_RESOURCE_IPV4_FIB, + nsim_ipv4_fib_resource_occ_get, + net); + devlink_resource_occ_get_register(devlink, + NSIM_RESOURCE_IPV4_FIB_RULES, + nsim_ipv4_fib_rules_res_occ_get, + net); + devlink_resource_occ_get_register(devlink, + NSIM_RESOURCE_IPV6_FIB, + nsim_ipv6_fib_resource_occ_get, + net); + devlink_resource_occ_get_register(devlink, + NSIM_RESOURCE_IPV6_FIB_RULES, + nsim_ipv6_fib_rules_res_occ_get, + net); out: return err; } diff --git a/include/net/devlink.h b/include/net/devlink.h index e21d8cadd480..2e4f71e16e95 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -231,14 +231,6 @@ struct devlink_dpipe_headers { unsigned int headers_count; }; -/** - * struct devlink_resource_ops - resource ops - * @occ_get: get the occupied size - */ -struct devlink_resource_ops { - u64 (*occ_get)(struct devlink *devlink); -}; - /** * struct devlink_resource_size_params - resource's size parameters * @size_min: minimum size which can be set @@ -265,6 +257,8 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para size_params->unit = unit; } +typedef u64 devlink_resource_occ_get_t(void *priv); + /** * struct devlink_resource - devlink resource * @name: name of the resource @@ -277,7 +271,6 @@ devlink_resource_size_params_init(struct devlink_resource_size_params *size_para * @size_params: size parameters * @list: parent list * @resource_list: list of child resources - * @resource_ops: resource ops */ struct devlink_resource { const char *name; @@ -289,7 +282,8 @@ struct devlink_resource { struct devlink_resource_size_params size_params; struct list_head list; struct list_head resource_list; - const struct devlink_resource_ops *resource_ops; + devlink_resource_occ_get_t *occ_get; + void *occ_get_priv; }; #define DEVLINK_RESOURCE_ID_PARENT_TOP 0 @@ -409,8 +403,7 @@ int devlink_resource_register(struct devlink *devlink, u64 resource_size, u64 resource_id, u64 parent_resource_id, - const struct devlink_resource_size_params *size_params, - const struct devlink_resource_ops *resource_ops); + const struct devlink_resource_size_params *size_params); void devlink_resources_unregister(struct devlink *devlink, struct devlink_resource *resource); int devlink_resource_size_get(struct devlink *devlink, @@ -419,6 +412,12 @@ int devlink_resource_size_get(struct devlink *devlink, int devlink_dpipe_table_resource_set(struct devlink *devlink, const char *table_name, u64 resource_id, u64 resource_units); +void devlink_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv); +void devlink_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id); #else @@ -562,8 +561,7 @@ devlink_resource_register(struct devlink *devlink, u64 resource_size, u64 resource_id, u64 parent_resource_id, - const struct devlink_resource_size_params *size_params, - const struct devlink_resource_ops *resource_ops) + const struct devlink_resource_size_params *size_params) { return 0; } @@ -589,6 +587,20 @@ devlink_dpipe_table_resource_set(struct devlink *devlink, return -EOPNOTSUPP; } +static inline void +devlink_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ +} + +static inline void +devlink_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id) +{ +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 9236e421bd62..ad1317376798 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2405,6 +2405,16 @@ devlink_resource_size_params_put(struct devlink_resource *resource, return 0; } +static int devlink_resource_occ_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + if (!resource->occ_get) + return 0; + return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->occ_get(resource->occ_get_priv), + DEVLINK_ATTR_PAD); +} + static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, struct devlink_resource *resource) { @@ -2425,11 +2435,8 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, if (resource->size != resource->size_new) nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, resource->size_new, DEVLINK_ATTR_PAD); - if (resource->resource_ops && resource->resource_ops->occ_get) - if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, - resource->resource_ops->occ_get(devlink), - DEVLINK_ATTR_PAD)) - goto nla_put_failure; + if (devlink_resource_occ_put(resource, skb)) + goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) goto nla_put_failure; if (list_empty(&resource->resource_list)) @@ -3162,15 +3169,13 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); * @resource_id: resource's id * @parent_reosurce_id: resource's parent id * @size params: size parameters - * @resource_ops: resource ops */ int devlink_resource_register(struct devlink *devlink, const char *resource_name, u64 resource_size, u64 resource_id, u64 parent_resource_id, - const struct devlink_resource_size_params *size_params, - const struct devlink_resource_ops *resource_ops) + const struct devlink_resource_size_params *size_params) { struct devlink_resource *resource; struct list_head *resource_list; @@ -3213,7 +3218,6 @@ int devlink_resource_register(struct devlink *devlink, resource->size = resource_size; resource->size_new = resource_size; resource->id = resource_id; - resource->resource_ops = resource_ops; resource->size_valid = true; memcpy(&resource->size_params, size_params, sizeof(resource->size_params)); @@ -3315,6 +3319,58 @@ out: } EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); +/** + * devlink_resource_occ_get_register - register occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + * @occ_get: occupancy getter callback + * @occ_get_priv: occupancy getter callback priv + */ +void devlink_resource_occ_get_register(struct devlink *devlink, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + struct devlink_resource *resource; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (WARN_ON(!resource)) + goto out; + WARN_ON(resource->occ_get); + + resource->occ_get = occ_get; + resource->occ_get_priv = occ_get_priv; +out: + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register); + +/** + * devlink_resource_occ_get_unregister - unregister occupancy getter + * + * @devlink: devlink + * @resource_id: resource id + */ +void devlink_resource_occ_get_unregister(struct devlink *devlink, + u64 resource_id) +{ + struct devlink_resource *resource; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (WARN_ON(!resource)) + goto out; + WARN_ON(!resource->occ_get); + + resource->occ_get = NULL; + resource->occ_get_priv = NULL; +out: + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); From 76327a35caabd1a932e83d6a42b967aa08584e5d Mon Sep 17 00:00:00 2001 From: Esben Haabendal Date: Sun, 8 Apr 2018 22:17:01 +0200 Subject: [PATCH 43/46] dp83640: Ensure against premature access to PHY registers after reset The datasheet specifies a 3uS pause after performing a software reset. The default implementation of genphy_soft_reset() does not provide this, so implement soft_reset with the needed pause. Signed-off-by: Esben Haabendal Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/dp83640.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c index 654f42d00092..a6c87793d899 100644 --- a/drivers/net/phy/dp83640.c +++ b/drivers/net/phy/dp83640.c @@ -1207,6 +1207,23 @@ static void dp83640_remove(struct phy_device *phydev) kfree(dp83640); } +static int dp83640_soft_reset(struct phy_device *phydev) +{ + int ret; + + ret = genphy_soft_reset(phydev); + if (ret < 0) + return ret; + + /* From DP83640 datasheet: "Software driver code must wait 3 us + * following a software reset before allowing further serial MII + * operations with the DP83640." + */ + udelay(10); /* Taking udelay inaccuracy into account */ + + return 0; +} + static int dp83640_config_init(struct phy_device *phydev) { struct dp83640_private *dp83640 = phydev->priv; @@ -1501,6 +1518,7 @@ static struct phy_driver dp83640_driver = { .flags = PHY_HAS_INTERRUPT, .probe = dp83640_probe, .remove = dp83640_remove, + .soft_reset = dp83640_soft_reset, .config_init = dp83640_config_init, .ack_interrupt = dp83640_ack_interrupt, .config_intr = dp83640_config_intr, From b6a37e5e25414df4b8e9140a5c6f5ee0ec6f3b90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 9 Apr 2018 06:43:27 -0700 Subject: [PATCH 44/46] inetpeer: fix uninit-value in inet_getpeer syzbot/KMSAN reported that p->dtime was read while it was not yet initialized in : delta = (__u32)jiffies - p->dtime; if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; This is a false positive, because the inetpeer wont be erased from rb-tree if the refcount_dec_if_one(&p->refcnt) does not succeed. And this wont happen before first inet_putpeer() call for this inetpeer has been done, and ->dtime field is written exactly before the refcount_dec_and_test(&p->refcnt). The KMSAN report was : BUG: KMSAN: uninit-value in inet_peer_gc net/ipv4/inetpeer.c:163 [inline] BUG: KMSAN: uninit-value in inet_getpeer+0x1567/0x1e70 net/ipv4/inetpeer.c:228 CPU: 0 PID: 9494 Comm: syz-executor5 Not tainted 4.16.0+ #82 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:17 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:53 kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067 __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:676 inet_peer_gc net/ipv4/inetpeer.c:163 [inline] inet_getpeer+0x1567/0x1e70 net/ipv4/inetpeer.c:228 inet_getpeer_v4 include/net/inetpeer.h:110 [inline] icmpv4_xrlim_allow net/ipv4/icmp.c:330 [inline] icmp_send+0x2b44/0x3050 net/ipv4/icmp.c:725 ip_options_compile+0x237c/0x29f0 net/ipv4/ip_options.c:472 ip_rcv_options net/ipv4/ip_input.c:284 [inline] ip_rcv_finish+0xda8/0x16d0 net/ipv4/ip_input.c:365 NF_HOOK include/linux/netfilter.h:288 [inline] ip_rcv+0x119d/0x16f0 net/ipv4/ip_input.c:493 __netif_receive_skb_core+0x47cf/0x4a80 net/core/dev.c:4562 __netif_receive_skb net/core/dev.c:4627 [inline] netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701 netif_receive_skb+0x230/0x240 net/core/dev.c:4725 tun_rx_batched drivers/net/tun.c:1555 [inline] tun_get_user+0x6d88/0x7580 drivers/net/tun.c:1962 tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990 do_iter_readv_writev+0x7bb/0x970 include/linux/fs.h:1776 do_iter_write+0x30d/0xd40 fs/read_write.c:932 vfs_writev fs/read_write.c:977 [inline] do_writev+0x3c9/0x830 fs/read_write.c:1012 SYSC_writev+0x9b/0xb0 fs/read_write.c:1085 SyS_writev+0x56/0x80 fs/read_write.c:1082 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 RIP: 0033:0x455111 RSP: 002b:00007fae0365cba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 000000000000002e RCX: 0000000000455111 RDX: 0000000000000001 RSI: 00007fae0365cbf0 RDI: 00000000000000fc RBP: 0000000020000040 R08: 00000000000000fc R09: 0000000000000000 R10: 000000000000002e R11: 0000000000000293 R12: 00000000ffffffff R13: 0000000000000658 R14: 00000000006fc8e0 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314 kmem_cache_alloc+0xaab/0xb90 mm/slub.c:2756 inet_getpeer+0xed8/0x1e70 net/ipv4/inetpeer.c:210 inet_getpeer_v4 include/net/inetpeer.h:110 [inline] ip4_frag_init+0x4d1/0x740 net/ipv4/ip_fragment.c:153 inet_frag_alloc net/ipv4/inet_fragment.c:369 [inline] inet_frag_create net/ipv4/inet_fragment.c:385 [inline] inet_frag_find+0x7da/0x1610 net/ipv4/inet_fragment.c:418 ip_find net/ipv4/ip_fragment.c:275 [inline] ip_defrag+0x448/0x67a0 net/ipv4/ip_fragment.c:676 ip_check_defrag+0x775/0xda0 net/ipv4/ip_fragment.c:724 packet_rcv_fanout+0x2a8/0x8d0 net/packet/af_packet.c:1447 deliver_skb net/core/dev.c:1897 [inline] deliver_ptype_list_skb net/core/dev.c:1912 [inline] __netif_receive_skb_core+0x314a/0x4a80 net/core/dev.c:4545 __netif_receive_skb net/core/dev.c:4627 [inline] netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701 netif_receive_skb+0x230/0x240 net/core/dev.c:4725 tun_rx_batched drivers/net/tun.c:1555 [inline] tun_get_user+0x6d88/0x7580 drivers/net/tun.c:1962 tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990 do_iter_readv_writev+0x7bb/0x970 include/linux/fs.h:1776 do_iter_write+0x30d/0xd40 fs/read_write.c:932 vfs_writev fs/read_write.c:977 [inline] do_writev+0x3c9/0x830 fs/read_write.c:1012 SYSC_writev+0x9b/0xb0 fs/read_write.c:1085 SyS_writev+0x56/0x80 fs/read_write.c:1082 do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/inetpeer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 1f04bd91fc2e..d757b9642d0d 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -211,6 +211,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; + p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 2); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; From 9b5c4dfb2a6390bfb0166770d0d776ce0d3cdebd Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Mon, 9 Apr 2018 06:24:48 -0700 Subject: [PATCH 45/46] net: thunderx: rework mac addresses list to u64 array It is too expensive to pass u64 values via linked list, instead allocate array for them by overall number of mac addresses from netdev. This eventually removes multiple kmalloc() calls, aviod memory fragmentation and allow to put single null check on kmalloc return value in order to prevent a potential null pointer dereference. Addresses-Coverity-ID: 1467429 ("Dereference null return value") Fixes: 37c3347eb247 ("net: thunderx: add ndo_set_rx_mode callback implementation for VF") Reported-by: Dan Carpenter Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 7 +---- .../net/ethernet/cavium/thunder/nicvf_main.c | 28 +++++++------------ 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 5fc46c5a4f36..448d1fafc827 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -265,14 +265,9 @@ struct nicvf_drv_stats { struct cavium_ptp; -struct xcast_addr { - struct list_head list; - u64 addr; -}; - struct xcast_addr_list { - struct list_head list; int count; + u64 mc[]; }; struct nicvf_work { diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 1e9a31fef729..707db3304396 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1929,7 +1929,7 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg) work.work); struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work); union nic_mbx mbx = {}; - struct xcast_addr *xaddr, *next; + int idx; if (!vf_work) return; @@ -1956,16 +1956,10 @@ static void nicvf_set_rx_mode_task(struct work_struct *work_arg) /* check if we have any specific MACs to be added to PF DMAC filter */ if (vf_work->mc) { /* now go through kernel list of MACs and add them one by one */ - list_for_each_entry_safe(xaddr, next, - &vf_work->mc->list, list) { + for (idx = 0; idx < vf_work->mc->count; idx++) { mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; - mbx.xcast.data.mac = xaddr->addr; + mbx.xcast.data.mac = vf_work->mc->mc[idx]; nicvf_send_msg_to_pf(nic, &mbx); - - /* after receiving ACK from PF release memory */ - list_del(&xaddr->list); - kfree(xaddr); - vf_work->mc->count--; } kfree(vf_work->mc); } @@ -1996,17 +1990,15 @@ static void nicvf_set_rx_mode(struct net_device *netdev) mode |= BGX_XCAST_MCAST_FILTER; /* here we need to copy mc addrs */ if (netdev_mc_count(netdev)) { - struct xcast_addr *xaddr; - - mc_list = kmalloc(sizeof(*mc_list), GFP_ATOMIC); - INIT_LIST_HEAD(&mc_list->list); + mc_list = kmalloc(offsetof(typeof(*mc_list), + mc[netdev_mc_count(netdev)]), + GFP_ATOMIC); + if (unlikely(!mc_list)) + return; + mc_list->count = 0; netdev_hw_addr_list_for_each(ha, &netdev->mc) { - xaddr = kmalloc(sizeof(*xaddr), - GFP_ATOMIC); - xaddr->addr = + mc_list->mc[mc_list->count] = ether_addr_to_u64(ha->addr); - list_add_tail(&xaddr->list, - &mc_list->list); mc_list->count++; } } From a2ac99905f1ea8b15997a6ec39af69aa28a3653b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?haibinzhang=28=E5=BC=A0=E6=B5=B7=E6=96=8C=29?= Date: Mon, 9 Apr 2018 07:22:17 +0000 Subject: [PATCH 46/46] vhost-net: set packet weight of tx polling to 2 * vq size handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy polling udp packets with small length(e.g. 1byte udp payload), because setting VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet length. Ping-Latencies shown below were tested between two Virtual Machines using netperf (UDP_STREAM, len=1), and then another machine pinged the client: vq size=256 Packet-Weight Ping-Latencies(millisecond) min avg max Origin 3.319 18.489 57.303 64 1.643 2.021 2.552 128 1.825 2.600 3.224 256 1.997 2.710 4.295 512 1.860 3.171 4.631 1024 2.002 4.173 9.056 2048 2.257 5.650 9.688 4096 2.093 8.508 15.943 vq size=512 Packet-Weight Ping-Latencies(millisecond) min avg max Origin 6.537 29.177 66.245 64 2.798 3.614 4.403 128 2.861 3.820 4.775 256 3.008 4.018 4.807 512 3.254 4.523 5.824 1024 3.079 5.335 7.747 2048 3.944 8.201 12.762 4096 4.158 11.057 19.985 Seems pretty consistent, a small dip at 2 VQ sizes. Ring size is a hint from device about a burst size it can tolerate. Based on benchmarks, set the weight to 2 * vq size. To evaluate this change, another tests were done using netperf(RR, TX) between two machines with Intel(R) Xeon(R) Gold 6133 CPU @ 2.50GHz, and vq size was tweaked through qemu. Results shown below does not show obvious changes. vq size=256 TCP_RR vq size=512 TCP_RR size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% 1/ 1/ -7%/ -2% 1/ 1/ 0%/ -2% 1/ 4/ +1%/ 0% 1/ 4/ +1%/ 0% 1/ 8/ +1%/ -2% 1/ 8/ 0%/ +1% 64/ 1/ -6%/ 0% 64/ 1/ +7%/ +3% 64/ 4/ 0%/ +2% 64/ 4/ -1%/ +1% 64/ 8/ 0%/ 0% 64/ 8/ -1%/ -2% 256/ 1/ -3%/ -4% 256/ 1/ -4%/ -2% 256/ 4/ +3%/ +4% 256/ 4/ +1%/ +2% 256/ 8/ +2%/ 0% 256/ 8/ +1%/ -1% vq size=256 UDP_RR vq size=512 UDP_RR size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% 1/ 1/ -5%/ +1% 1/ 1/ -3%/ -2% 1/ 4/ +4%/ +1% 1/ 4/ -2%/ +2% 1/ 8/ -1%/ -1% 1/ 8/ -1%/ 0% 64/ 1/ -2%/ -3% 64/ 1/ +1%/ +1% 64/ 4/ -5%/ -1% 64/ 4/ +2%/ 0% 64/ 8/ 0%/ -1% 64/ 8/ -2%/ +1% 256/ 1/ +7%/ +1% 256/ 1/ -7%/ 0% 256/ 4/ +1%/ +1% 256/ 4/ -3%/ -4% 256/ 8/ +2%/ +2% 256/ 8/ +1%/ +1% vq size=256 TCP_STREAM vq size=512 TCP_STREAM size/sessions/+thu%/+normalize% size/sessions/+thu%/+normalize% 64/ 1/ 0%/ -3% 64/ 1/ 0%/ 0% 64/ 4/ +3%/ -1% 64/ 4/ -2%/ +4% 64/ 8/ +9%/ -4% 64/ 8/ -1%/ +2% 256/ 1/ +1%/ -4% 256/ 1/ +1%/ +1% 256/ 4/ -1%/ -1% 256/ 4/ -3%/ 0% 256/ 8/ +7%/ +5% 256/ 8/ -3%/ 0% 512/ 1/ +1%/ 0% 512/ 1/ -1%/ -1% 512/ 4/ +1%/ -1% 512/ 4/ 0%/ 0% 512/ 8/ +7%/ -5% 512/ 8/ +6%/ -1% 1024/ 1/ 0%/ -1% 1024/ 1/ 0%/ +1% 1024/ 4/ +3%/ 0% 1024/ 4/ +1%/ 0% 1024/ 8/ +8%/ +5% 1024/ 8/ -1%/ 0% 2048/ 1/ +2%/ +2% 2048/ 1/ -1%/ 0% 2048/ 4/ +1%/ 0% 2048/ 4/ 0%/ -1% 2048/ 8/ -2%/ 0% 2048/ 8/ 5%/ -1% 4096/ 1/ -2%/ 0% 4096/ 1/ -2%/ 0% 4096/ 4/ +2%/ 0% 4096/ 4/ 0%/ 0% 4096/ 8/ +9%/ -2% 4096/ 8/ -5%/ -1% Acked-by: Michael S. Tsirkin Signed-off-by: Haibin Zhang Signed-off-by: Yunfang Tai Signed-off-by: Lidong Chen Signed-off-by: David S. Miller --- drivers/vhost/net.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index edc6fec9ad84..986058a57917 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -44,6 +44,10 @@ MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;" * Using this limit prevents one virtqueue from starving others. */ #define VHOST_NET_WEIGHT 0x80000 +/* Max number of packets transferred before requeueing the job. + * Using this limit prevents one virtqueue from starving rx. */ +#define VHOST_NET_PKT_WEIGHT(vq) ((vq)->num * 2) + /* MAX number of TX used buffers for outstanding zerocopy */ #define VHOST_MAX_PEND 128 #define VHOST_GOODCOPY_LEN 256 @@ -473,6 +477,7 @@ static void handle_tx(struct vhost_net *net) struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); bool zcopy, zcopy_used; + int sent_pkts = 0; mutex_lock(&vq->mutex); sock = vq->private_data; @@ -580,7 +585,8 @@ static void handle_tx(struct vhost_net *net) else vhost_zerocopy_signal_used(net, vq); vhost_net_tx_packet(net); - if (unlikely(total_len >= VHOST_NET_WEIGHT)) { + if (unlikely(total_len >= VHOST_NET_WEIGHT) || + unlikely(++sent_pkts >= VHOST_NET_PKT_WEIGHT(vq))) { vhost_poll_queue(&vq->poll); break; }