From 1d49eb91e86e8c1c1614c72e3e958b6b7e2472a9 Mon Sep 17 00:00:00 2001 From: Ioanna Alifieraki Date: Mon, 15 Nov 2021 15:16:45 +0200 Subject: [PATCH 001/231] ipmi: Move remove_work to dedicated workqueue Currently when removing an ipmi_user the removal is deferred as a work on the system's workqueue. Although this guarantees the free operation will occur in non-atomic context, it can race with the ipmi_msghandler module removal (see [1]). In case a remove_user work is scheduled for removal and shortly after ipmi_msghandler module is removed we can end up in a situation where the module is removed first and when the work is executed the system crashes with: BUG: unable to handle page fault for address: ffffffffc05c3450 PF: supervisor instruction fetch in kernel mode PF: error_code(0x0010) - not-present page because the pages of the module are gone. In cleanup_ipmi() there is no easy way to detect if there are any pending works to flush them before removing the module. This patch creates a separate workqueue and schedules the remove_work works on it. When removing the module the workqueue is drained when destroyed to avoid the race. 
[1] https://bugs.launchpad.net/bugs/1950666 Cc: stable@vger.kernel.org # 5.1 Fixes: 3b9a907223d7 (ipmi: fix sleep-in-atomic in free_user at cleanup SRCU user->release_barrier) Signed-off-by: Ioanna Alifieraki Message-Id: <20211115131645.25116-1-ioanna-maria.alifieraki@canonical.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index deed355422f4..1ade72bfae0f 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -191,6 +191,8 @@ struct ipmi_user { struct work_struct remove_work; }; +struct workqueue_struct *remove_work_wq; + static struct ipmi_user *acquire_ipmi_user(struct ipmi_user *user, int *index) __acquires(user->release_barrier) { @@ -1297,7 +1299,7 @@ static void free_user(struct kref *ref) struct ipmi_user *user = container_of(ref, struct ipmi_user, refcount); /* SRCU cleanup must happen in task context. 
*/ - schedule_work(&user->remove_work); + queue_work(remove_work_wq, &user->remove_work); } static void _ipmi_destroy_user(struct ipmi_user *user) @@ -5383,6 +5385,13 @@ static int ipmi_init_msghandler(void) atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + remove_work_wq = create_singlethread_workqueue("ipmi-msghandler-remove-wq"); + if (!remove_work_wq) { + pr_err("unable to create ipmi-msghandler-remove-wq workqueue"); + rv = -ENOMEM; + goto out; + } + initialized = true; out: @@ -5408,6 +5417,8 @@ static void __exit cleanup_ipmi(void) int count; if (initialized) { + destroy_workqueue(remove_work_wq); + atomic_notifier_chain_unregister(&panic_notifier_list, &panic_block); From d3c45824ad65aebf765fcf51366d317a29538820 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 16 Nov 2021 09:55:01 -0500 Subject: [PATCH 002/231] NFSv42: Don't fail clone() unless the OP_CLONE operation failed The failure to retrieve post-op attributes has no bearing on whether or not the clone operation itself was successful. We must therefore ignore the return value of decode_getfattr() when looking at the success or failure of nfs4_xdr_dec_clone(). 
Fixes: 36022770de6c ("nfs42: add CLONE xdr functions") Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index c8bad735e4c1..271e5f92ed01 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1434,8 +1434,7 @@ static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp, status = decode_clone(xdr); if (status) goto out; - status = decode_getfattr(xdr, res->dst_fattr, res->server); - + decode_getfattr(xdr, res->dst_fattr, res->server); out: res->rpc_status = status; return status; From 93c2e5e0a9ecfc183ab1204e1ecaa7ee7eb2a61a Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 16 Nov 2021 08:49:24 -0500 Subject: [PATCH 003/231] NFS: Add a tracepoint to show the results of nfs_set_cache_invalid() This provides some insight into the client's invalidation behavior to show both when the client uses the helper, and the results of calling the helper which can vary depending on how the helper is called. 
Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 1 + fs/nfs/nfstrace.h | 1 + 2 files changed, 2 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index dd53704c3f40..fda530d5e764 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -219,6 +219,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) NFS_INO_DATA_INVAL_DEFER); else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; + trace_nfs_set_cache_invalid(inode, 0); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 21dac847f1e4..b3aee261801e 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -162,6 +162,7 @@ DEFINE_NFS_INODE_EVENT_DONE(nfs_writeback_inode_exit); DEFINE_NFS_INODE_EVENT(nfs_fsync_enter); DEFINE_NFS_INODE_EVENT_DONE(nfs_fsync_exit); DEFINE_NFS_INODE_EVENT(nfs_access_enter); +DEFINE_NFS_INODE_EVENT_DONE(nfs_set_cache_invalid); TRACE_EVENT(nfs_access_exit, TP_PROTO( From 3f015d89a47cd8855cd92f71fff770095bd885a1 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Tue, 16 Nov 2021 10:48:13 -0500 Subject: [PATCH 004/231] NFSv42: Fix pagecache invalidation after COPY/CLONE The mechanism in use to allow the client to see the results of COPY/CLONE is to drop those pages from the pagecache. This forces the client to read those pages once more from the server. However, truncate_pagecache_range() zeros out partial pages instead of dropping them. Let us instead use invalidate_inode_pages2_range() with full-page offsets to ensure the client properly sees the results of COPY/CLONE operations. 
Cc: # v4.7+ Fixes: 2e72448b07dc ("NFS: Add COPY nfs operation") Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- fs/nfs/nfs42proc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 08355b66e7cb..8b21ff1be717 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -289,7 +289,9 @@ static void nfs42_copy_dest_done(struct inode *inode, loff_t pos, loff_t len) loff_t newsize = pos + len; loff_t end = newsize - 1; - truncate_pagecache_range(inode, pos, end); + WARN_ON_ONCE(invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, end >> PAGE_SHIFT)); + spin_lock(&inode->i_lock); if (newsize > i_size_read(inode)) i_size_write(inode, newsize); From ea027cb2e1b59c76582af867b71d5c037fa6bb8e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 15 Nov 2021 16:30:40 -0500 Subject: [PATCH 005/231] NFSv4.1: handle NFS4ERR_NOSPC by CREATE_SESSION When the client receives ERR_NOSPC on reply to CREATE_SESSION it leads to a client hanging in nfs_wait_client_init_complete(). Instead, complete and fail the client initiation with an EIO error which allows for the mount command to fail instead of hanging. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ecc4594299d6..f63dfa01001c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1998,6 +1998,10 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) dprintk("%s: exit with error %d for server %s\n", __func__, -EPROTONOSUPPORT, clp->cl_hostname); return -EPROTONOSUPPORT; + case -ENOSPC: + if (clp->cl_cons_state == NFS_CS_SESSION_INITING) + nfs_mark_client_ready(clp, -EIO); + return -EIO; case -NFS4ERR_NOT_SAME: /* FixMe: implement recovery * in nfs4_exchange_id */ default: From 268bb03856ed6c8511c31d08de0148782f50822f Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Wed, 17 Nov 2021 10:26:30 -0300 Subject: [PATCH 006/231] sunrpc: fix header include guard in trace header rpcgss.h include protection was protecting against the define for rpcrdma.h. Signed-off-by: Thiago Rafael Becker Signed-off-by: Trond Myklebust --- include/trace/events/rpcgss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index 3ba63319af3c..c9048f3e471b 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -8,7 +8,7 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM rpcgss -#if !defined(_TRACE_RPCRDMA_H) || defined(TRACE_HEADER_MULTI_READ) +#if !defined(_TRACE_RPCGSS_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RPCGSS_H #include From 574c3c55e969096cea770eda3375ff35ccf91702 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Mon, 15 Nov 2021 13:17:04 -0800 Subject: [PATCH 007/231] KVM: x86/mmu: Fix TLB flush range when handling disconnected pt When recursively clearing out disconnected pts, the range based TLB flush in handle_removed_tdp_mmu_page uses the wrong starting GFN, resulting in the flush mostly missing the affected range. 
Fix this by using base_gfn for the flush. In response to feedback from David Matlack on the RFC version of this patch, also move a few definitions into the for loop in the function to prevent unintended references to them in the future. Fixes: a066e61f13cf ("KVM: x86/mmu: Factor out handling of removed page tables") CC: stable@vger.kernel.org Signed-off-by: Ben Gardon Message-Id: <20211115211704.2621644-1-bgardon@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index a54c3491af42..377a96718a2e 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt)); int level = sp->role.level; gfn_t base_gfn = sp->gfn; - u64 old_child_spte; - u64 *sptep; - gfn_t gfn; int i; trace_kvm_mmu_prepare_zap_page(sp); @@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, tdp_mmu_unlink_page(kvm, sp, shared); for (i = 0; i < PT64_ENT_PER_PAGE; i++) { - sptep = rcu_dereference(pt) + i; - gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 *sptep = rcu_dereference(pt) + i; + gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level); + u64 old_child_spte; if (shared) { /* @@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt, shared); } - kvm_flush_remote_tlbs_with_address(kvm, gfn, + kvm_flush_remote_tlbs_with_address(kvm, base_gfn, KVM_PAGES_PER_HPAGE(level + 1)); call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback); From bda44d844758c70c8dc1478e6fc9c25efa90c5a7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 00:25:02 +0000 Subject: [PATCH 008/231] KVM: Ensure local memslot copies operate on up-to-date arch-specific data When modifying memslots, snapshot the "old" memslot and 
copy it to the "new" memslot's arch data after (re)acquiring slots_arch_lock. x86 can change a memslot's arch data while memslot updates are in-progress so long as it holds slots_arch_lock, thus snapshotting a memslot without holding the lock can result in the consumption of stale data. Fixes: b10a038e84d1 ("KVM: mmu: Add slots_arch_lock for memslot arch fields") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20211104002531.1176691-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 47 ++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9646bb9112c1..2104fc29cdd2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1531,11 +1531,10 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, static int kvm_set_memslot(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot *old, struct kvm_memory_slot *new, int as_id, enum kvm_mr_change change) { - struct kvm_memory_slot *slot; + struct kvm_memory_slot *slot, old; struct kvm_memslots *slots; int r; @@ -1566,7 +1565,7 @@ static int kvm_set_memslot(struct kvm *kvm, * Note, the INVALID flag needs to be in the appropriate entry * in the freshly allocated memslots, not in @old or @new. */ - slot = id_to_memslot(slots, old->id); + slot = id_to_memslot(slots, new->id); slot->flags |= KVM_MEMSLOT_INVALID; /* @@ -1597,6 +1596,26 @@ static int kvm_set_memslot(struct kvm *kvm, kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id)); } + /* + * Make a full copy of the old memslot, the pointer will become stale + * when the memslots are re-sorted by update_memslots(), and the old + * memslot needs to be referenced after calling update_memslots(), e.g. + * to free its resources and for arch specific behavior. This needs to + * happen *after* (re)acquiring slots_arch_lock. 
+ */ + slot = id_to_memslot(slots, new->id); + if (slot) { + old = *slot; + } else { + WARN_ON_ONCE(change != KVM_MR_CREATE); + memset(&old, 0, sizeof(old)); + old.id = new->id; + old.as_id = as_id; + } + + /* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */ + memcpy(&new->arch, &old.arch, sizeof(old.arch)); + r = kvm_arch_prepare_memory_region(kvm, new, mem, change); if (r) goto out_slots; @@ -1604,14 +1623,18 @@ static int kvm_set_memslot(struct kvm *kvm, update_memslots(slots, new, change); slots = install_new_memslots(kvm, as_id, slots); - kvm_arch_commit_memory_region(kvm, mem, old, new, change); + kvm_arch_commit_memory_region(kvm, mem, &old, new, change); + + /* Free the old memslot's metadata. Note, this is the full copy!!! */ + if (change == KVM_MR_DELETE) + kvm_free_memslot(kvm, &old); kvfree(slots); return 0; out_slots: if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { - slot = id_to_memslot(slots, old->id); + slot = id_to_memslot(slots, new->id); slot->flags &= ~KVM_MEMSLOT_INVALID; slots = install_new_memslots(kvm, as_id, slots); } else { @@ -1626,7 +1649,6 @@ static int kvm_delete_memslot(struct kvm *kvm, struct kvm_memory_slot *old, int as_id) { struct kvm_memory_slot new; - int r; if (!old->npages) return -EINVAL; @@ -1639,12 +1661,7 @@ static int kvm_delete_memslot(struct kvm *kvm, */ new.as_id = as_id; - r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE); - if (r) - return r; - - kvm_free_memslot(kvm, old); - return 0; + return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE); } /* @@ -1718,7 +1735,6 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!old.npages) { change = KVM_MR_CREATE; new.dirty_bitmap = NULL; - memset(&new.arch, 0, sizeof(new.arch)); } else { /* Modify an existing slot. */ if ((new.userspace_addr != old.userspace_addr) || (new.npages != old.npages) || @@ -1732,9 +1748,8 @@ int __kvm_set_memory_region(struct kvm *kvm, else /* Nothing to change. 
*/ return 0; - /* Copy dirty_bitmap and arch from the current memslot. */ + /* Copy dirty_bitmap from the current memslot. */ new.dirty_bitmap = old.dirty_bitmap; - memcpy(&new.arch, &old.arch, sizeof(new.arch)); } if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { @@ -1760,7 +1775,7 @@ int __kvm_set_memory_region(struct kvm *kvm, bitmap_set(new.dirty_bitmap, 0, new.npages); } - r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change); + r = kvm_set_memslot(kvm, mem, &new, as_id, change); if (r) goto out_bitmap; From 6b285a5587506bae084cf9a3ed5aa491d623b91b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 00:25:03 +0000 Subject: [PATCH 009/231] KVM: Disallow user memslot with size that exceeds "unsigned long" Reject userspace memslots whose size exceeds the storage capacity of an "unsigned long". KVM's uAPI takes the size as u64 to support large slots on 64-bit hosts, but does not account for the size being truncated on 32-bit hosts in various flows. The access_ok() check on the userspace virtual address in particular casts the size to "unsigned long" and will check the wrong number of bytes. KVM doesn't actually support slots whose size doesn't fit in an "unsigned long", e.g. KVM's internal kvm_memory_slot.npages is an "unsigned long", not a "u64", and misc arch specific code follows that behavior. Fixes: fa3d315a4ce2 ("KVM: Validate userspace_addr of memslot when registered") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Maciej S. 
Szmigiero Message-Id: <20211104002531.1176691-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2104fc29cdd2..6c5083f2eb50 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1689,7 +1689,8 @@ int __kvm_set_memory_region(struct kvm *kvm, id = (u16)mem->slot; /* General sanity checks */ - if (mem->memory_size & (PAGE_SIZE - 1)) + if ((mem->memory_size & (PAGE_SIZE - 1)) || + (mem->memory_size != (unsigned long)mem->memory_size)) return -EINVAL; if (mem->guest_phys_addr & (PAGE_SIZE - 1)) return -EINVAL; From 9dba4d24cbb5524dd39ab1e08886373b17f07ff2 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 17 Nov 2021 08:16:17 +0100 Subject: [PATCH 010/231] x86/kvm: remove unused ack_notifier callbacks Commit f52447261bc8c2 ("KVM: irq ack notification") introduced an ack_notifier() callback in struct kvm_pic and in struct kvm_ioapic without using them anywhere. Remove those callbacks again. 
Signed-off-by: Juergen Gross Message-Id: <20211117071617.19504-1-jgross@suse.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.h | 1 - arch/x86/kvm/irq.h | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index e66e620c3bed..539333ac4b38 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -81,7 +81,6 @@ struct kvm_ioapic { unsigned long irq_states[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; - void (*ack_notifier)(void *opaque, int irq); spinlock_t lock; struct rtc_status rtc_status; struct delayed_work eoi_inject; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 650642b18d15..c2d7cfe82d00 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -56,7 +56,6 @@ struct kvm_pic { struct kvm_io_device dev_master; struct kvm_io_device dev_slave; struct kvm_io_device dev_elcr; - void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[PIC_NUM_PINS]; }; From c7785d85b6c6cc9f3d0f1a8cab128f4062b30abb Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 17 Nov 2021 17:20:39 +0800 Subject: [PATCH 011/231] KVM: x86/mmu: Skip tlb flush if it has been done in zap_gfn_range() If the flush parameter is set, zap_gfn_range() flushes the remote TLBs when it yields, so no further TLB flush is needed by the caller. Therefore, use the return value of zap_gfn_range() directly instead of ORing it into flush in kvm_unmap_gfn_range() and kvm_tdp_mmu_unmap_gfn_range(). 
Fixes: 3039bcc744980 ("KVM: Move x86's MMU notifier memslot walkers to generic code") Signed-off-by: Hou Wenlong Message-Id: <5e16546e228877a4d974f8c0e448a93d52c7a5a9.1637140154.git.houwenlong93@linux.alibaba.com> Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/tdp_mmu.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3be9beea838d..0a8436ea0090 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp); if (is_tdp_mmu_enabled(kvm)) - flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); return flush; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 377a96718a2e..1f8c9f783b78 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1032,8 +1032,8 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, struct kvm_mmu_page *root; for_each_tdp_mmu_root(kvm, root, range->slot->as_id) - flush |= zap_gfn_range(kvm, root, range->start, range->end, - range->may_block, flush, false); + flush = zap_gfn_range(kvm, root, range->start, range->end, + range->may_block, flush, false); return flush; } From 8ed716ca7dc91f058be0ba644a3048667a20db13 Mon Sep 17 00:00:00 2001 From: Hou Wenlong Date: Wed, 17 Nov 2021 17:20:40 +0800 Subject: [PATCH 012/231] KVM: x86/mmu: Pass parameter flush as false in kvm_tdp_mmu_zap_collapsible_sptes() Since the TLB flush has already been done for the legacy MMU before kvm_tdp_mmu_zap_collapsible_sptes() is called, the flush parameter should be false for kvm_tdp_mmu_zap_collapsible_sptes(). 
Fixes: e2209710ccc5d ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated") Signed-off-by: Hou Wenlong Message-Id: <21453a1d2533afb6e59fb6c729af89e771ff2e76.1637140154.git.houwenlong93@linux.alibaba.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0a8436ea0090..0c839ee1282c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5854,7 +5854,7 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { - bool flush = false; + bool flush; if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); @@ -5871,7 +5871,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush); + flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, false); if (flush) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); read_unlock(&kvm->mmu_lock); From 884c6cb3b7030f75c46e55b9e625d2372708c306 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:36:56 +0530 Subject: [PATCH 013/231] ASoC: tegra: Fix wrong value type in ADMAIF The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. 
Fixes: f74028e159bb ("ASoC: tegra: Add Tegra210 based ADMAIF driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-2-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_admaif.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/tegra/tegra210_admaif.c b/sound/soc/tegra/tegra210_admaif.c index bcccdf3ddc52..6febe80cfa6f 100644 --- a/sound/soc/tegra/tegra210_admaif.c +++ b/sound/soc/tegra/tegra210_admaif.c @@ -430,7 +430,7 @@ static int tegra_admaif_get_control(struct snd_kcontrol *kcontrol, struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); - long *uctl_val = &ucontrol->value.integer.value[0]; + unsigned int *uctl_val = &ucontrol->value.enumerated.item[0]; if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) *uctl_val = admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg]; @@ -450,7 +450,7 @@ static int tegra_admaif_put_control(struct snd_kcontrol *kcontrol, struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); - int value = ucontrol->value.integer.value[0]; + unsigned int value = ucontrol->value.enumerated.item[0]; if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg] = value; From 8a2c2fa0c5331445c801e9241f2bb4e0e2a895a8 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:36:57 +0530 Subject: [PATCH 014/231] ASoC: tegra: Fix wrong value type in I2S The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. 
Fixes: c0bfa98349d1 ("ASoC: tegra: Add Tegra210 based I2S driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-3-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_i2s.c | 42 +++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/sound/soc/tegra/tegra210_i2s.c b/sound/soc/tegra/tegra210_i2s.c index 45f31ccb49d8..5c304612769f 100644 --- a/sound/soc/tegra/tegra210_i2s.c +++ b/sound/soc/tegra/tegra210_i2s.c @@ -317,24 +317,27 @@ static int tegra210_i2s_get_control(struct snd_kcontrol *kcontrol, { struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); - long *uctl_val = &ucontrol->value.integer.value[0]; if (strstr(kcontrol->id.name, "Loopback")) - *uctl_val = i2s->loopback; + ucontrol->value.integer.value[0] = i2s->loopback; else if (strstr(kcontrol->id.name, "FSYNC Width")) - *uctl_val = i2s->fsync_width; + ucontrol->value.integer.value[0] = i2s->fsync_width; else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) - *uctl_val = i2s->stereo_to_mono[I2S_TX_PATH]; + ucontrol->value.enumerated.item[0] = + i2s->stereo_to_mono[I2S_TX_PATH]; else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) - *uctl_val = i2s->mono_to_stereo[I2S_TX_PATH]; + ucontrol->value.enumerated.item[0] = + i2s->mono_to_stereo[I2S_TX_PATH]; else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) - *uctl_val = i2s->stereo_to_mono[I2S_RX_PATH]; + ucontrol->value.enumerated.item[0] = + i2s->stereo_to_mono[I2S_RX_PATH]; else if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) - *uctl_val = i2s->mono_to_stereo[I2S_RX_PATH]; + ucontrol->value.enumerated.item[0] = + i2s->mono_to_stereo[I2S_RX_PATH]; else if (strstr(kcontrol->id.name, "Playback FIFO Threshold")) - *uctl_val = i2s->rx_fifo_th; + ucontrol->value.integer.value[0] = 
i2s->rx_fifo_th; else if (strstr(kcontrol->id.name, "BCLK Ratio")) - *uctl_val = i2s->bclk_ratio; + ucontrol->value.integer.value[0] = i2s->bclk_ratio; return 0; } @@ -344,10 +347,9 @@ static int tegra210_i2s_put_control(struct snd_kcontrol *kcontrol, { struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); - int value = ucontrol->value.integer.value[0]; if (strstr(kcontrol->id.name, "Loopback")) { - i2s->loopback = value; + i2s->loopback = ucontrol->value.integer.value[0]; regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, I2S_CTRL_LPBK_MASK, @@ -362,24 +364,28 @@ static int tegra210_i2s_put_control(struct snd_kcontrol *kcontrol, * cases mixer control is used to update custom values. A value * of "N" here means, width is "N + 1" bit clock wide. */ - i2s->fsync_width = value; + i2s->fsync_width = ucontrol->value.integer.value[0]; regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, I2S_CTRL_FSYNC_WIDTH_MASK, i2s->fsync_width << I2S_FSYNC_WIDTH_SHIFT); } else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) { - i2s->stereo_to_mono[I2S_TX_PATH] = value; + i2s->stereo_to_mono[I2S_TX_PATH] = + ucontrol->value.enumerated.item[0]; } else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) { - i2s->mono_to_stereo[I2S_TX_PATH] = value; + i2s->mono_to_stereo[I2S_TX_PATH] = + ucontrol->value.enumerated.item[0]; } else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) { - i2s->stereo_to_mono[I2S_RX_PATH] = value; + i2s->stereo_to_mono[I2S_RX_PATH] = + ucontrol->value.enumerated.item[0]; } else if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) { - i2s->mono_to_stereo[I2S_RX_PATH] = value; + i2s->mono_to_stereo[I2S_RX_PATH] = + ucontrol->value.enumerated.item[0]; } else if (strstr(kcontrol->id.name, "Playback FIFO Threshold")) { - i2s->rx_fifo_th = value; + i2s->rx_fifo_th = ucontrol->value.integer.value[0]; } else if (strstr(kcontrol->id.name, "BCLK Ratio")) { - 
i2s->bclk_ratio = value; + i2s->bclk_ratio = ucontrol->value.integer.value[0]; } return 0; From 559d234569a998a4004de1bd1f12da5487fb826e Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:36:58 +0530 Subject: [PATCH 015/231] ASoC: tegra: Fix wrong value type in DMIC The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. Fixes: 8c8ff982e9e2 ("ASoC: tegra: Add Tegra210 based DMIC driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-4-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_dmic.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sound/soc/tegra/tegra210_dmic.c b/sound/soc/tegra/tegra210_dmic.c index b096478cd2ef..ee2aedb0440f 100644 --- a/sound/soc/tegra/tegra210_dmic.c +++ b/sound/soc/tegra/tegra210_dmic.c @@ -165,15 +165,15 @@ static int tegra210_dmic_get_control(struct snd_kcontrol *kcontrol, if (strstr(kcontrol->id.name, "Boost Gain Volume")) ucontrol->value.integer.value[0] = dmic->boost_gain; else if (strstr(kcontrol->id.name, "Channel Select")) - ucontrol->value.integer.value[0] = dmic->ch_select; + ucontrol->value.enumerated.item[0] = dmic->ch_select; else if (strstr(kcontrol->id.name, "Mono To Stereo")) - ucontrol->value.integer.value[0] = dmic->mono_to_stereo; + ucontrol->value.enumerated.item[0] = dmic->mono_to_stereo; else if (strstr(kcontrol->id.name, "Stereo To Mono")) - ucontrol->value.integer.value[0] = dmic->stereo_to_mono; + ucontrol->value.enumerated.item[0] = dmic->stereo_to_mono; else if (strstr(kcontrol->id.name, "OSR Value")) - ucontrol->value.integer.value[0] = dmic->osr_val; + ucontrol->value.enumerated.item[0] = dmic->osr_val; else if (strstr(kcontrol->id.name, "LR Polarity Select")) - ucontrol->value.integer.value[0] = dmic->lrsel; + ucontrol->value.enumerated.item[0] = dmic->lrsel; return 
0; } @@ -183,20 +183,19 @@ static int tegra210_dmic_put_control(struct snd_kcontrol *kcontrol, { struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); - int value = ucontrol->value.integer.value[0]; if (strstr(kcontrol->id.name, "Boost Gain Volume")) - dmic->boost_gain = value; + dmic->boost_gain = ucontrol->value.integer.value[0]; else if (strstr(kcontrol->id.name, "Channel Select")) - dmic->ch_select = ucontrol->value.integer.value[0]; + dmic->ch_select = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Mono To Stereo")) - dmic->mono_to_stereo = value; + dmic->mono_to_stereo = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Stereo To Mono")) - dmic->stereo_to_mono = value; + dmic->stereo_to_mono = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "OSR Value")) - dmic->osr_val = value; + dmic->osr_val = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "LR Polarity Select")) - dmic->lrsel = value; + dmic->lrsel = ucontrol->value.enumerated.item[0]; return 0; } From 3aa0d5c8bb3f5ef622ec2764823f551a1f630711 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:36:59 +0530 Subject: [PATCH 016/231] ASoC: tegra: Fix wrong value type in DSPK The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. 
Fixes: 327ef6470266 ("ASoC: tegra: Add Tegra186 based DSPK driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-5-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra186_dspk.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sound/soc/tegra/tegra186_dspk.c b/sound/soc/tegra/tegra186_dspk.c index 8ee9a77bd83d..67269e77d6e8 100644 --- a/sound/soc/tegra/tegra186_dspk.c +++ b/sound/soc/tegra/tegra186_dspk.c @@ -35,15 +35,15 @@ static int tegra186_dspk_get_control(struct snd_kcontrol *kcontrol, if (strstr(kcontrol->id.name, "FIFO Threshold")) ucontrol->value.integer.value[0] = dspk->rx_fifo_th; else if (strstr(kcontrol->id.name, "OSR Value")) - ucontrol->value.integer.value[0] = dspk->osr_val; + ucontrol->value.enumerated.item[0] = dspk->osr_val; else if (strstr(kcontrol->id.name, "LR Polarity Select")) - ucontrol->value.integer.value[0] = dspk->lrsel; + ucontrol->value.enumerated.item[0] = dspk->lrsel; else if (strstr(kcontrol->id.name, "Channel Select")) - ucontrol->value.integer.value[0] = dspk->ch_sel; + ucontrol->value.enumerated.item[0] = dspk->ch_sel; else if (strstr(kcontrol->id.name, "Mono To Stereo")) - ucontrol->value.integer.value[0] = dspk->mono_to_stereo; + ucontrol->value.enumerated.item[0] = dspk->mono_to_stereo; else if (strstr(kcontrol->id.name, "Stereo To Mono")) - ucontrol->value.integer.value[0] = dspk->stereo_to_mono; + ucontrol->value.enumerated.item[0] = dspk->stereo_to_mono; return 0; } @@ -53,20 +53,19 @@ static int tegra186_dspk_put_control(struct snd_kcontrol *kcontrol, { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); - int val = ucontrol->value.integer.value[0]; if (strstr(kcontrol->id.name, "FIFO Threshold")) - dspk->rx_fifo_th = val; + dspk->rx_fifo_th = ucontrol->value.integer.value[0]; else 
if (strstr(kcontrol->id.name, "OSR Value")) - dspk->osr_val = val; + dspk->osr_val = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "LR Polarity Select")) - dspk->lrsel = val; + dspk->lrsel = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Channel Select")) - dspk->ch_sel = val; + dspk->ch_sel = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Mono To Stereo")) - dspk->mono_to_stereo = val; + dspk->mono_to_stereo = ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Stereo To Mono")) - dspk->stereo_to_mono = val; + dspk->stereo_to_mono = ucontrol->value.enumerated.item[0]; return 0; } From 42afca1a65661935cdd54d2e0c5d0cc2426db7af Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:00 +0530 Subject: [PATCH 017/231] ASoC: tegra: Fix wrong value type in SFC The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. Fixes: b2f74ec53a6c ("ASoC: tegra: Add Tegra210 based SFC driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-6-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_sfc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sound/soc/tegra/tegra210_sfc.c b/sound/soc/tegra/tegra210_sfc.c index dc477ee1b82c..cb592ef55bd3 100644 --- a/sound/soc/tegra/tegra210_sfc.c +++ b/sound/soc/tegra/tegra210_sfc.c @@ -3251,16 +3251,16 @@ static int tegra210_sfc_get_control(struct snd_kcontrol *kcontrol, struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); if (strstr(kcontrol->id.name, "Input Stereo To Mono")) - ucontrol->value.integer.value[0] = + ucontrol->value.enumerated.item[0] = sfc->stereo_to_mono[SFC_RX_PATH]; else if (strstr(kcontrol->id.name, "Input Mono To Stereo")) - ucontrol->value.integer.value[0] = + 
ucontrol->value.enumerated.item[0] = sfc->mono_to_stereo[SFC_RX_PATH]; else if (strstr(kcontrol->id.name, "Output Stereo To Mono")) - ucontrol->value.integer.value[0] = + ucontrol->value.enumerated.item[0] = sfc->stereo_to_mono[SFC_TX_PATH]; else if (strstr(kcontrol->id.name, "Output Mono To Stereo")) - ucontrol->value.integer.value[0] = + ucontrol->value.enumerated.item[0] = sfc->mono_to_stereo[SFC_TX_PATH]; return 0; @@ -3271,16 +3271,19 @@ static int tegra210_sfc_put_control(struct snd_kcontrol *kcontrol, { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); - int value = ucontrol->value.integer.value[0]; if (strstr(kcontrol->id.name, "Input Stereo To Mono")) - sfc->stereo_to_mono[SFC_RX_PATH] = value; + sfc->stereo_to_mono[SFC_RX_PATH] = + ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Input Mono To Stereo")) - sfc->mono_to_stereo[SFC_RX_PATH] = value; + sfc->mono_to_stereo[SFC_RX_PATH] = + ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Output Stereo To Mono")) - sfc->stereo_to_mono[SFC_TX_PATH] = value; + sfc->stereo_to_mono[SFC_TX_PATH] = + ucontrol->value.enumerated.item[0]; else if (strstr(kcontrol->id.name, "Output Mono To Stereo")) - sfc->mono_to_stereo[SFC_TX_PATH] = value; + sfc->mono_to_stereo[SFC_TX_PATH] = + ucontrol->value.enumerated.item[0]; else return 0; From 6762965d0214df474e3a58e1d4d3ab004c5da0ea Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:01 +0530 Subject: [PATCH 018/231] ASoC: tegra: Fix wrong value type in MVC The enum controls are expected to use enumerated value type. Update relevant references in control get/put callbacks. 
Fixes: e539891f9687 ("ASoC: tegra: Add Tegra210 based MVC driver") Suggested-by: Takashi Iwai Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-7-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_mvc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sound/soc/tegra/tegra210_mvc.c b/sound/soc/tegra/tegra210_mvc.c index 7b9c7006e419..b7e317065251 100644 --- a/sound/soc/tegra/tegra210_mvc.c +++ b/sound/soc/tegra/tegra210_mvc.c @@ -275,7 +275,7 @@ static int tegra210_mvc_get_curve_type(struct snd_kcontrol *kcontrol, struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_mvc *mvc = snd_soc_component_get_drvdata(cmpnt); - ucontrol->value.integer.value[0] = mvc->curve_type; + ucontrol->value.enumerated.item[0] = mvc->curve_type; return 0; } @@ -285,7 +285,7 @@ static int tegra210_mvc_put_curve_type(struct snd_kcontrol *kcontrol, { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_mvc *mvc = snd_soc_component_get_drvdata(cmpnt); - int value; + unsigned int value; regmap_read(mvc->regmap, TEGRA210_MVC_ENABLE, &value); if (value & TEGRA210_MVC_EN) { @@ -294,10 +294,10 @@ static int tegra210_mvc_put_curve_type(struct snd_kcontrol *kcontrol, return -EINVAL; } - if (mvc->curve_type == ucontrol->value.integer.value[0]) + if (mvc->curve_type == ucontrol->value.enumerated.item[0]) return 0; - mvc->curve_type = ucontrol->value.integer.value[0]; + mvc->curve_type = ucontrol->value.enumerated.item[0]; tegra210_mvc_reset_vol_settings(mvc, cmpnt->dev); From e2b87a18a60c02d0dcd1de801d669587e516cc4d Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:02 +0530 Subject: [PATCH 019/231] ASoC: tegra: Fix kcontrol put callback in ADMAIF The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. 
This would ensure that change notifications are sent to subscribed applications. Update the ADMAIF driver accordingly. Fixes: f74028e159bb ("ASoC: tegra: Add Tegra210 based ADMAIF driver") Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-8-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_admaif.c | 142 +++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 31 deletions(-) diff --git a/sound/soc/tegra/tegra210_admaif.c b/sound/soc/tegra/tegra210_admaif.c index 6febe80cfa6f..1a2e868a6220 100644 --- a/sound/soc/tegra/tegra210_admaif.c +++ b/sound/soc/tegra/tegra210_admaif.c @@ -424,46 +424,122 @@ static const struct snd_soc_dai_ops tegra_admaif_dai_ops = { .trigger = tegra_admaif_trigger, }; -static int tegra_admaif_get_control(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) +static int tegra210_admaif_pget_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); - struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); - unsigned int *uctl_val = &ucontrol->value.enumerated.item[0]; + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; - if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) - *uctl_val = admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg]; - else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) - *uctl_val = admaif->mono_to_stereo[ADMAIF_RX_PATH][ec->reg]; - else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) - *uctl_val = admaif->stereo_to_mono[ADMAIF_TX_PATH][ec->reg]; - else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) - *uctl_val = admaif->stereo_to_mono[ADMAIF_RX_PATH][ec->reg]; + ucontrol->value.enumerated.item[0] = + 
admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg]; return 0; } -static int tegra_admaif_put_control(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) +static int tegra210_admaif_pput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); - struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; unsigned int value = ucontrol->value.enumerated.item[0]; - if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) - admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg] = value; - else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) - admaif->mono_to_stereo[ADMAIF_RX_PATH][ec->reg] = value; - else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) - admaif->stereo_to_mono[ADMAIF_TX_PATH][ec->reg] = value; - else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) - admaif->stereo_to_mono[ADMAIF_RX_PATH][ec->reg] = value; + if (value == admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg]) + return 0; + + admaif->mono_to_stereo[ADMAIF_TX_PATH][ec->reg] = value; + + return 1; +} + +static int tegra210_admaif_cget_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; + + ucontrol->value.enumerated.item[0] = + admaif->mono_to_stereo[ADMAIF_RX_PATH][ec->reg]; return 0; } +static int tegra210_admaif_cput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct 
soc_enum *)kcontrol->private_value; + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == admaif->mono_to_stereo[ADMAIF_RX_PATH][ec->reg]) + return 0; + + admaif->mono_to_stereo[ADMAIF_RX_PATH][ec->reg] = value; + + return 1; +} + +static int tegra210_admaif_pget_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; + + ucontrol->value.enumerated.item[0] = + admaif->stereo_to_mono[ADMAIF_TX_PATH][ec->reg]; + + return 0; +} + +static int tegra210_admaif_pput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == admaif->stereo_to_mono[ADMAIF_TX_PATH][ec->reg]) + return 0; + + admaif->stereo_to_mono[ADMAIF_TX_PATH][ec->reg] = value; + + return 1; +} + +static int tegra210_admaif_cget_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum *)kcontrol->private_value; + + ucontrol->value.enumerated.item[0] = + admaif->stereo_to_mono[ADMAIF_RX_PATH][ec->reg]; + + return 0; +} + +static int tegra210_admaif_cput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra_admaif *admaif = snd_soc_component_get_drvdata(cmpnt); + struct soc_enum *ec = (struct soc_enum 
*)kcontrol->private_value; + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == admaif->stereo_to_mono[ADMAIF_RX_PATH][ec->reg]) + return 0; + + admaif->stereo_to_mono[ADMAIF_RX_PATH][ec->reg] = value; + + return 1; +} + static int tegra_admaif_dai_probe(struct snd_soc_dai *dai) { struct tegra_admaif *admaif = snd_soc_dai_get_drvdata(dai); @@ -559,17 +635,21 @@ static const char * const tegra_admaif_mono_conv_text[] = { } #define TEGRA_ADMAIF_CIF_CTRL(reg) \ - NV_SOC_ENUM_EXT("ADMAIF" #reg " Playback Mono To Stereo", reg - 1,\ - tegra_admaif_get_control, tegra_admaif_put_control, \ + NV_SOC_ENUM_EXT("ADMAIF" #reg " Playback Mono To Stereo", reg - 1, \ + tegra210_admaif_pget_mono_to_stereo, \ + tegra210_admaif_pput_mono_to_stereo, \ tegra_admaif_mono_conv_text), \ - NV_SOC_ENUM_EXT("ADMAIF" #reg " Playback Stereo To Mono", reg - 1,\ - tegra_admaif_get_control, tegra_admaif_put_control, \ + NV_SOC_ENUM_EXT("ADMAIF" #reg " Playback Stereo To Mono", reg - 1, \ + tegra210_admaif_pget_stereo_to_mono, \ + tegra210_admaif_pput_stereo_to_mono, \ tegra_admaif_stereo_conv_text), \ - NV_SOC_ENUM_EXT("ADMAIF" #reg " Capture Mono To Stereo", reg - 1, \ - tegra_admaif_get_control, tegra_admaif_put_control, \ + NV_SOC_ENUM_EXT("ADMAIF" #reg " Capture Mono To Stereo", reg - 1, \ + tegra210_admaif_cget_mono_to_stereo, \ + tegra210_admaif_cput_mono_to_stereo, \ tegra_admaif_mono_conv_text), \ - NV_SOC_ENUM_EXT("ADMAIF" #reg " Capture Stereo To Mono", reg - 1, \ - tegra_admaif_get_control, tegra_admaif_put_control, \ + NV_SOC_ENUM_EXT("ADMAIF" #reg " Capture Stereo To Mono", reg - 1, \ + tegra210_admaif_cget_stereo_to_mono, \ + tegra210_admaif_cput_stereo_to_mono, \ tegra_admaif_stereo_conv_text) static struct snd_kcontrol_new tegra210_admaif_controls[] = { From f21a9df3f7cb0005947679d7b9237c90574e229a Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:03 +0530 Subject: [PATCH 020/231] ASoC: tegra: Fix kcontrol put callback in I2S The 
kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Update the I2S driver accordingly. Fixes: c0bfa98349d1 ("ASoC: tegra: Add Tegra210 based I2S driver") Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-9-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_i2s.c | 330 ++++++++++++++++++++++++--------- 1 file changed, 240 insertions(+), 90 deletions(-) diff --git a/sound/soc/tegra/tegra210_i2s.c b/sound/soc/tegra/tegra210_i2s.c index 5c304612769f..9552bbb939dd 100644 --- a/sound/soc/tegra/tegra210_i2s.c +++ b/sound/soc/tegra/tegra210_i2s.c @@ -302,6 +302,229 @@ static int tegra210_i2s_set_tdm_slot(struct snd_soc_dai *dai, return 0; } +static int tegra210_i2s_get_loopback(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.integer.value[0] = i2s->loopback; + + return 0; +} + +static int tegra210_i2s_put_loopback(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + int value = ucontrol->value.integer.value[0]; + + if (value == i2s->loopback) + return 0; + + i2s->loopback = value; + + regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, I2S_CTRL_LPBK_MASK, + i2s->loopback << I2S_CTRL_LPBK_SHIFT); + + return 1; +} + +static int tegra210_i2s_get_fsync_width(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = 
snd_soc_component_get_drvdata(compnt); + + ucontrol->value.integer.value[0] = i2s->fsync_width; + + return 0; +} + +static int tegra210_i2s_put_fsync_width(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + int value = ucontrol->value.integer.value[0]; + + if (value == i2s->fsync_width) + return 0; + + i2s->fsync_width = value; + + /* + * Frame sync width is used only for FSYNC modes and not + * applicable for LRCK modes. Reset value for this field is "0", + * which means the width is one bit clock wide. + * The width requirement may depend on the codec and in such + * cases mixer control is used to update custom values. A value + * of "N" here means, width is "N + 1" bit clock wide. + */ + regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, + I2S_CTRL_FSYNC_WIDTH_MASK, + i2s->fsync_width << I2S_FSYNC_WIDTH_SHIFT); + + return 1; +} + +static int tegra210_i2s_cget_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.enumerated.item[0] = i2s->stereo_to_mono[I2S_TX_PATH]; + + return 0; +} + +static int tegra210_i2s_cput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == i2s->stereo_to_mono[I2S_TX_PATH]) + return 0; + + i2s->stereo_to_mono[I2S_TX_PATH] = value; + + return 1; +} + +static int tegra210_i2s_cget_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = 
snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.enumerated.item[0] = i2s->mono_to_stereo[I2S_TX_PATH]; + + return 0; +} + +static int tegra210_i2s_cput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == i2s->mono_to_stereo[I2S_TX_PATH]) + return 0; + + i2s->mono_to_stereo[I2S_TX_PATH] = value; + + return 1; +} + +static int tegra210_i2s_pget_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.enumerated.item[0] = i2s->stereo_to_mono[I2S_RX_PATH]; + + return 0; +} + +static int tegra210_i2s_pput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == i2s->stereo_to_mono[I2S_RX_PATH]) + return 0; + + i2s->stereo_to_mono[I2S_RX_PATH] = value; + + return 1; +} + +static int tegra210_i2s_pget_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.enumerated.item[0] = i2s->mono_to_stereo[I2S_RX_PATH]; + + return 0; +} + +static int tegra210_i2s_pput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); 
+ struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == i2s->mono_to_stereo[I2S_RX_PATH]) + return 0; + + i2s->mono_to_stereo[I2S_RX_PATH] = value; + + return 1; +} + +static int tegra210_i2s_pget_fifo_th(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.integer.value[0] = i2s->rx_fifo_th; + + return 0; +} + +static int tegra210_i2s_pput_fifo_th(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + int value = ucontrol->value.integer.value[0]; + + if (value == i2s->rx_fifo_th) + return 0; + + i2s->rx_fifo_th = value; + + return 1; +} + +static int tegra210_i2s_get_bclk_ratio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + + ucontrol->value.integer.value[0] = i2s->bclk_ratio; + + return 0; +} + +static int tegra210_i2s_put_bclk_ratio(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); + int value = ucontrol->value.integer.value[0]; + + if (value == i2s->bclk_ratio) + return 0; + + i2s->bclk_ratio = value; + + return 1; +} + static int tegra210_i2s_set_dai_bclk_ratio(struct snd_soc_dai *dai, unsigned int ratio) { @@ -312,85 +535,6 @@ static int tegra210_i2s_set_dai_bclk_ratio(struct snd_soc_dai *dai, return 0; } -static int tegra210_i2s_get_control(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value 
*ucontrol) -{ - struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); - struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); - - if (strstr(kcontrol->id.name, "Loopback")) - ucontrol->value.integer.value[0] = i2s->loopback; - else if (strstr(kcontrol->id.name, "FSYNC Width")) - ucontrol->value.integer.value[0] = i2s->fsync_width; - else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) - ucontrol->value.enumerated.item[0] = - i2s->stereo_to_mono[I2S_TX_PATH]; - else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) - ucontrol->value.enumerated.item[0] = - i2s->mono_to_stereo[I2S_TX_PATH]; - else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) - ucontrol->value.enumerated.item[0] = - i2s->stereo_to_mono[I2S_RX_PATH]; - else if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) - ucontrol->value.enumerated.item[0] = - i2s->mono_to_stereo[I2S_RX_PATH]; - else if (strstr(kcontrol->id.name, "Playback FIFO Threshold")) - ucontrol->value.integer.value[0] = i2s->rx_fifo_th; - else if (strstr(kcontrol->id.name, "BCLK Ratio")) - ucontrol->value.integer.value[0] = i2s->bclk_ratio; - - return 0; -} - -static int tegra210_i2s_put_control(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) -{ - struct snd_soc_component *compnt = snd_soc_kcontrol_component(kcontrol); - struct tegra210_i2s *i2s = snd_soc_component_get_drvdata(compnt); - - if (strstr(kcontrol->id.name, "Loopback")) { - i2s->loopback = ucontrol->value.integer.value[0]; - - regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, - I2S_CTRL_LPBK_MASK, - i2s->loopback << I2S_CTRL_LPBK_SHIFT); - - } else if (strstr(kcontrol->id.name, "FSYNC Width")) { - /* - * Frame sync width is used only for FSYNC modes and not - * applicable for LRCK modes. Reset value for this field is "0", - * which means the width is one bit clock wide. - * The width requirement may depend on the codec and in such - * cases mixer control is used to update custom values. 
A value - * of "N" here means, width is "N + 1" bit clock wide. - */ - i2s->fsync_width = ucontrol->value.integer.value[0]; - - regmap_update_bits(i2s->regmap, TEGRA210_I2S_CTRL, - I2S_CTRL_FSYNC_WIDTH_MASK, - i2s->fsync_width << I2S_FSYNC_WIDTH_SHIFT); - - } else if (strstr(kcontrol->id.name, "Capture Stereo To Mono")) { - i2s->stereo_to_mono[I2S_TX_PATH] = - ucontrol->value.enumerated.item[0]; - } else if (strstr(kcontrol->id.name, "Capture Mono To Stereo")) { - i2s->mono_to_stereo[I2S_TX_PATH] = - ucontrol->value.enumerated.item[0]; - } else if (strstr(kcontrol->id.name, "Playback Stereo To Mono")) { - i2s->stereo_to_mono[I2S_RX_PATH] = - ucontrol->value.enumerated.item[0]; - } else if (strstr(kcontrol->id.name, "Playback Mono To Stereo")) { - i2s->mono_to_stereo[I2S_RX_PATH] = - ucontrol->value.enumerated.item[0]; - } else if (strstr(kcontrol->id.name, "Playback FIFO Threshold")) { - i2s->rx_fifo_th = ucontrol->value.integer.value[0]; - } else if (strstr(kcontrol->id.name, "BCLK Ratio")) { - i2s->bclk_ratio = ucontrol->value.integer.value[0]; - } - - return 0; -} - static int tegra210_i2s_set_timing_params(struct device *dev, unsigned int sample_size, unsigned int srate, @@ -604,22 +748,28 @@ static const struct soc_enum tegra210_i2s_stereo_conv_enum = tegra210_i2s_stereo_conv_text); static const struct snd_kcontrol_new tegra210_i2s_controls[] = { - SOC_SINGLE_EXT("Loopback", 0, 0, 1, 0, tegra210_i2s_get_control, - tegra210_i2s_put_control), - SOC_SINGLE_EXT("FSYNC Width", 0, 0, 255, 0, tegra210_i2s_get_control, - tegra210_i2s_put_control), + SOC_SINGLE_EXT("Loopback", 0, 0, 1, 0, tegra210_i2s_get_loopback, + tegra210_i2s_put_loopback), + SOC_SINGLE_EXT("FSYNC Width", 0, 0, 255, 0, + tegra210_i2s_get_fsync_width, + tegra210_i2s_put_fsync_width), SOC_ENUM_EXT("Capture Stereo To Mono", tegra210_i2s_stereo_conv_enum, - tegra210_i2s_get_control, tegra210_i2s_put_control), + tegra210_i2s_cget_stereo_to_mono, + tegra210_i2s_cput_stereo_to_mono), SOC_ENUM_EXT("Capture 
Mono To Stereo", tegra210_i2s_mono_conv_enum, - tegra210_i2s_get_control, tegra210_i2s_put_control), + tegra210_i2s_cget_mono_to_stereo, + tegra210_i2s_cput_mono_to_stereo), SOC_ENUM_EXT("Playback Stereo To Mono", tegra210_i2s_stereo_conv_enum, - tegra210_i2s_get_control, tegra210_i2s_put_control), + tegra210_i2s_pget_stereo_to_mono, + tegra210_i2s_pput_stereo_to_mono), SOC_ENUM_EXT("Playback Mono To Stereo", tegra210_i2s_mono_conv_enum, - tegra210_i2s_get_control, tegra210_i2s_put_control), + tegra210_i2s_pget_mono_to_stereo, + tegra210_i2s_pput_mono_to_stereo), SOC_SINGLE_EXT("Playback FIFO Threshold", 0, 0, I2S_RX_FIFO_DEPTH - 1, - 0, tegra210_i2s_get_control, tegra210_i2s_put_control), - SOC_SINGLE_EXT("BCLK Ratio", 0, 0, INT_MAX, 0, tegra210_i2s_get_control, - tegra210_i2s_put_control), + 0, tegra210_i2s_pget_fifo_th, tegra210_i2s_pput_fifo_th), + SOC_SINGLE_EXT("BCLK Ratio", 0, 0, INT_MAX, 0, + tegra210_i2s_get_bclk_ratio, + tegra210_i2s_put_bclk_ratio), }; static const struct snd_soc_dapm_widget tegra210_i2s_widgets[] = { From a347dfa10262fa0a10e2b1970ea0194e3d4a3251 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:04 +0530 Subject: [PATCH 021/231] ASoC: tegra: Fix kcontrol put callback in DMIC The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Update the DMIC driver accordingly. 
Fixes: 8c8ff982e9e2 ("ASoC: tegra: Add Tegra210 based DMIC driver") Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-10-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_dmic.c | 185 ++++++++++++++++++++++++++------ 1 file changed, 150 insertions(+), 35 deletions(-) diff --git a/sound/soc/tegra/tegra210_dmic.c b/sound/soc/tegra/tegra210_dmic.c index ee2aedb0440f..db95794530f4 100644 --- a/sound/soc/tegra/tegra210_dmic.c +++ b/sound/soc/tegra/tegra210_dmic.c @@ -156,50 +156,162 @@ static int tegra210_dmic_hw_params(struct snd_pcm_substream *substream, return 0; } -static int tegra210_dmic_get_control(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) +static int tegra210_dmic_get_boost_gain(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); - if (strstr(kcontrol->id.name, "Boost Gain Volume")) - ucontrol->value.integer.value[0] = dmic->boost_gain; - else if (strstr(kcontrol->id.name, "Channel Select")) - ucontrol->value.enumerated.item[0] = dmic->ch_select; - else if (strstr(kcontrol->id.name, "Mono To Stereo")) - ucontrol->value.enumerated.item[0] = dmic->mono_to_stereo; - else if (strstr(kcontrol->id.name, "Stereo To Mono")) - ucontrol->value.enumerated.item[0] = dmic->stereo_to_mono; - else if (strstr(kcontrol->id.name, "OSR Value")) - ucontrol->value.enumerated.item[0] = dmic->osr_val; - else if (strstr(kcontrol->id.name, "LR Polarity Select")) - ucontrol->value.enumerated.item[0] = dmic->lrsel; + ucontrol->value.integer.value[0] = dmic->boost_gain; return 0; } -static int tegra210_dmic_put_control(struct snd_kcontrol *kcontrol, +static int tegra210_dmic_put_boost_gain(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + 
struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + int value = ucontrol->value.integer.value[0]; + + if (value == dmic->boost_gain) + return 0; + + dmic->boost_gain = value; + + return 1; +} + +static int tegra210_dmic_get_ch_select(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + + ucontrol->value.enumerated.item[0] = dmic->ch_select; + + return 0; +} + +static int tegra210_dmic_put_ch_select(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dmic->ch_select) + return 0; + + dmic->ch_select = value; + + return 1; +} + +static int tegra210_dmic_get_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + + ucontrol->value.enumerated.item[0] = dmic->mono_to_stereo; + + return 0; +} + +static int tegra210_dmic_put_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dmic->mono_to_stereo) + return 0; + + dmic->mono_to_stereo = value; + + return 1; +} + +static int tegra210_dmic_get_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic 
*dmic = snd_soc_component_get_drvdata(comp); + + ucontrol->value.enumerated.item[0] = dmic->stereo_to_mono; + + return 0; +} + +static int tegra210_dmic_put_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dmic->stereo_to_mono) + return 0; + + dmic->stereo_to_mono = value; + + return 1; +} + +static int tegra210_dmic_get_osr_val(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); - if (strstr(kcontrol->id.name, "Boost Gain Volume")) - dmic->boost_gain = ucontrol->value.integer.value[0]; - else if (strstr(kcontrol->id.name, "Channel Select")) - dmic->ch_select = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Mono To Stereo")) - dmic->mono_to_stereo = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Stereo To Mono")) - dmic->stereo_to_mono = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "OSR Value")) - dmic->osr_val = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "LR Polarity Select")) - dmic->lrsel = ucontrol->value.enumerated.item[0]; + ucontrol->value.enumerated.item[0] = dmic->osr_val; return 0; } +static int tegra210_dmic_put_osr_val(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dmic->osr_val) + return 0; + + dmic->osr_val = value; + + return 1; +} + +static int tegra210_dmic_get_pol_sel(struct snd_kcontrol 
*kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + + ucontrol->value.enumerated.item[0] = dmic->lrsel; + + return 0; +} + +static int tegra210_dmic_put_pol_sel(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *comp = snd_soc_kcontrol_component(kcontrol); + struct tegra210_dmic *dmic = snd_soc_component_get_drvdata(comp); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dmic->lrsel) + return 0; + + dmic->lrsel = value; + + return 1; +} + static const struct snd_soc_dai_ops tegra210_dmic_dai_ops = { .hw_params = tegra210_dmic_hw_params, }; @@ -286,19 +398,22 @@ static const struct soc_enum tegra210_dmic_lrsel_enum = static const struct snd_kcontrol_new tegra210_dmic_controls[] = { SOC_SINGLE_EXT("Boost Gain Volume", 0, 0, MAX_BOOST_GAIN, 0, - tegra210_dmic_get_control, tegra210_dmic_put_control), + tegra210_dmic_get_boost_gain, + tegra210_dmic_put_boost_gain), SOC_ENUM_EXT("Channel Select", tegra210_dmic_ch_enum, - tegra210_dmic_get_control, tegra210_dmic_put_control), + tegra210_dmic_get_ch_select, tegra210_dmic_put_ch_select), SOC_ENUM_EXT("Mono To Stereo", - tegra210_dmic_mono_conv_enum, tegra210_dmic_get_control, - tegra210_dmic_put_control), + tegra210_dmic_mono_conv_enum, + tegra210_dmic_get_mono_to_stereo, + tegra210_dmic_put_mono_to_stereo), SOC_ENUM_EXT("Stereo To Mono", - tegra210_dmic_stereo_conv_enum, tegra210_dmic_get_control, - tegra210_dmic_put_control), + tegra210_dmic_stereo_conv_enum, + tegra210_dmic_get_stereo_to_mono, + tegra210_dmic_put_stereo_to_mono), SOC_ENUM_EXT("OSR Value", tegra210_dmic_osr_enum, - tegra210_dmic_get_control, tegra210_dmic_put_control), + tegra210_dmic_get_osr_val, tegra210_dmic_put_osr_val), SOC_ENUM_EXT("LR Polarity Select", tegra210_dmic_lrsel_enum, - tegra210_dmic_get_control, 
tegra210_dmic_put_control), + tegra210_dmic_get_pol_sel, tegra210_dmic_put_pol_sel), }; static const struct snd_soc_component_driver tegra210_dmic_compnt = { From d6202a57e79d102271d38c34481fedc9d4c79694 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:05 +0530 Subject: [PATCH 022/231] ASoC: tegra: Fix kcontrol put callback in DSPK The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Update the DSPK driver accordingly. Fixes: 327ef6470266 ("ASoC: tegra: Add Tegra186 based DSPK driver") Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Signed-off-by: Sameer Pujar Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-11-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra186_dspk.c | 178 ++++++++++++++++++++++++++------ 1 file changed, 146 insertions(+), 32 deletions(-) diff --git a/sound/soc/tegra/tegra186_dspk.c b/sound/soc/tegra/tegra186_dspk.c index 67269e77d6e8..a74c980ee775 100644 --- a/sound/soc/tegra/tegra186_dspk.c +++ b/sound/soc/tegra/tegra186_dspk.c @@ -26,50 +26,162 @@ static const struct reg_default tegra186_dspk_reg_defaults[] = { { TEGRA186_DSPK_CODEC_CTRL, 0x03000000 }, }; -static int tegra186_dspk_get_control(struct snd_kcontrol *kcontrol, +static int tegra186_dspk_get_fifo_th(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); - if (strstr(kcontrol->id.name, "FIFO Threshold")) - ucontrol->value.integer.value[0] = dspk->rx_fifo_th; - else if (strstr(kcontrol->id.name, "OSR Value")) - ucontrol->value.enumerated.item[0] = dspk->osr_val; - else if (strstr(kcontrol->id.name, "LR Polarity Select")) - ucontrol->value.enumerated.item[0] = dspk->lrsel; - else if 
(strstr(kcontrol->id.name, "Channel Select")) - ucontrol->value.enumerated.item[0] = dspk->ch_sel; - else if (strstr(kcontrol->id.name, "Mono To Stereo")) - ucontrol->value.enumerated.item[0] = dspk->mono_to_stereo; - else if (strstr(kcontrol->id.name, "Stereo To Mono")) - ucontrol->value.enumerated.item[0] = dspk->stereo_to_mono; + ucontrol->value.integer.value[0] = dspk->rx_fifo_th; return 0; } -static int tegra186_dspk_put_control(struct snd_kcontrol *kcontrol, +static int tegra186_dspk_put_fifo_th(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + int value = ucontrol->value.integer.value[0]; + + if (value == dspk->rx_fifo_th) + return 0; + + dspk->rx_fifo_th = value; + + return 1; +} + +static int tegra186_dspk_get_osr_val(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); - if (strstr(kcontrol->id.name, "FIFO Threshold")) - dspk->rx_fifo_th = ucontrol->value.integer.value[0]; - else if (strstr(kcontrol->id.name, "OSR Value")) - dspk->osr_val = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "LR Polarity Select")) - dspk->lrsel = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Channel Select")) - dspk->ch_sel = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Mono To Stereo")) - dspk->mono_to_stereo = ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Stereo To Mono")) - dspk->stereo_to_mono = ucontrol->value.enumerated.item[0]; + ucontrol->value.enumerated.item[0] = dspk->osr_val; return 0; } +static int tegra186_dspk_put_osr_val(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = 
snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dspk->osr_val) + return 0; + + dspk->osr_val = value; + + return 1; +} + +static int tegra186_dspk_get_pol_sel(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + + ucontrol->value.enumerated.item[0] = dspk->lrsel; + + return 0; +} + +static int tegra186_dspk_put_pol_sel(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dspk->lrsel) + return 0; + + dspk->lrsel = value; + + return 1; +} + +static int tegra186_dspk_get_ch_sel(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + + ucontrol->value.enumerated.item[0] = dspk->ch_sel; + + return 0; +} + +static int tegra186_dspk_put_ch_sel(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dspk->ch_sel) + return 0; + + dspk->ch_sel = value; + + return 1; +} + +static int tegra186_dspk_get_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + + 
ucontrol->value.enumerated.item[0] = dspk->mono_to_stereo; + + return 0; +} + +static int tegra186_dspk_put_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dspk->mono_to_stereo) + return 0; + + dspk->mono_to_stereo = value; + + return 1; +} + +static int tegra186_dspk_get_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + + ucontrol->value.enumerated.item[0] = dspk->stereo_to_mono; + + return 0; +} + +static int tegra186_dspk_put_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *codec = snd_soc_kcontrol_component(kcontrol); + struct tegra186_dspk *dspk = snd_soc_component_get_drvdata(codec); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == dspk->stereo_to_mono) + return 0; + + dspk->stereo_to_mono = value; + + return 1; +} + static int __maybe_unused tegra186_dspk_runtime_suspend(struct device *dev) { struct tegra186_dspk *dspk = dev_get_drvdata(dev); @@ -278,17 +390,19 @@ static const struct soc_enum tegra186_dspk_lrsel_enum = static const struct snd_kcontrol_new tegrat186_dspk_controls[] = { SOC_SINGLE_EXT("FIFO Threshold", SND_SOC_NOPM, 0, TEGRA186_DSPK_RX_FIFO_DEPTH - 1, 0, - tegra186_dspk_get_control, tegra186_dspk_put_control), + tegra186_dspk_get_fifo_th, tegra186_dspk_put_fifo_th), SOC_ENUM_EXT("OSR Value", tegra186_dspk_osr_enum, - tegra186_dspk_get_control, tegra186_dspk_put_control), + tegra186_dspk_get_osr_val, tegra186_dspk_put_osr_val), SOC_ENUM_EXT("LR Polarity Select", tegra186_dspk_lrsel_enum, - tegra186_dspk_get_control, 
tegra186_dspk_put_control), + tegra186_dspk_get_pol_sel, tegra186_dspk_put_pol_sel), SOC_ENUM_EXT("Channel Select", tegra186_dspk_ch_sel_enum, - tegra186_dspk_get_control, tegra186_dspk_put_control), + tegra186_dspk_get_ch_sel, tegra186_dspk_put_ch_sel), SOC_ENUM_EXT("Mono To Stereo", tegra186_dspk_mono_conv_enum, - tegra186_dspk_get_control, tegra186_dspk_put_control), + tegra186_dspk_get_mono_to_stereo, + tegra186_dspk_put_mono_to_stereo), SOC_ENUM_EXT("Stereo To Mono", tegra186_dspk_stereo_conv_enum, - tegra186_dspk_get_control, tegra186_dspk_put_control), + tegra186_dspk_get_stereo_to_mono, + tegra186_dspk_put_stereo_to_mono), }; static const struct snd_soc_component_driver tegra186_dspk_cmpnt = { From a4e37950c9e9b126f9cbee79b8ab94a94646dcf1 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:06 +0530 Subject: [PATCH 023/231] ASoC: tegra: Fix kcontrol put callback in AHUB The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Update the AHUB driver accordingly. 
Fixes: 16e1bcc2caf4 ("ASoC: tegra: Add Tegra210 based AHUB driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-12-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_ahub.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sound/soc/tegra/tegra210_ahub.c b/sound/soc/tegra/tegra210_ahub.c index a1989eae2b52..388b815443c7 100644 --- a/sound/soc/tegra/tegra210_ahub.c +++ b/sound/soc/tegra/tegra210_ahub.c @@ -62,6 +62,7 @@ static int tegra_ahub_put_value_enum(struct snd_kcontrol *kctl, unsigned int *item = uctl->value.enumerated.item; unsigned int value = e->values[item[0]]; unsigned int i, bit_pos, reg_idx = 0, reg_val = 0; + int change = 0; if (item[0] >= e->items) return -EINVAL; @@ -86,12 +87,14 @@ static int tegra_ahub_put_value_enum(struct snd_kcontrol *kctl, /* Update widget power if state has changed */ if (snd_soc_component_test_bits(cmpnt, update[i].reg, - update[i].mask, update[i].val)) - snd_soc_dapm_mux_update_power(dapm, kctl, item[0], e, - &update[i]); + update[i].mask, + update[i].val)) + change |= snd_soc_dapm_mux_update_power(dapm, kctl, + item[0], e, + &update[i]); } - return 0; + return change; } static struct snd_soc_dai_driver tegra210_ahub_dais[] = { From c7b34b51bbac6ab64e873f6c9bd43564a7442e33 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:07 +0530 Subject: [PATCH 024/231] ASoC: tegra: Fix kcontrol put callback in MVC The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Filter out duplicate updates in MVC driver. 
Fixes: e539891f9687 ("ASoC: tegra: Add Tegra210 based MVC driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-13-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_mvc.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/sound/soc/tegra/tegra210_mvc.c b/sound/soc/tegra/tegra210_mvc.c index b7e317065251..85b155887ec2 100644 --- a/sound/soc/tegra/tegra210_mvc.c +++ b/sound/soc/tegra/tegra210_mvc.c @@ -136,7 +136,7 @@ static int tegra210_mvc_put_mute(struct snd_kcontrol *kcontrol, struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_mvc *mvc = snd_soc_component_get_drvdata(cmpnt); unsigned int value; - u8 mute_mask; + u8 new_mask, old_mask; int err; pm_runtime_get_sync(cmpnt->dev); @@ -148,11 +148,19 @@ static int tegra210_mvc_put_mute(struct snd_kcontrol *kcontrol, if (err < 0) goto end; - mute_mask = ucontrol->value.integer.value[0]; + regmap_read(mvc->regmap, TEGRA210_MVC_CTRL, &value); + + old_mask = (value >> TEGRA210_MVC_MUTE_SHIFT) & TEGRA210_MUTE_MASK_EN; + new_mask = ucontrol->value.integer.value[0]; + + if (new_mask == old_mask) { + err = 0; + goto end; + } err = regmap_update_bits(mvc->regmap, mc->reg, TEGRA210_MVC_MUTE_MASK, - mute_mask << TEGRA210_MVC_MUTE_SHIFT); + new_mask << TEGRA210_MVC_MUTE_SHIFT); if (err < 0) goto end; @@ -195,7 +203,7 @@ static int tegra210_mvc_put_vol(struct snd_kcontrol *kcontrol, unsigned int reg = mc->reg; unsigned int value; u8 chan; - int err; + int err, old_volume; pm_runtime_get_sync(cmpnt->dev); @@ -207,10 +215,16 @@ static int tegra210_mvc_put_vol(struct snd_kcontrol *kcontrol, goto end; chan = (reg - TEGRA210_MVC_TARGET_VOL) / REG_SIZE; + old_volume = mvc->volume[chan]; tegra210_mvc_conv_vol(mvc, chan, ucontrol->value.integer.value[0]); + if (mvc->volume[chan] == old_volume) { + err = 0; + goto end; 
+ } + /* Configure init volume same as target volume */ regmap_write(mvc->regmap, TEGRA210_MVC_REG_OFFSET(TEGRA210_MVC_INIT_VOL, chan), From b31f8febd1850bbe74aba184779ec54552d92752 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:08 +0530 Subject: [PATCH 025/231] ASoC: tegra: Fix kcontrol put callback in SFC The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Filter out duplicate updates in SFC driver. Fixes: b2f74ec53a6c ("ASoC: tegra: Add Tegra210 based SFC driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-14-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_sfc.c | 124 ++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 31 deletions(-) diff --git a/sound/soc/tegra/tegra210_sfc.c b/sound/soc/tegra/tegra210_sfc.c index cb592ef55bd3..7a2227ed3df6 100644 --- a/sound/soc/tegra/tegra210_sfc.c +++ b/sound/soc/tegra/tegra210_sfc.c @@ -3244,49 +3244,107 @@ static int tegra210_sfc_init(struct snd_soc_dapm_widget *w, return tegra210_sfc_write_coeff_ram(cmpnt); } -static int tegra210_sfc_get_control(struct snd_kcontrol *kcontrol, +static int tegra210_sfc_iget_stereo_to_mono(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); - if (strstr(kcontrol->id.name, "Input Stereo To Mono")) - ucontrol->value.enumerated.item[0] = - sfc->stereo_to_mono[SFC_RX_PATH]; - else if (strstr(kcontrol->id.name, "Input Mono To Stereo")) - ucontrol->value.enumerated.item[0] = - sfc->mono_to_stereo[SFC_RX_PATH]; - else if (strstr(kcontrol->id.name, "Output Stereo To Mono")) - 
ucontrol->value.enumerated.item[0] = - sfc->stereo_to_mono[SFC_TX_PATH]; - else if (strstr(kcontrol->id.name, "Output Mono To Stereo")) - ucontrol->value.enumerated.item[0] = - sfc->mono_to_stereo[SFC_TX_PATH]; + ucontrol->value.enumerated.item[0] = sfc->stereo_to_mono[SFC_RX_PATH]; return 0; } -static int tegra210_sfc_put_control(struct snd_kcontrol *kcontrol, +static int tegra210_sfc_iput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == sfc->stereo_to_mono[SFC_RX_PATH]) + return 0; + + sfc->stereo_to_mono[SFC_RX_PATH] = value; + + return 1; +} + +static int tegra210_sfc_iget_mono_to_stereo(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); - if (strstr(kcontrol->id.name, "Input Stereo To Mono")) - sfc->stereo_to_mono[SFC_RX_PATH] = - ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Input Mono To Stereo")) - sfc->mono_to_stereo[SFC_RX_PATH] = - ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Output Stereo To Mono")) - sfc->stereo_to_mono[SFC_TX_PATH] = - ucontrol->value.enumerated.item[0]; - else if (strstr(kcontrol->id.name, "Output Mono To Stereo")) - sfc->mono_to_stereo[SFC_TX_PATH] = - ucontrol->value.enumerated.item[0]; - else + ucontrol->value.enumerated.item[0] = sfc->mono_to_stereo[SFC_RX_PATH]; + + return 0; +} + +static int tegra210_sfc_iput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + unsigned int value = 
ucontrol->value.enumerated.item[0]; + + if (value == sfc->mono_to_stereo[SFC_RX_PATH]) return 0; + sfc->mono_to_stereo[SFC_RX_PATH] = value; + + return 1; +} + +static int tegra210_sfc_oget_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + + ucontrol->value.enumerated.item[0] = sfc->stereo_to_mono[SFC_TX_PATH]; + + return 0; +} + +static int tegra210_sfc_oput_stereo_to_mono(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == sfc->stereo_to_mono[SFC_TX_PATH]) + return 0; + + sfc->stereo_to_mono[SFC_TX_PATH] = value; + + return 1; +} + +static int tegra210_sfc_oget_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + + ucontrol->value.enumerated.item[0] = sfc->mono_to_stereo[SFC_TX_PATH]; + + return 0; +} + +static int tegra210_sfc_oput_mono_to_stereo(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); + struct tegra210_sfc *sfc = snd_soc_component_get_drvdata(cmpnt); + unsigned int value = ucontrol->value.enumerated.item[0]; + + if (value == sfc->mono_to_stereo[SFC_TX_PATH]) + return 0; + + sfc->mono_to_stereo[SFC_TX_PATH] = value; + return 1; } @@ -3387,13 +3445,17 @@ static const struct soc_enum tegra210_sfc_mono_conv_enum = static const struct snd_kcontrol_new tegra210_sfc_controls[] = { SOC_ENUM_EXT("Input Stereo To Mono", tegra210_sfc_stereo_conv_enum, - tegra210_sfc_get_control, 
tegra210_sfc_put_control), + tegra210_sfc_iget_stereo_to_mono, + tegra210_sfc_iput_stereo_to_mono), SOC_ENUM_EXT("Input Mono To Stereo", tegra210_sfc_mono_conv_enum, - tegra210_sfc_get_control, tegra210_sfc_put_control), + tegra210_sfc_iget_mono_to_stereo, + tegra210_sfc_iput_mono_to_stereo), SOC_ENUM_EXT("Output Stereo To Mono", tegra210_sfc_stereo_conv_enum, - tegra210_sfc_get_control, tegra210_sfc_put_control), + tegra210_sfc_oget_stereo_to_mono, + tegra210_sfc_oput_stereo_to_mono), SOC_ENUM_EXT("Output Mono To Stereo", tegra210_sfc_mono_conv_enum, - tegra210_sfc_get_control, tegra210_sfc_put_control), + tegra210_sfc_oget_mono_to_stereo, + tegra210_sfc_oput_mono_to_stereo), }; static const struct snd_soc_component_driver tegra210_sfc_cmpnt = { From 8db78ace1ba897302131422ce15c5eb04510cef8 Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:09 +0530 Subject: [PATCH 026/231] ASoC: tegra: Fix kcontrol put callback in AMX The kcontrol put callback is expected to return 1 when there is change in HW or when the update is acknowledged by driver. This would ensure that change notifications are sent to subscribed applications. Filter out duplicate updates in AMX driver. 
Fixes: 77f7df346c45 ("ASoC: tegra: Add Tegra210 based AMX driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-15-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_amx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/tegra/tegra210_amx.c b/sound/soc/tegra/tegra210_amx.c index af9bddfc3120..689576302ede 100644 --- a/sound/soc/tegra/tegra210_amx.c +++ b/sound/soc/tegra/tegra210_amx.c @@ -222,6 +222,9 @@ static int tegra210_amx_put_byte_map(struct snd_kcontrol *kcontrol, int reg = mc->reg; int value = ucontrol->value.integer.value[0]; + if (value == bytes_map[reg]) + return 0; + if (value >= 0 && value <= 255) { /* Update byte map and enable slot */ bytes_map[reg] = value; From 3c97881b8c8a2aa8afd4d7a379b7ff03884c9e4a Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:10 +0530 Subject: [PATCH 027/231] ASoC: tegra: Fix kcontrol put callback in ADX The kcontrol put callback is expected to return 1 when there is a change in HW or when the update is acknowledged by the driver. This would ensure that change notifications are sent to subscribed applications. Filter out duplicate updates in the ADX driver. 
Fixes: a99ab6f395a9 ("ASoC: tegra: Add Tegra210 based ADX driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-16-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_adx.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/tegra/tegra210_adx.c b/sound/soc/tegra/tegra210_adx.c index d7c7849c2f92..933c4503fe50 100644 --- a/sound/soc/tegra/tegra210_adx.c +++ b/sound/soc/tegra/tegra210_adx.c @@ -193,6 +193,9 @@ static int tegra210_adx_put_byte_map(struct snd_kcontrol *kcontrol, struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value;; + if (value == bytes_map[mc->reg]) + return 0; + if (value >= 0 && value <= 255) { /* update byte map and enable slot */ bytes_map[mc->reg] = value; From 8cf72c4e75a0265135d34a8e29224b4c1e92b51c Mon Sep 17 00:00:00 2001 From: Sameer Pujar Date: Thu, 18 Nov 2021 12:37:11 +0530 Subject: [PATCH 028/231] ASoC: tegra: Fix kcontrol put callback in Mixer The kcontrol put callback is expected to return 1 when there is a change in HW or when the update is acknowledged by the driver. This would ensure that change notifications are sent to subscribed applications. Filter out duplicate updates in the Mixer driver. 
Fixes: 05bb3d5ec64a ("ASoC: tegra: Add Tegra210 based Mixer driver") Signed-off-by: Sameer Pujar Suggested-by: Jaroslav Kysela Suggested-by: Mark Brown Reviewed-by: Takashi Iwai Link: https://lore.kernel.org/r/1637219231-406-17-git-send-email-spujar@nvidia.com Signed-off-by: Mark Brown --- sound/soc/tegra/tegra210_mixer.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/sound/soc/tegra/tegra210_mixer.c b/sound/soc/tegra/tegra210_mixer.c index 55e61776c565..51d375573cfa 100644 --- a/sound/soc/tegra/tegra210_mixer.c +++ b/sound/soc/tegra/tegra210_mixer.c @@ -192,24 +192,24 @@ static int tegra210_mixer_get_gain(struct snd_kcontrol *kcontrol, return 0; } -static int tegra210_mixer_put_gain(struct snd_kcontrol *kcontrol, - struct snd_ctl_elem_value *ucontrol) +static int tegra210_mixer_apply_gain(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol, + bool instant_gain) { struct soc_mixer_control *mc = (struct soc_mixer_control *)kcontrol->private_value; struct snd_soc_component *cmpnt = snd_soc_kcontrol_component(kcontrol); struct tegra210_mixer *mixer = snd_soc_component_get_drvdata(cmpnt); unsigned int reg = mc->reg, id; - bool instant_gain = false; int err; - if (strstr(kcontrol->id.name, "Instant Gain Volume")) - instant_gain = true; - /* Save gain value for specific MIXER input */ id = (reg - TEGRA210_MIXER_GAIN_CFG_RAM_ADDR_0) / TEGRA210_MIXER_GAIN_CFG_RAM_ADDR_STRIDE; + if (mixer->gain_value[id] == ucontrol->value.integer.value[0]) + return 0; + mixer->gain_value[id] = ucontrol->value.integer.value[0]; err = tegra210_mixer_configure_gain(cmpnt, id, instant_gain); @@ -221,6 +221,18 @@ static int tegra210_mixer_put_gain(struct snd_kcontrol *kcontrol, return 1; } +static int tegra210_mixer_put_gain(struct snd_kcontrol *kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return tegra210_mixer_apply_gain(kcontrol, ucontrol, false); +} + +static int tegra210_mixer_put_instant_gain(struct snd_kcontrol 
*kcontrol, + struct snd_ctl_elem_value *ucontrol) +{ + return tegra210_mixer_apply_gain(kcontrol, ucontrol, true); +} + static int tegra210_mixer_set_audio_cif(struct tegra210_mixer *mixer, struct snd_pcm_hw_params *params, unsigned int reg, @@ -388,7 +400,7 @@ ADDER_CTRL_DECL(adder5, TEGRA210_MIXER_TX5_ADDER_CONFIG); SOC_SINGLE_EXT("RX" #id " Instant Gain Volume", \ MIXER_GAIN_CFG_RAM_ADDR((id) - 1), 0, \ 0x20000, 0, tegra210_mixer_get_gain, \ - tegra210_mixer_put_gain), + tegra210_mixer_put_instant_gain), /* Volume controls for all MIXER inputs */ static const struct snd_kcontrol_new tegra210_mixer_gain_ctls[] = { From 73185a13773af10264f9d8ee70386c01c849ff2c Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Thu, 11 Nov 2021 23:24:52 +0100 Subject: [PATCH 029/231] scsi: ufs: ufshpb: Fix warning in ufshpb_set_hpb_read_to_upiu() Fix the following sparse warnings in ufshpb_set_hpb_read_to_upiu(): sparse warnings: (new ones prefixed by >>) drivers/scsi/ufs/ufshpb.c:335:27: sparse: sparse: cast from restricted __be64 drivers/scsi/ufs/ufshpb.c:335:25: sparse: expected restricted __be64 [usertype] ppn_tmp drivers/scsi/ufs/ufshpb.c:335:25: sparse: got unsigned long long [usertype] Link: https://lore.kernel.org/r/20211111222452.384089-1-huobean@gmail.com Reported-by: kernel test robot Reviewed-by: Bart Van Assche Signed-off-by: Bean Huo Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ufs/ufshpb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshpb.c b/drivers/scsi/ufs/ufshpb.c index 2e31e1413826..ded5ba9b1466 100644 --- a/drivers/scsi/ufs/ufshpb.c +++ b/drivers/scsi/ufs/ufshpb.c @@ -331,7 +331,7 @@ ufshpb_set_hpb_read_to_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp, cdb[0] = UFSHPB_READ; if (hba->dev_quirks & UFS_DEVICE_QUIRK_SWAP_L2P_ENTRY_FOR_HPB_READ) - ppn_tmp = swab64(ppn); + ppn_tmp = (__force __be64)swab64((__force u64)ppn); /* ppn value is stored as big-endian in the host memory */ memcpy(&cdb[6], &ppn_tmp, sizeof(__be64)); From e11e285b9cd132db21568b5d29c291f590841944 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 9 Nov 2021 14:52:19 +0300 Subject: [PATCH 030/231] scsi: qla2xxx: edif: Fix off by one bug in qla_edif_app_getfcinfo() The > comparison needs to be >= to prevent accessing one element beyond the end of the app_reply->ports[] array. Link: https://lore.kernel.org/r/20211109115219.GE16587@kili Fixes: 7878f22a2e03 ("scsi: qla2xxx: edif: Add getfcinfo and statistic bsgs") Reviewed-by: Ewan D. Milne Reviewed-by: Himanshu Madhani Signed-off-by: Dan Carpenter Signed-off-by: Martin K. 
Petersen --- drivers/scsi/qla2xxx/qla_edif.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_edif.c b/drivers/scsi/qla2xxx/qla_edif.c index 2e37b189cb75..53d2b8562027 100644 --- a/drivers/scsi/qla2xxx/qla_edif.c +++ b/drivers/scsi/qla2xxx/qla_edif.c @@ -865,7 +865,7 @@ qla_edif_app_getfcinfo(scsi_qla_host_t *vha, struct bsg_job *bsg_job) "APP request entry - portid=%06x.\n", tdid.b24); /* Ran out of space */ - if (pcnt > app_req.num_ports) + if (pcnt >= app_req.num_ports) break; if (tdid.b24 != 0 && tdid.b24 != fcport->d_id.b24) From 36e07d7ede88a1f1ef8f0f209af5b7612324ac2c Mon Sep 17 00:00:00 2001 From: George Kennedy Date: Tue, 9 Nov 2021 13:57:27 -0500 Subject: [PATCH 031/231] scsi: scsi_debug: Fix type in min_t to avoid stack OOB Change min_t() to use type "u32" instead of type "int" to avoid stack out of bounds. With min_t() type "int" the values get sign extended and the larger value gets used causing stack out of bounds. BUG: KASAN: stack-out-of-bounds in memcpy include/linux/fortify-string.h:191 [inline] BUG: KASAN: stack-out-of-bounds in sg_copy_buffer+0x1de/0x240 lib/scatterlist.c:976 Read of size 127 at addr ffff888072607128 by task syz-executor.7/18707 CPU: 1 PID: 18707 Comm: syz-executor.7 Not tainted 5.15.0-syzk #1 Hardware name: Red Hat KVM, BIOS 1.13.0-2 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x89/0xb5 lib/dump_stack.c:106 print_address_description.constprop.9+0x28/0x160 mm/kasan/report.c:256 __kasan_report mm/kasan/report.c:442 [inline] kasan_report.cold.14+0x7d/0x117 mm/kasan/report.c:459 check_region_inline mm/kasan/generic.c:183 [inline] kasan_check_range+0x1a3/0x210 mm/kasan/generic.c:189 memcpy+0x23/0x60 mm/kasan/shadow.c:65 memcpy include/linux/fortify-string.h:191 [inline] sg_copy_buffer+0x1de/0x240 lib/scatterlist.c:976 sg_copy_from_buffer+0x33/0x40 lib/scatterlist.c:1000 fill_from_dev_buffer.part.34+0x82/0x130 drivers/scsi/scsi_debug.c:1162 fill_from_dev_buffer 
drivers/scsi/scsi_debug.c:1888 [inline] resp_readcap16+0x365/0x3b0 drivers/scsi/scsi_debug.c:1887 schedule_resp+0x4d8/0x1a70 drivers/scsi/scsi_debug.c:5478 scsi_debug_queuecommand+0x8c9/0x1ec0 drivers/scsi/scsi_debug.c:7533 scsi_dispatch_cmd drivers/scsi/scsi_lib.c:1520 [inline] scsi_queue_rq+0x16b0/0x2d40 drivers/scsi/scsi_lib.c:1699 blk_mq_dispatch_rq_list+0xb9b/0x2700 block/blk-mq.c:1639 __blk_mq_sched_dispatch_requests+0x28f/0x590 block/blk-mq-sched.c:325 blk_mq_sched_dispatch_requests+0x105/0x190 block/blk-mq-sched.c:358 __blk_mq_run_hw_queue+0xe5/0x150 block/blk-mq.c:1761 __blk_mq_delay_run_hw_queue+0x4f8/0x5c0 block/blk-mq.c:1838 blk_mq_run_hw_queue+0x18d/0x350 block/blk-mq.c:1891 blk_mq_sched_insert_request+0x3db/0x4e0 block/blk-mq-sched.c:474 blk_execute_rq_nowait+0x16b/0x1c0 block/blk-exec.c:62 sg_common_write.isra.18+0xeb3/0x2000 drivers/scsi/sg.c:836 sg_new_write.isra.19+0x570/0x8c0 drivers/scsi/sg.c:774 sg_ioctl_common+0x14d6/0x2710 drivers/scsi/sg.c:939 sg_ioctl+0xa2/0x180 drivers/scsi/sg.c:1165 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:874 [inline] __se_sys_ioctl fs/ioctl.c:860 [inline] __x64_sys_ioctl+0x19d/0x220 fs/ioctl.c:860 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3a/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae Link: https://lore.kernel.org/r/1636484247-21254-1-git-send-email-george.kennedy@oracle.com Reported-by: syzkaller Acked-by: Douglas Gilbert Signed-off-by: George Kennedy Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 1d0278da9041..ab01ef7d37f4 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -1189,7 +1189,7 @@ static int p_fill_from_dev_buffer(struct scsi_cmnd *scp, const void *arr, __func__, off_dst, scsi_bufflen(scp), act_len, scsi_get_resid(scp)); n = scsi_bufflen(scp) - (off_dst + act_len); - scsi_set_resid(scp, min_t(int, scsi_get_resid(scp), n)); + scsi_set_resid(scp, min_t(u32, scsi_get_resid(scp), n)); return 0; } @@ -1562,7 +1562,8 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) unsigned char pq_pdt; unsigned char *arr; unsigned char *cmd = scp->cmnd; - int alloc_len, n, ret; + u32 alloc_len, n; + int ret; bool have_wlun, is_disk, is_zbc, is_disk_zbc; alloc_len = get_unaligned_be16(cmd + 3); @@ -1585,7 +1586,8 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) kfree(arr); return check_condition_result; } else if (0x1 & cmd[1]) { /* EVPD bit set */ - int lu_id_num, port_group_id, target_dev_id, len; + int lu_id_num, port_group_id, target_dev_id; + u32 len; char lu_id_str[6]; int host_no = devip->sdbg_host->shost->host_no; @@ -1676,9 +1678,9 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) kfree(arr); return check_condition_result; } - len = min(get_unaligned_be16(arr + 2) + 4, alloc_len); + len = min_t(u32, get_unaligned_be16(arr + 2) + 4, alloc_len); ret = fill_from_dev_buffer(scp, arr, - min(len, SDEBUG_MAX_INQ_ARR_SZ)); + min_t(u32, len, SDEBUG_MAX_INQ_ARR_SZ)); kfree(arr); return ret; } @@ -1714,7 +1716,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) } put_unaligned_be16(0x2100, arr + n); /* SPL-4 no version claimed */ ret = fill_from_dev_buffer(scp, arr, - min_t(int, alloc_len, SDEBUG_LONG_INQ_SZ)); + min_t(u32, 
alloc_len, SDEBUG_LONG_INQ_SZ)); kfree(arr); return ret; } @@ -1729,8 +1731,8 @@ static int resp_requests(struct scsi_cmnd *scp, unsigned char *cmd = scp->cmnd; unsigned char arr[SCSI_SENSE_BUFFERSIZE]; /* assume >= 18 bytes */ bool dsense = !!(cmd[1] & 1); - int alloc_len = cmd[4]; - int len = 18; + u32 alloc_len = cmd[4]; + u32 len = 18; int stopped_state = atomic_read(&devip->stopped); memset(arr, 0, sizeof(arr)); @@ -1774,7 +1776,7 @@ static int resp_requests(struct scsi_cmnd *scp, arr[7] = 0xa; } } - return fill_from_dev_buffer(scp, arr, min_t(int, len, alloc_len)); + return fill_from_dev_buffer(scp, arr, min_t(u32, len, alloc_len)); } static int resp_start_stop(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) @@ -2312,7 +2314,8 @@ static int resp_mode_sense(struct scsi_cmnd *scp, { int pcontrol, pcode, subpcode, bd_len; unsigned char dev_spec; - int alloc_len, offset, len, target_dev_id; + u32 alloc_len, offset, len; + int target_dev_id; int target = scp->device->id; unsigned char *ap; unsigned char arr[SDEBUG_MAX_MSENSE_SZ]; @@ -2468,7 +2471,7 @@ static int resp_mode_sense(struct scsi_cmnd *scp, arr[0] = offset - 1; else put_unaligned_be16((offset - 2), arr + 0); - return fill_from_dev_buffer(scp, arr, min_t(int, alloc_len, offset)); + return fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, offset)); } #define SDEBUG_MAX_MSELECT_SZ 512 @@ -2583,7 +2586,8 @@ static int resp_ie_l_pg(unsigned char *arr) static int resp_log_sense(struct scsi_cmnd *scp, struct sdebug_dev_info *devip) { - int ppc, sp, pcode, subpcode, alloc_len, len, n; + int ppc, sp, pcode, subpcode; + u32 alloc_len, len, n; unsigned char arr[SDEBUG_MAX_LSENSE_SZ]; unsigned char *cmd = scp->cmnd; @@ -2653,9 +2657,9 @@ static int resp_log_sense(struct scsi_cmnd *scp, mk_sense_invalid_fld(scp, SDEB_IN_CDB, 3, -1); return check_condition_result; } - len = min_t(int, get_unaligned_be16(arr + 2) + 4, alloc_len); + len = min_t(u32, get_unaligned_be16(arr + 2) + 4, alloc_len); return 
fill_from_dev_buffer(scp, arr, - min_t(int, len, SDEBUG_MAX_INQ_ARR_SZ)); + min_t(u32, len, SDEBUG_MAX_INQ_ARR_SZ)); } static inline bool sdebug_dev_is_zoned(struct sdebug_dev_info *devip) @@ -4430,7 +4434,7 @@ static int resp_report_zones(struct scsi_cmnd *scp, put_unaligned_be64(sdebug_capacity - 1, arr + 8); rep_len = (unsigned long)desc - (unsigned long)arr; - ret = fill_from_dev_buffer(scp, arr, min_t(int, alloc_len, rep_len)); + ret = fill_from_dev_buffer(scp, arr, min_t(u32, alloc_len, rep_len)); fini: read_unlock(macc_lckp); From cc03facb1c4248997592fc683518c00cc257db1a Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Wed, 10 Nov 2021 10:51:33 +0000 Subject: [PATCH 032/231] scsi: ufs: ufs-mediatek: Add put_device() after of_find_device_by_node() This was found by coccicheck: ./drivers/scsi/ufs/ufs-mediatek.c, 211, 1-7, ERROR missing put_device; call of_find_device_by_node on line 1185, but without a corresponding object release within this function. Link: https://lore.kernel.org/r/20211110105133.150171-1-ye.guojin@zte.com.cn Reported-by: Zeal Robot Reviewed-by: Peter Wang Signed-off-by: Ye Guojin Signed-off-by: Martin K. 
Petersen --- drivers/scsi/ufs/ufs-mediatek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/ufs/ufs-mediatek.c b/drivers/scsi/ufs/ufs-mediatek.c index fc5b214347b3..5393b5c9dd9c 100644 --- a/drivers/scsi/ufs/ufs-mediatek.c +++ b/drivers/scsi/ufs/ufs-mediatek.c @@ -1189,6 +1189,7 @@ static int ufs_mtk_probe(struct platform_device *pdev) } link = device_link_add(dev, &reset_pdev->dev, DL_FLAG_AUTOPROBE_CONSUMER); + put_device(&reset_pdev->dev); if (!link) { dev_notice(dev, "add reset device_link fail\n"); goto skip_reset; From 0ee4ba13e09c9d9c1cb6abb59da8295d9952328b Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Wed, 17 Nov 2021 16:19:09 +0530 Subject: [PATCH 033/231] scsi: mpt3sas: Fix kernel panic during drive powercycle test While looping over shost's sdev list it is possible that one of the drives is getting removed and its sas_target object is freed but its sdev object remains intact. Consequently, a kernel panic can occur while the driver is trying to access the sas_address field of sas_target object without also checking the sas_target object for NULL. Link: https://lore.kernel.org/r/20211117104909.2069-1-sreekanth.reddy@broadcom.com Fixes: f92363d12359 ("[SCSI] mpt3sas: add new driver supporting 12GB SAS") Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpt3sas/mpt3sas_scsih.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index cee7170beae8..bb0036b41825 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -3869,7 +3869,7 @@ _scsih_ublock_io_device(struct MPT3SAS_ADAPTER *ioc, shost_for_each_device(sdev, ioc->shost) { sas_device_priv_data = sdev->hostdata; - if (!sas_device_priv_data) + if (!sas_device_priv_data || !sas_device_priv_data->sas_target) continue; if (sas_device_priv_data->sas_target->sas_address != sas_address) From 91202a01a2fb2b78da3d03811b6d3d973ae426aa Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Wed, 17 Nov 2021 16:20:58 +0530 Subject: [PATCH 034/231] scsi: mpt3sas: Fix system going into read-only mode While determining the SAS address of a drive, the driver checks whether the handle number is less than the HBA phy count or not. If the handle number is less than the HBA phy count then driver assumes that this handle belongs to HBA and hence it assigns the HBA SAS address. During IOC firmware downgrade operation, if the number of HBA phys is reduced and the OS drive's device handle drops below the phy count while determining the drive's SAS address, the driver ends up using the HBA's SAS address. This leads to a mismatch of drive's SAS address and hence the driver unregisters the OS drive and the system goes into read-only mode. Update the IOC's num_phys to the HBA phy count provided by actual loaded firmware. Link: https://lore.kernel.org/r/20211117105058.3505-1-sreekanth.reddy@broadcom.com Fixes: a5e99fda0172 ("scsi: mpt3sas: Update hba_port objects after host reset") Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.h | 4 ++ drivers/scsi/mpt3sas/mpt3sas_scsih.c | 57 +++++++++++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h index db6a759de1e9..a0af986633d2 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.h +++ b/drivers/scsi/mpt3sas/mpt3sas_base.h @@ -142,6 +142,8 @@ #define MPT_MAX_CALLBACKS 32 +#define MPT_MAX_HBA_NUM_PHYS 32 + #define INTERNAL_CMDS_COUNT 10 /* reserved cmds */ /* reserved for issuing internally framed scsi io cmds */ #define INTERNAL_SCSIIO_CMDS_COUNT 3 @@ -798,6 +800,7 @@ struct _sas_phy { * @enclosure_handle: handle for this a member of an enclosure * @device_info: bitwise defining capabilities of this sas_host/expander * @responding: used in _scsih_expander_device_mark_responding + * @nr_phys_allocated: Allocated memory for this many count phys * @phy: a list of phys that make up this sas_host/expander * @sas_port_list: list of ports attached to this sas_host/expander * @port: hba port entry containing node's port number info @@ -813,6 +816,7 @@ struct _sas_node { u16 enclosure_handle; u64 enclosure_logical_id; u8 responding; + u8 nr_phys_allocated; struct hba_port *port; struct _sas_phy *phy; struct list_head sas_port_list; diff --git a/drivers/scsi/mpt3sas/mpt3sas_scsih.c b/drivers/scsi/mpt3sas/mpt3sas_scsih.c index bb0036b41825..00792767c620 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_scsih.c +++ b/drivers/scsi/mpt3sas/mpt3sas_scsih.c @@ -6406,11 +6406,26 @@ _scsih_sas_port_refresh(struct MPT3SAS_ADAPTER *ioc) int i, j, count = 0, lcount = 0; int ret; u64 sas_addr; + u8 num_phys; drsprintk(ioc, ioc_info(ioc, "updating ports for sas_host(0x%016llx)\n", (unsigned long long)ioc->sas_hba.sas_address)); + mpt3sas_config_get_number_hba_phys(ioc, &num_phys); + if (!num_phys) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return; + } + + if (num_phys > 
ioc->sas_hba.nr_phys_allocated) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + return; + } + ioc->sas_hba.num_phys = num_phys; + port_table = kcalloc(ioc->sas_hba.num_phys, sizeof(struct hba_port), GFP_KERNEL); if (!port_table) @@ -6611,6 +6626,30 @@ _scsih_sas_host_refresh(struct MPT3SAS_ADAPTER *ioc) ioc->sas_hba.phy[i].hba_vphy = 1; } + /* + * Add new HBA phys to STL if these new phys got added as part + * of HBA Firmware upgrade/downgrade operation. + */ + if (!ioc->sas_hba.phy[i].phy) { + if ((mpt3sas_config_get_phy_pg0(ioc, &mpi_reply, + &phy_pg0, i))) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + continue; + } + ioc_status = le16_to_cpu(mpi_reply.IOCStatus) & + MPI2_IOCSTATUS_MASK; + if (ioc_status != MPI2_IOCSTATUS_SUCCESS) { + ioc_err(ioc, "failure at %s:%d/%s()!\n", + __FILE__, __LINE__, __func__); + continue; + } + ioc->sas_hba.phy[i].phy_id = i; + mpt3sas_transport_add_host_phy(ioc, + &ioc->sas_hba.phy[i], phy_pg0, + ioc->sas_hba.parent_dev); + continue; + } ioc->sas_hba.phy[i].handle = ioc->sas_hba.handle; attached_handle = le16_to_cpu(sas_iounit_pg0->PhyData[i]. AttachedDevHandle); @@ -6622,6 +6661,19 @@ _scsih_sas_host_refresh(struct MPT3SAS_ADAPTER *ioc) attached_handle, i, link_rate, ioc->sas_hba.phy[i].port); } + /* + * Clear the phy details if this phy got disabled as part of + * HBA Firmware upgrade/downgrade operation. 
+ */ + for (i = ioc->sas_hba.num_phys; + i < ioc->sas_hba.nr_phys_allocated; i++) { + if (ioc->sas_hba.phy[i].phy && + ioc->sas_hba.phy[i].phy->negotiated_linkrate >= + SAS_LINK_RATE_1_5_GBPS) + mpt3sas_transport_update_links(ioc, + ioc->sas_hba.sas_address, 0, i, + MPI2_SAS_NEG_LINK_RATE_PHY_DISABLED, NULL); + } out: kfree(sas_iounit_pg0); } @@ -6654,7 +6706,10 @@ _scsih_sas_host_add(struct MPT3SAS_ADAPTER *ioc) __FILE__, __LINE__, __func__); return; } - ioc->sas_hba.phy = kcalloc(num_phys, + + ioc->sas_hba.nr_phys_allocated = max_t(u8, + MPT_MAX_HBA_NUM_PHYS, num_phys); + ioc->sas_hba.phy = kcalloc(ioc->sas_hba.nr_phys_allocated, sizeof(struct _sas_phy), GFP_KERNEL); if (!ioc->sas_hba.phy) { ioc_err(ioc, "failure at %s:%d/%s()!\n", From 5ecae9f8c705fae85fe4d2ed9f1b9cddf91e88e9 Mon Sep 17 00:00:00 2001 From: Sreekanth Reddy Date: Wed, 17 Nov 2021 18:02:15 +0530 Subject: [PATCH 035/231] scsi: mpt3sas: Fix incorrect system timestamp For updating the IOC firmware's timestamp with system timestamp, the driver issues the Mpi26IoUnitControlRequest message. While framing the Mpi26IoUnitControlRequest, the driver should copy the lower 32 bits of the current timestamp into IOCParameterValue field and the higher 32 bits into Reserved7 field. Link: https://lore.kernel.org/r/20211117123215.25487-1-sreekanth.reddy@broadcom.com Fixes: f98790c00375 ("scsi: mpt3sas: Sync time periodically between driver and firmware") Signed-off-by: Sreekanth Reddy Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpt3sas/mpt3sas_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c index 27eb652b564f..81dab9b82f79 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_base.c +++ b/drivers/scsi/mpt3sas/mpt3sas_base.c @@ -639,8 +639,8 @@ static void _base_sync_drv_fw_timestamp(struct MPT3SAS_ADAPTER *ioc) mpi_request->IOCParameter = MPI26_SET_IOC_PARAMETER_SYNC_TIMESTAMP; current_time = ktime_get_real(); TimeStamp = ktime_to_ms(current_time); - mpi_request->Reserved7 = cpu_to_le32(TimeStamp & 0xFFFFFFFF); - mpi_request->IOCParameterValue = cpu_to_le32(TimeStamp >> 32); + mpi_request->Reserved7 = cpu_to_le32(TimeStamp >> 32); + mpi_request->IOCParameterValue = cpu_to_le32(TimeStamp & 0xFFFFFFFF); init_completion(&ioc->scsih_cmds.done); ioc->put_smid_default(ioc, smid); dinitprintk(ioc, ioc_info(ioc, From e2a49a95b571d9d208f28a03d63353374e724f13 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 17 Nov 2021 15:39:28 -0600 Subject: [PATCH 036/231] scsi: target: core: Use RCU helpers for INQUIRY t10_alua_tg_pt_gp Fix the sparse warnings about t10_alua_tg_pt_gp accesses in target_core_spc.c caused by commit 7324f47d4293 ("scsi: target: Replace lun_tg_pt_gp_lock with rcu in I/O path") That commit replaced the lun_tg_pt_gp_lock use in the I/O path, but it didn't update the INQUIRY code. Link: https://lore.kernel.org/r/20211117213928.8634-1-michael.christie@oracle.com Reported-by: kernel test robot Reviewed-by: Maurizio Lombardi Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen --- drivers/target/target_core_spc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/target/target_core_spc.c b/drivers/target/target_core_spc.c index 22703a0dbd07..4c76498d3fb0 100644 --- a/drivers/target/target_core_spc.c +++ b/drivers/target/target_core_spc.c @@ -40,11 +40,11 @@ static void spc_fill_alua_data(struct se_lun *lun, unsigned char *buf) * * See spc4r17 section 6.4.2 Table 135 */ - spin_lock(&lun->lun_tg_pt_gp_lock); - tg_pt_gp = lun->lun_tg_pt_gp; + rcu_read_lock(); + tg_pt_gp = rcu_dereference(lun->lun_tg_pt_gp); if (tg_pt_gp) buf[5] |= tg_pt_gp->tg_pt_gp_alua_access_type; - spin_unlock(&lun->lun_tg_pt_gp_lock); + rcu_read_unlock(); } static u16 @@ -325,14 +325,14 @@ check_t10_vend_desc: * Get the PROTOCOL IDENTIFIER as defined by spc4r17 * section 7.5.1 Table 362 */ - spin_lock(&lun->lun_tg_pt_gp_lock); - tg_pt_gp = lun->lun_tg_pt_gp; + rcu_read_lock(); + tg_pt_gp = rcu_dereference(lun->lun_tg_pt_gp); if (!tg_pt_gp) { - spin_unlock(&lun->lun_tg_pt_gp_lock); + rcu_read_unlock(); goto check_lu_gp; } tg_pt_gp_id = tg_pt_gp->tg_pt_gp_id; - spin_unlock(&lun->lun_tg_pt_gp_lock); + rcu_read_unlock(); buf[off] = tpg->proto_id << 4; buf[off++] |= 0x1; /* CODE SET == Binary */ From 9c6603e1faf880bada541e9cce6514d2f3248da0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 18 Nov 2021 11:49:00 +0300 Subject: [PATCH 037/231] scsi: target: configfs: Delete unnecessary checks for NULL The "item" pointer is always going to be valid pointer and does not need to be checked. But if "item" were NULL then item_to_lun() would not return a NULL, but instead, the container_of() pointer math would return a value in the error pointer range. This confuses static checkers since it looks like a NULL vs IS_ERR() bug. Delete the bogus checks. Link: https://lore.kernel.org/r/20211118084900.GA24550@kili Signed-off-by: Dan Carpenter Signed-off-by: Martin K. 
Petersen --- drivers/target/target_core_fabric_configfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 0b65de9f2df1..95a88f6224cd 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -520,7 +520,7 @@ static ssize_t target_fabric_port_alua_tg_pt_gp_show(struct config_item *item, { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_show_tg_pt_gp_info(lun, page); @@ -531,7 +531,7 @@ static ssize_t target_fabric_port_alua_tg_pt_gp_store(struct config_item *item, { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_store_tg_pt_gp_info(lun, page, count); @@ -542,7 +542,7 @@ static ssize_t target_fabric_port_alua_tg_pt_offline_show( { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_show_offline_bit(lun, page); @@ -553,7 +553,7 @@ static ssize_t target_fabric_port_alua_tg_pt_offline_store( { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_store_offline_bit(lun, page, count); @@ -564,7 +564,7 @@ static ssize_t target_fabric_port_alua_tg_pt_status_show( { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_show_secondary_status(lun, page); @@ -575,7 +575,7 @@ static ssize_t target_fabric_port_alua_tg_pt_status_store( { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_store_secondary_status(lun, page, count); @@ -586,7 +586,7 @@ static ssize_t target_fabric_port_alua_tg_pt_write_md_show( { struct se_lun *lun = 
item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_show_secondary_write_metadata(lun, page); @@ -597,7 +597,7 @@ static ssize_t target_fabric_port_alua_tg_pt_write_md_store( { struct se_lun *lun = item_to_lun(item); - if (!lun || !lun->lun_se_dev) + if (!lun->lun_se_dev) return -ENODEV; return core_alua_store_secondary_write_metadata(lun, page, count); From d8af404ffce71448f29bbc19a05e3d095baf98eb Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 17 Nov 2021 17:59:01 -0800 Subject: [PATCH 038/231] iomap: Fix inline extent handling in iomap_readpage Before commit 740499c78408 ("iomap: fix the iomap_readpage_actor return value for inline data"), when hitting an IOMAP_INLINE extent, iomap_readpage_actor would report having read the entire page. Since then, it only reports having read the inline data (iomap->length). This will force iomap_readpage into another iteration, and the filesystem will report an unaligned hole after the IOMAP_INLINE extent. But iomap_readpage_actor (now iomap_readpage_iter) isn't prepared to deal with unaligned extents, it will get things wrong on filesystems with a block size smaller than the page size, and we'll eventually run into the following warning in iomap_iter_advance: WARN_ON_ONCE(iter->processed > iomap_length(iter)); Fix that by changing iomap_readpage_iter to return 0 when hitting an inline extent; this will cause iomap_iter to stop immediately. To fix readahead as well, change iomap_readahead_iter to pass on iomap_readpage_iter return values less than or equal to zero. Fixes: 740499c78408 ("iomap: fix the iomap_readpage_actor return value for inline data") Cc: stable@vger.kernel.org # v5.15+ Signed-off-by: Andreas Gruenbacher Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/buffered-io.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 1753c26c8e76..fe10d8a30f6b 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -256,8 +256,13 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) - return min(iomap_read_inline_data(iter, page), length); + if (iomap->type == IOMAP_INLINE) { + loff_t ret = iomap_read_inline_data(iter, page); + + if (ret < 0) + return ret; + return 0; + } /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(iter->inode, page); @@ -370,6 +375,8 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_page_in_bio = false; } ret = iomap_readpage_iter(iter, ctx, done); + if (ret <= 0) + return ret; } return done; From 756e1fc16505c31c9f86b602fcb8e2bc55c4b7e5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 4 Nov 2021 16:41:06 +0000 Subject: [PATCH 039/231] KVM: RISC-V: Unmap stage2 mapping when deleting/moving a memslot Unmap stage2 page tables when a memslot is being deleted or moved. It's the architectures' responsibility to ensure existing mappings are removed when kvm_arch_flush_shadow_memslot() returns. 
Fixes: 9d05c1fee837 ("RISC-V: KVM: Implement stage2 page table programming") Signed-off-by: Sean Christopherson Signed-off-by: Anup Patel --- arch/riscv/kvm/mmu.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index d81bae8eb55e..fc058ff5f4b6 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -453,6 +453,12 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + gpa_t gpa = slot->base_gfn << PAGE_SHIFT; + phys_addr_t size = slot->npages << PAGE_SHIFT; + + spin_lock(&kvm->mmu_lock); + stage2_unmap_range(kvm, gpa, size, false); + spin_unlock(&kvm->mmu_lock); } void kvm_arch_commit_memory_region(struct kvm *kvm, From 74c2e97b01846eb237b7819a3e2944455cfdb26a Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 17 Nov 2021 10:30:29 +0530 Subject: [PATCH 040/231] RISC-V: KVM: Fix incorrect KVM_MAX_VCPUS value The KVM_MAX_VCPUS value is supposed to be aligned with number of VMID bits in the hgatp CSR but the current KVM_MAX_VCPUS value is aligned with number of ASID bits in the satp CSR. 
Fixes: 99cdc6c18c2d ("RISC-V: Add initial skeletal KVM support") Signed-off-by: Anup Patel Reviewed-by: Atish Patra --- arch/riscv/include/asm/kvm_host.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 25ba21f98504..2639b9ee48f9 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -12,14 +12,12 @@ #include #include #include +#include #include #include -#ifdef CONFIG_64BIT -#define KVM_MAX_VCPUS (1U << 16) -#else -#define KVM_MAX_VCPUS (1U << 9) -#endif +#define KVM_MAX_VCPUS \ + ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1) #define KVM_HALT_POLL_NS_DEFAULT 500000 From 28c916ade1bd4205958f74bb817fd3a05dbb7afc Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 18 Nov 2021 16:30:14 +0100 Subject: [PATCH 041/231] ASoC: soc-acpi: Set mach->id field on comp_ids matches Commit dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") and commit 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") simplified the match tables in soc-acpi-intel-byt-match.c and soc-acpi-intel-cht-match.c by merging identical entries using the new .comp_ids snd_soc_acpi_mach field to point a single entry to multiple ACPI HIDs and clearing the previously unique per entry .id field. But various machine drivers from sound/soc/intel/boards rely on mach->id in one or more ways, e.g. some drivers contain the following snippets: adev = acpi_dev_get_first_match_dev(mach->id, NULL, -1); pkg_found = snd_soc_acpi_find_package_from_hid(mach->id, ... if (!strncmp(snd_soc_cards[i].codec_id, mach->id, 8)) { ... All of which are broken by the match table shrinking. Make the snd_soc_acpi_mach.id field non const (the storage for the tables already is non const) and on a comp_ids match copy the matching HID to the id field to fix this. 
Fixes: dac7cbd55dca ("ASoC: Intel: soc-acpi-byt: shrink tables using compatible IDs") Fixes: 959ae8215a9e ("ASoC: Intel: soc-acpi-cht: shrink tables using compatible IDs") Suggested-by: Pierre-Louis Bossart Cc: Pierre-Louis Bossart Cc: Brent Lu Signed-off-by: Hans de Goede Reviewed-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20211118153014.349222-1-hdegoede@redhat.com Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 2 +- sound/soc/soc-acpi.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index 31f4c4f9aeea..ac0893df9c76 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -147,7 +147,7 @@ struct snd_soc_acpi_link_adr { */ /* Descriptor for SST ASoC machine driver */ struct snd_soc_acpi_mach { - const u8 id[ACPI_ID_LEN]; + u8 id[ACPI_ID_LEN]; const struct snd_soc_acpi_codecs *comp_ids; const u32 link_mask; const struct snd_soc_acpi_link_adr *links; diff --git a/sound/soc/soc-acpi.c b/sound/soc/soc-acpi.c index 2ae99b49d3f5..cbd7ea48837b 100644 --- a/sound/soc/soc-acpi.c +++ b/sound/soc/soc-acpi.c @@ -20,8 +20,10 @@ static bool snd_soc_acpi_id_present(struct snd_soc_acpi_mach *machine) if (comp_ids) { for (i = 0; i < comp_ids->num_codecs; i++) { - if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) + if (acpi_dev_present(comp_ids->codecs[i], NULL, -1)) { + strscpy(machine->id, comp_ids->codecs[i], ACPI_ID_LEN); return true; + } } } From 428ee30a05cd1362c8aa86a4c909b0d1c6bc48a4 Mon Sep 17 00:00:00 2001 From: Nicolas Frattaroli Date: Sun, 21 Nov 2021 16:05:20 +0100 Subject: [PATCH 042/231] ASoC: rk817: Add module alias for rk817-codec Without a module alias, autoloading the driver does not occur when it is built as a module. By adding a module alias, the driver now probes fine automatically and therefore analog audio output works as it should. 
Fixes: 0d6a04da9b25 ("ASoC: Add Rockchip rk817 audio CODEC support") Signed-off-by: Nicolas Frattaroli Link: https://lore.kernel.org/r/20211121150521.159543-1-frattaroli.nicolas@gmail.com Signed-off-by: Mark Brown --- sound/soc/codecs/rk817_codec.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/rk817_codec.c b/sound/soc/codecs/rk817_codec.c index 943d7d933e81..03f24edfe4f6 100644 --- a/sound/soc/codecs/rk817_codec.c +++ b/sound/soc/codecs/rk817_codec.c @@ -539,3 +539,4 @@ module_platform_driver(rk817_codec_driver); MODULE_DESCRIPTION("ASoC RK817 codec driver"); MODULE_AUTHOR("binyuan "); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:rk817-codec"); From fe785f56ad5886c08d1cadd9e8b4e1ff6a1866f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 18 Nov 2021 15:21:02 +0100 Subject: [PATCH 043/231] iwlwifi: pcie: fix constant-conversion warning Both gcc-11 and clang point out a potential issue with integer overflow when the iwl_dev_info_table[] array is empty. This is what clang warns: drivers/net/wireless/intel/iwlwifi/pcie/drv.c:1344:42: error: implicit conversion from 'unsigned long' to 'int' changes value from 18446744073709551615 to -1 [-Werror,-Wconstant-conversion] for (i = ARRAY_SIZE(iwl_dev_info_table) - 1; i >= 0; i--) { ~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~ This is still harmless, as the loop correctly terminates, but adding an extra range check makes that obvious to both readers and to the compiler. 
Fixes: 3f7320428fa4 ("iwlwifi: pcie: simplify iwl_pci_find_dev_info()") Reported-by: kernel test robot Cc: Nick Desaulniers Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211118142124.526901-1-arnd@kernel.org --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index c574f041f096..395e328c6a07 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1339,9 +1339,13 @@ iwl_pci_find_dev_info(u16 device, u16 subsystem_device, u16 mac_type, u8 mac_step, u16 rf_type, u8 cdb, u8 rf_id, u8 no_160, u8 cores) { + int num_devices = ARRAY_SIZE(iwl_dev_info_table); int i; - for (i = ARRAY_SIZE(iwl_dev_info_table) - 1; i >= 0; i--) { + if (!num_devices) + return NULL; + + for (i = num_devices - 1; i >= 0; i--) { const struct iwl_dev_info *dev_info = &iwl_dev_info_table[i]; if (dev_info->device != (u16)IWL_CFG_ANY && From 1b54403c9cc444b6e0ade1f441efdf1270877ace Mon Sep 17 00:00:00 2001 From: chongjiapeng Date: Tue, 2 Nov 2021 15:38:47 +0800 Subject: [PATCH 044/231] iwlwifi: Fix missing error code in iwl_pci_probe() The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'ret'. Eliminate the following smatch warning: drivers/net/wireless/intel/iwlwifi/pcie/drv.c:1376 iwl_pci_probe() warn: missing error code 'ret'. 
Reported-by: Abaci Robot Fixes: 1f171f4f1437 ("iwlwifi: Add support for getting rf id with blank otp") Signed-off-by: chongjiapeng Acked-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1635838727-128735-1-git-send-email-jiapeng.chong@linux.alibaba.com --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 395e328c6a07..5ce07f28e7c3 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -1446,8 +1446,10 @@ static int iwl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) */ if (iwl_trans->trans_cfg->rf_id && iwl_trans->trans_cfg->device_family >= IWL_DEVICE_FAMILY_9000 && - !CSR_HW_RFID_TYPE(iwl_trans->hw_rf_id) && get_crf_id(iwl_trans)) + !CSR_HW_RFID_TYPE(iwl_trans->hw_rf_id) && get_crf_id(iwl_trans)) { + ret = -EINVAL; goto out_free_trans; + } dev_info = iwl_pci_find_dev_info(pdev->device, pdev->subsystem_device, CSR_HW_REV_TYPE(iwl_trans->hw_rev), From 5283dd677e52af9db6fe6ad11b2f12220d519d0c Mon Sep 17 00:00:00 2001 From: Mordechay Goodstein Date: Wed, 10 Nov 2021 15:01:59 +0200 Subject: [PATCH 045/231] iwlwifi: mvm: retry init flow if failed In some very rare cases the init flow may fail. In many cases, this is recoverable, so we can retry. Implement a loop to retry two more times after the first attempt failed. This can happen in two different situations, namely during probe and during mac80211 start. For the first case, a simple loop is enough. For the second case, we need to add a flag to prevent mac80211 from trying to restart it as well, leaving full control with the driver. 
Cc: Signed-off-by: Mordechay Goodstein Signed-off-by: Luca Coelho Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/iwlwifi.20211110150132.57514296ecab.I52a0411774b700bdc7dedb124d8b59bf99456eb2@changeid --- drivers/net/wireless/intel/iwlwifi/iwl-drv.c | 22 +++++++++++------ drivers/net/wireless/intel/iwlwifi/iwl-drv.h | 3 +++ .../net/wireless/intel/iwlwifi/mvm/mac80211.c | 24 ++++++++++++++++++- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 3 +++ drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 3 +++ 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c index 36196e07b1a0..5cec467b995b 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.c +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.c @@ -1313,23 +1313,31 @@ _iwl_op_mode_start(struct iwl_drv *drv, struct iwlwifi_opmode_table *op) const struct iwl_op_mode_ops *ops = op->ops; struct dentry *dbgfs_dir = NULL; struct iwl_op_mode *op_mode = NULL; + int retry, max_retry = !!iwlwifi_mod_params.fw_restart * IWL_MAX_INIT_RETRY; + + for (retry = 0; retry <= max_retry; retry++) { #ifdef CONFIG_IWLWIFI_DEBUGFS - drv->dbgfs_op_mode = debugfs_create_dir(op->name, - drv->dbgfs_drv); - dbgfs_dir = drv->dbgfs_op_mode; + drv->dbgfs_op_mode = debugfs_create_dir(op->name, + drv->dbgfs_drv); + dbgfs_dir = drv->dbgfs_op_mode; #endif - op_mode = ops->start(drv->trans, drv->trans->cfg, &drv->fw, dbgfs_dir); + op_mode = ops->start(drv->trans, drv->trans->cfg, + &drv->fw, dbgfs_dir); + + if (op_mode) + return op_mode; + + IWL_ERR(drv, "retry init count %d\n", retry); #ifdef CONFIG_IWLWIFI_DEBUGFS - if (!op_mode) { debugfs_remove_recursive(drv->dbgfs_op_mode); drv->dbgfs_op_mode = NULL; - } #endif + } - return op_mode; + return NULL; } static void _iwl_op_mode_stop(struct iwl_drv *drv) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-drv.h b/drivers/net/wireless/intel/iwlwifi/iwl-drv.h index 2e2d60a58692..0fd009e6d685 100644 
--- a/drivers/net/wireless/intel/iwlwifi/iwl-drv.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-drv.h @@ -89,4 +89,7 @@ void iwl_drv_stop(struct iwl_drv *drv); #define IWL_EXPORT_SYMBOL(sym) #endif +/* max retry for init flow */ +#define IWL_MAX_INIT_RETRY 2 + #endif /* __iwl_drv_h__ */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 9fb9c7dad314..897e3b91ddb2 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -16,6 +16,7 @@ #include #include +#include "iwl-drv.h" #include "iwl-op-mode.h" #include "iwl-io.h" #include "mvm.h" @@ -1117,9 +1118,30 @@ static int iwl_mvm_mac_start(struct ieee80211_hw *hw) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); int ret; + int retry, max_retry = 0; mutex_lock(&mvm->mutex); - ret = __iwl_mvm_mac_start(mvm); + + /* we are starting the mac not in error flow, and restart is enabled */ + if (!test_bit(IWL_MVM_STATUS_HW_RESTART_REQUESTED, &mvm->status) && + iwlwifi_mod_params.fw_restart) { + max_retry = IWL_MAX_INIT_RETRY; + /* + * This will prevent mac80211 recovery flows to trigger during + * init failures + */ + set_bit(IWL_MVM_STATUS_STARTING, &mvm->status); + } + + for (retry = 0; retry <= max_retry; retry++) { + ret = __iwl_mvm_mac_start(mvm); + if (!ret) + break; + + IWL_ERR(mvm, "mac start retry %d\n", retry); + } + clear_bit(IWL_MVM_STATUS_STARTING, &mvm->status); + mutex_unlock(&mvm->mutex); return ret; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index 2b1dcd60e00f..a72d85086fe3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -1123,6 +1123,8 @@ struct iwl_mvm { * @IWL_MVM_STATUS_FIRMWARE_RUNNING: firmware is running * @IWL_MVM_STATUS_NEED_FLUSH_P2P: need to flush P2P bcast STA * @IWL_MVM_STATUS_IN_D3: in D3 (or at least about to go into it) + * 
@IWL_MVM_STATUS_STARTING: starting mac, + * used to disable restart flow while in STARTING state */ enum iwl_mvm_status { IWL_MVM_STATUS_HW_RFKILL, @@ -1134,6 +1136,7 @@ enum iwl_mvm_status { IWL_MVM_STATUS_FIRMWARE_RUNNING, IWL_MVM_STATUS_NEED_FLUSH_P2P, IWL_MVM_STATUS_IN_D3, + IWL_MVM_STATUS_STARTING, }; /* Keep track of completed init configuration */ diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 232ad531d612..ce7160670aa7 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -1600,6 +1600,9 @@ void iwl_mvm_nic_restart(struct iwl_mvm *mvm, bool fw_error) */ if (!mvm->fw_restart && fw_error) { iwl_fw_error_collect(&mvm->fwrt, false); + } else if (test_bit(IWL_MVM_STATUS_STARTING, + &mvm->status)) { + IWL_ERR(mvm, "Starting mac, retry will be triggered anyway\n"); } else if (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status)) { struct iwl_mvm_reprobe *reprobe; From f5cecf1d4c5ff76172928bc32e99ca56a5ca2f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Bartosik?= Date: Wed, 10 Nov 2021 22:57:44 +0100 Subject: [PATCH 046/231] iwlwifi: fix warnings produced by kernel debug options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix warnings produced by: - lockdep_assert_wiphy() in function reg_process_self_managed_hint(), - wiphy_dereference() in function iwl_mvm_init_fw_regd(). Both function are expected to be called in critical section. The warnings were discovered when running v5.15 kernel with debug options enabled: 1) Hardware name: Google Delbin/Delbin RIP: 0010:reg_process_self_managed_hint+0x254/0x347 [cfg80211] ... Call Trace: regulatory_set_wiphy_regd_sync+0x3d/0xb0 iwl_mvm_init_mcc+0x49d/0x5a2 iwl_op_mode_mvm_start+0x1b58/0x2507 ? 
iwl_mvm_reprobe_wk+0x94/0x94 _iwl_op_mode_start+0x146/0x1a3 iwl_opmode_register+0xda/0x13d init_module+0x28/0x1000 2) drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c:263 suspicious rcu_dereference_protected() usage! ... Hardware name: Google Delbin/Delbin, BIOS Google_Delbin Call Trace: dump_stack_lvl+0xb1/0xe6 iwl_mvm_init_fw_regd+0x2e7/0x379 iwl_mvm_init_mcc+0x2c6/0x5a2 iwl_op_mode_mvm_start+0x1b58/0x2507 ? iwl_mvm_reprobe_wk+0x94/0x94 _iwl_op_mode_start+0x146/0x1a3 iwl_opmode_register+0xda/0x13d init_module+0x28/0x100 Fixes: a05829a7222e ("cfg80211: avoid holding the RTNL when calling the driver") Signed-off-by: Łukasz Bartosik Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211110215744.5487-1-lukasz.bartosik@semihalf.com --- drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index ce7160670aa7..cd08e289cd9a 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -686,6 +686,7 @@ static int iwl_mvm_start_get_nvm(struct iwl_mvm *mvm) int ret; rtnl_lock(); + wiphy_lock(mvm->hw->wiphy); mutex_lock(&mvm->mutex); ret = iwl_run_init_mvm_ucode(mvm); @@ -701,6 +702,7 @@ static int iwl_mvm_start_get_nvm(struct iwl_mvm *mvm) iwl_mvm_stop_device(mvm); mutex_unlock(&mvm->mutex); + wiphy_unlock(mvm->hw->wiphy); rtnl_unlock(); if (ret < 0) From a571bc28326d9f3e13f5f2d9cda2883e0631b0ce Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 11 Nov 2021 08:23:11 +0100 Subject: [PATCH 047/231] iwlwifi: Fix memory leaks in error handling path Should an error occur (invalid TLV len or memory allocation failure), the memory already allocated in 'reduce_power_data' should be freed before returning, otherwise it is leaking. 
Fixes: 9dad325f9d57 ("iwlwifi: support loading the reduced power table from UEFI") Signed-off-by: Christophe JAILLET Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/1504cd7d842d13ddb8244e18004523128d5c9523.1636615284.git.christophe.jaillet@wanadoo.fr --- drivers/net/wireless/intel/iwlwifi/fw/uefi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/fw/uefi.c b/drivers/net/wireless/intel/iwlwifi/fw/uefi.c index c875bf35533c..009dd4be597b 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/uefi.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/uefi.c @@ -86,6 +86,7 @@ static void *iwl_uefi_reduce_power_section(struct iwl_trans *trans, if (len < tlv_len) { IWL_ERR(trans, "invalid TLV len: %zd/%u\n", len, tlv_len); + kfree(reduce_power_data); reduce_power_data = ERR_PTR(-EINVAL); goto out; } @@ -105,6 +106,7 @@ static void *iwl_uefi_reduce_power_section(struct iwl_trans *trans, IWL_DEBUG_FW(trans, "Couldn't allocate (more) reduce_power_data\n"); + kfree(reduce_power_data); reduce_power_data = ERR_PTR(-ENOMEM); goto out; } @@ -134,6 +136,10 @@ static void *iwl_uefi_reduce_power_section(struct iwl_trans *trans, done: if (!size) { IWL_DEBUG_FW(trans, "Empty REDUCE_POWER, skipping.\n"); + /* Better safe than sorry, but 'reduce_power_data' should + * always be NULL if !size. + */ + kfree(reduce_power_data); reduce_power_data = ERR_PTR(-ENOENT); goto out; } From 5737b4515deea0829c138ab5201160345ec67d49 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Fri, 19 Nov 2021 13:45:10 +0800 Subject: [PATCH 048/231] rtw89: update partition size of firmware header on skb->data The partition size is used to tell hardware the size of piece we are going to send a firmware. The old code updates the size in constant buffer of firmware, and leads system crash. To fix this, update the size on skb->data after we copy the firmware data into skb. 
Buglink: https://bugzilla.opensuse.org/show_bug.cgi?id=1188303 Fixes: e3ec7017f6a2 ("rtw89: add Realtek 802.11ax driver") Reported-by: Takashi Iwai Signed-off-by: Ping-Ke Shih Tested-by: Takashi Iwai Tested-by: Larry Finger Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211119054512.10620-2-pkshih@realtek.com --- drivers/net/wireless/realtek/rtw89/fw.c | 2 +- drivers/net/wireless/realtek/rtw89/fw.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index 212aaf577d3c..65ef3dc9d061 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -91,7 +91,6 @@ static int rtw89_fw_hdr_parser(struct rtw89_dev *rtwdev, const u8 *fw, u32 len, info->section_num = GET_FW_HDR_SEC_NUM(fw); info->hdr_len = RTW89_FW_HDR_SIZE + info->section_num * RTW89_FW_SECTION_HDR_SIZE; - SET_FW_HDR_PART_SIZE(fw, FWDL_SECTION_PER_PKT_LEN); bin = fw + info->hdr_len; @@ -275,6 +274,7 @@ static int __rtw89_fw_download_hdr(struct rtw89_dev *rtwdev, const u8 *fw, u32 l } skb_put_data(skb, fw, len); + SET_FW_HDR_PART_SIZE(skb->data, FWDL_SECTION_PER_PKT_LEN); rtw89_h2c_pkt_set_hdr_fwdl(rtwdev, skb, FWCMD_TYPE_H2C, H2C_CAT_MAC, H2C_CL_MAC_FWDL, H2C_FUNC_MAC_FWHDR_DL, len); diff --git a/drivers/net/wireless/realtek/rtw89/fw.h b/drivers/net/wireless/realtek/rtw89/fw.h index 7ee0d9323310..36e8d0da6c1e 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.h +++ b/drivers/net/wireless/realtek/rtw89/fw.h @@ -282,8 +282,10 @@ struct rtw89_h2creg_sch_tx_en { le32_get_bits(*((__le32 *)(fwhdr) + 6), GENMASK(15, 8)) #define GET_FW_HDR_CMD_VERSERION(fwhdr) \ le32_get_bits(*((__le32 *)(fwhdr) + 7), GENMASK(31, 24)) -#define SET_FW_HDR_PART_SIZE(fwhdr, val) \ - le32p_replace_bits((__le32 *)(fwhdr) + 7, val, GENMASK(15, 0)) +static inline void SET_FW_HDR_PART_SIZE(void *fwhdr, u32 val) +{ + le32p_replace_bits((__le32 *)fwhdr + 7, val, GENMASK(15, 0)); +} #define 
SET_CTRL_INFO_MACID(table, val) \ le32p_replace_bits((__le32 *)(table) + 0, val, GENMASK(6, 0)) From 6e53d6d26920d5221d3f4d4f5ffdd629ea69aa5c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 18 Nov 2021 13:47:48 +0100 Subject: [PATCH 049/231] mt76: mt7915: fix NULL pointer dereference in mt7915_get_phy_mode Fix the following NULL pointer dereference in mt7915_get_phy_mode routine adding an ibss interface to the mt7915 driver. [ 101.137097] wlan0: Trigger new scan to find an IBSS to join [ 102.827039] wlan0: Creating new IBSS network, BSSID 26:a4:50:1a:6e:69 [ 103.064756] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 [ 103.073670] Mem abort info: [ 103.076520] ESR = 0x96000005 [ 103.079614] EC = 0x25: DABT (current EL), IL = 32 bits [ 103.084934] SET = 0, FnV = 0 [ 103.088042] EA = 0, S1PTW = 0 [ 103.091215] Data abort info: [ 103.094104] ISV = 0, ISS = 0x00000005 [ 103.098041] CM = 0, WnR = 0 [ 103.101044] user pgtable: 4k pages, 39-bit VAs, pgdp=00000000460b1000 [ 103.107565] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 [ 103.116590] Internal error: Oops: 96000005 [#1] SMP [ 103.189066] CPU: 1 PID: 333 Comm: kworker/u4:3 Not tainted 5.10.75 #0 [ 103.195498] Hardware name: MediaTek MT7622 RFB1 board (DT) [ 103.201124] Workqueue: phy0 ieee80211_iface_work [mac80211] [ 103.206695] pstate: 20000005 (nzCv daif -PAN -UAO -TCO BTYPE=--) [ 103.212705] pc : mt7915_get_phy_mode+0x68/0x120 [mt7915e] [ 103.218103] lr : mt7915_mcu_add_bss_info+0x11c/0x760 [mt7915e] [ 103.223927] sp : ffffffc011cdb9e0 [ 103.227235] x29: ffffffc011cdb9e0 x28: ffffff8006563098 [ 103.232545] x27: ffffff8005f4da22 x26: ffffff800685ac40 [ 103.237855] x25: 0000000000000001 x24: 000000000000011f [ 103.243165] x23: ffffff8005f4e260 x22: ffffff8006567918 [ 103.248475] x21: ffffff8005f4df80 x20: ffffff800685ac58 [ 103.253785] x19: ffffff8006744400 x18: 0000000000000000 [ 103.259094] x17: 0000000000000000 x16: 
0000000000000001 [ 103.264403] x15: 000899c3a2d9d2e4 x14: 000899bdc3c3a1c8 [ 103.269713] x13: 0000000000000000 x12: 0000000000000000 [ 103.275024] x11: ffffffc010e30c20 x10: 0000000000000000 [ 103.280333] x9 : 0000000000000050 x8 : ffffff8006567d88 [ 103.285642] x7 : ffffff8006563b5c x6 : ffffff8006563b44 [ 103.290952] x5 : 0000000000000002 x4 : 0000000000000001 [ 103.296262] x3 : 0000000000000001 x2 : 0000000000000001 [ 103.301572] x1 : 0000000000000000 x0 : 0000000000000011 [ 103.306882] Call trace: [ 103.309328] mt7915_get_phy_mode+0x68/0x120 [mt7915e] [ 103.314378] mt7915_bss_info_changed+0x198/0x200 [mt7915e] [ 103.319941] ieee80211_bss_info_change_notify+0x128/0x290 [mac80211] [ 103.326360] __ieee80211_sta_join_ibss+0x308/0x6c4 [mac80211] [ 103.332171] ieee80211_sta_create_ibss+0x8c/0x10c [mac80211] [ 103.337895] ieee80211_ibss_work+0x3dc/0x614 [mac80211] [ 103.343185] ieee80211_iface_work+0x388/0x3f0 [mac80211] [ 103.348495] process_one_work+0x288/0x690 [ 103.352499] worker_thread+0x70/0x464 [ 103.356157] kthread+0x144/0x150 [ 103.359380] ret_from_fork+0x10/0x18 [ 103.362952] Code: 394008c3 52800220 394000e4 7100007f (39400023) Fixes: 37f4ca907c46 ("mt76: mt7915: register per-phy HE capabilities for each interface") Fixes: e57b7901469f ("mt76: add mac80211 driver for MT7915 PCIe-based chipsets") Signed-off-by: Lorenzo Bianconi Acked-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/ddae419a740f1fb9e48afd432035e9f394f512ee.1637239456.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/mt7915/mcu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c index 899957b9d0f1..852d5d97c70b 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mcu.c @@ -176,7 +176,7 @@ mt7915_get_phy_mode(struct ieee80211_vif *vif, struct ieee80211_sta *sta) if 
(ht_cap->ht_supported) mode |= PHY_MODE_GN; - if (he_cap->has_he) + if (he_cap && he_cap->has_he) mode |= PHY_MODE_AX_24G; } else if (band == NL80211_BAND_5GHZ) { mode |= PHY_MODE_A; @@ -187,7 +187,7 @@ mt7915_get_phy_mode(struct ieee80211_vif *vif, struct ieee80211_sta *sta) if (vht_cap->vht_supported) mode |= PHY_MODE_AC; - if (he_cap->has_he) + if (he_cap && he_cap->has_he) mode |= PHY_MODE_AX_5G; } From 064a91771f7aae4ea2d13033b64e921951d216ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 23 Nov 2021 09:40:35 +1100 Subject: [PATCH 050/231] SUNRPC: use different lock keys for INET6 and LOCAL xprtsock.c reclassifies sock locks based on the protocol. However there are 3 protocols and only 2 classification keys. The same key is used for both INET6 and LOCAL. This causes lockdep complaints. The complaints started since Commit ea9afca88bbe ("SUNRPC: Replace use of socket sk_callback_lock with sock_lock") which resulted in the sock locks being used more. So add another key, and renumber them slightly. 
Fixes: ea9afca88bbe ("SUNRPC: Replace use of socket sk_callback_lock with sock_lock") Fixes: 176e21ee2ec8 ("SUNRPC: Support for RPC over AF_LOCAL transports") Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index ae48c9c84ee1..d8ee06a9650a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1720,15 +1720,15 @@ static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port) } #ifdef CONFIG_DEBUG_LOCK_ALLOC -static struct lock_class_key xs_key[2]; -static struct lock_class_key xs_slock_key[2]; +static struct lock_class_key xs_key[3]; +static struct lock_class_key xs_slock_key[3]; static inline void xs_reclassify_socketu(struct socket *sock) { struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_LOCAL-RPC", - &xs_slock_key[1], "sk_lock-AF_LOCAL-RPC", &xs_key[1]); + &xs_slock_key[0], "sk_lock-AF_LOCAL-RPC", &xs_key[0]); } static inline void xs_reclassify_socket4(struct socket *sock) @@ -1736,7 +1736,7 @@ static inline void xs_reclassify_socket4(struct socket *sock) struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET-RPC", - &xs_slock_key[0], "sk_lock-AF_INET-RPC", &xs_key[0]); + &xs_slock_key[1], "sk_lock-AF_INET-RPC", &xs_key[1]); } static inline void xs_reclassify_socket6(struct socket *sock) @@ -1744,7 +1744,7 @@ static inline void xs_reclassify_socket6(struct socket *sock) struct sock *sk = sock->sk; sock_lock_init_class_and_name(sk, "slock-AF_INET6-RPC", - &xs_slock_key[1], "sk_lock-AF_INET6-RPC", &xs_key[1]); + &xs_slock_key[2], "sk_lock-AF_INET6-RPC", &xs_key[2]); } static inline void xs_reclassify_socket(int family, struct socket *sock) From e0a2c28da11e2c2b963fc01d50acbf03045ac732 Mon Sep 17 00:00:00 2001 From: George Kennedy Date: Thu, 18 Nov 2021 14:03:28 -0500 Subject: [PATCH 051/231] scsi: scsi_debug: Sanity check block descriptor 
length in resp_mode_select() In resp_mode_select() sanity check the block descriptor len to avoid UAF. BUG: KASAN: use-after-free in resp_mode_select+0xa4c/0xb40 drivers/scsi/scsi_debug.c:2509 Read of size 1 at addr ffff888026670f50 by task scsicmd/15032 CPU: 1 PID: 15032 Comm: scsicmd Not tainted 5.15.0-01d0625 #15 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Call Trace: dump_stack_lvl+0x89/0xb5 lib/dump_stack.c:107 print_address_description.constprop.9+0x28/0x160 mm/kasan/report.c:257 kasan_report.cold.14+0x7d/0x117 mm/kasan/report.c:443 __asan_report_load1_noabort+0x14/0x20 mm/kasan/report_generic.c:306 resp_mode_select+0xa4c/0xb40 drivers/scsi/scsi_debug.c:2509 schedule_resp+0x4af/0x1a10 drivers/scsi/scsi_debug.c:5483 scsi_debug_queuecommand+0x8c9/0x1e70 drivers/scsi/scsi_debug.c:7537 scsi_queue_rq+0x16b4/0x2d10 drivers/scsi/scsi_lib.c:1521 blk_mq_dispatch_rq_list+0xb9b/0x2700 block/blk-mq.c:1640 __blk_mq_sched_dispatch_requests+0x28f/0x590 block/blk-mq-sched.c:325 blk_mq_sched_dispatch_requests+0x105/0x190 block/blk-mq-sched.c:358 __blk_mq_run_hw_queue+0xe5/0x150 block/blk-mq.c:1762 __blk_mq_delay_run_hw_queue+0x4f8/0x5c0 block/blk-mq.c:1839 blk_mq_run_hw_queue+0x18d/0x350 block/blk-mq.c:1891 blk_mq_sched_insert_request+0x3db/0x4e0 block/blk-mq-sched.c:474 blk_execute_rq_nowait+0x16b/0x1c0 block/blk-exec.c:63 sg_common_write.isra.18+0xeb3/0x2000 drivers/scsi/sg.c:837 sg_new_write.isra.19+0x570/0x8c0 drivers/scsi/sg.c:775 sg_ioctl_common+0x14d6/0x2710 drivers/scsi/sg.c:941 sg_ioctl+0xa2/0x180 drivers/scsi/sg.c:1166 __x64_sys_ioctl+0x19d/0x220 fs/ioctl.c:52 do_syscall_64+0x3a/0x80 arch/x86/entry/common.c:50 entry_SYSCALL_64_after_hwframe+0x44/0xae arch/x86/entry/entry_64.S:113 Link: https://lore.kernel.org/r/1637262208-28850-1-git-send-email-george.kennedy@oracle.com Reported-by: syzkaller Acked-by: Douglas Gilbert Signed-off-by: George Kennedy Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index ab01ef7d37f4..e5cbeb701629 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -2502,11 +2502,11 @@ static int resp_mode_select(struct scsi_cmnd *scp, __func__, param_len, res); md_len = mselect6 ? (arr[0] + 1) : (get_unaligned_be16(arr + 0) + 2); bd_len = mselect6 ? arr[3] : get_unaligned_be16(arr + 6); - if (md_len > 2) { + off = bd_len + (mselect6 ? 4 : 8); + if (md_len > 2 || off >= res) { mk_sense_invalid_fld(scp, SDEB_IN_DATA, 0, -1); return check_condition_result; } - off = bd_len + (mselect6 ? 4 : 8); mpage = arr[off] & 0x3f; ps = !!(arr[off] & 0x80); if (ps) { From eb97545d6264b341b06ba7603f52ff6c0b2af6ea Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Sat, 20 Nov 2021 10:49:17 -0600 Subject: [PATCH 052/231] scsi: core: sysfs: Fix setting device state to SDEV_RUNNING This fixes an issue added in commit 4edd8cd4e86d ("scsi: core: sysfs: Fix hang when device state is set via sysfs") where if userspace is requesting to set the device state to SDEV_RUNNING when the state is already SDEV_RUNNING, we return -EINVAL instead of count. The commit above set ret to count for this case, when it should have set it to 0. Link: https://lore.kernel.org/r/20211120164917.4924-1-michael.christie@oracle.com Fixes: 4edd8cd4e86d ("scsi: core: sysfs: Fix hang when device state is set via sysfs") Reviewed-by: Lee Duncan Signed-off-by: Mike Christie Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 7afcec250f9b..d4edce930a4a 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -812,7 +812,7 @@ store_state_field(struct device *dev, struct device_attribute *attr, mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_RUNNING && state == SDEV_RUNNING) { - ret = count; + ret = 0; } else { ret = scsi_device_set_state(sdev, state); if (ret == 0 && state == SDEV_RUNNING) From 2d62253eb1b60f4ce8b39125eee282739b519297 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Mon, 22 Nov 2021 15:12:23 +0900 Subject: [PATCH 053/231] scsi: scsi_debug: Zero clear zones at reset write pointer When a reset is requested the position of the write pointer is updated but the data in the corresponding zone is not cleared. Instead scsi_debug returns any data written before the write pointer was reset. This is an error and prevents using scsi_debug for stale page cache testing of the BLKRESETZONE ioctl. Zero written data in the zone when resetting the write pointer. Link: https://lore.kernel.org/r/20211122061223.298890-1-shinichiro.kawasaki@wdc.com Fixes: f0d1cf9378bd ("scsi: scsi_debug: Add ZBC zone commands") Reviewed-by: Damien Le Moal Acked-by: Douglas Gilbert Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: Martin K. 
Petersen --- drivers/scsi/scsi_debug.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index e5cbeb701629..3c0da3770edf 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -4657,6 +4657,7 @@ static void zbc_rwp_zone(struct sdebug_dev_info *devip, struct sdeb_zone_state *zsp) { enum sdebug_z_cond zc; + struct sdeb_store_info *sip = devip2sip(devip, false); if (zbc_zone_is_conv(zsp)) return; @@ -4668,6 +4669,10 @@ static void zbc_rwp_zone(struct sdebug_dev_info *devip, if (zsp->z_cond == ZC4_CLOSED) devip->nr_closed--; + if (zsp->z_wp > zsp->z_start) + memset(sip->storep + zsp->z_start * sdebug_sector_size, 0, + (zsp->z_wp - zsp->z_start) * sdebug_sector_size); + zsp->z_non_seq_resource = false; zsp->z_wp = zsp->z_start; zsp->z_cond = ZC1_EMPTY; From 57bbeacdbee72a54eb97d56b876cf9c94059fc34 Mon Sep 17 00:00:00 2001 From: Huang Jianan Date: Thu, 18 Nov 2021 21:58:44 +0800 Subject: [PATCH 054/231] erofs: fix deadlock when shrink erofs slab We observed the following deadlock in the stress test under low memory scenario: Thread A Thread B - erofs_shrink_scan - erofs_try_to_release_workgroup - erofs_workgroup_try_to_freeze -- A - z_erofs_do_read_page - z_erofs_collection_begin - z_erofs_register_collection - erofs_insert_workgroup - xa_lock(&sbi->managed_pslots) -- B - erofs_workgroup_get - erofs_wait_on_workgroup_freezed -- A - xa_erase - xa_lock(&sbi->managed_pslots) -- B To fix this, it needs to hold xa_lock before freezing the workgroup since xarray will be touched then. So let's hold the lock before accessing each workgroup, just like what we did with the radix tree before. 
[ Gao Xiang: Jianhua Hao also reports this issue at https://lore.kernel.org/r/b10b85df30694bac8aadfe43537c897a@xiaomi.com ] Link: https://lore.kernel.org/r/20211118135844.3559-1-huangjianan@oppo.com Fixes: 64094a04414f ("erofs: convert workstn to XArray") Reviewed-by: Chao Yu Reviewed-by: Gao Xiang Signed-off-by: Huang Jianan Reported-by: Jianhua Hao Signed-off-by: Gao Xiang --- fs/erofs/utils.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index 84da2c280012..ec9a1d780dc1 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -150,7 +150,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - DBG_BUGON(xa_erase(&sbi->managed_pslots, grp->index) != grp); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp); /* last refcount should be connected with its managed pslot. */ erofs_workgroup_unfreeze(grp, 0); @@ -165,15 +165,19 @@ static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, unsigned int freed = 0; unsigned long index; + xa_lock(&sbi->managed_pslots); xa_for_each(&sbi->managed_pslots, index, grp) { /* try to shrink each valid workgroup */ if (!erofs_try_to_release_workgroup(sbi, grp)) continue; + xa_unlock(&sbi->managed_pslots); ++freed; if (!--nr_shrink) - break; + return freed; + xa_lock(&sbi->managed_pslots); } + xa_unlock(&sbi->managed_pslots); return freed; } From d257cc8cb8d5355ffc43a96bab94db7b5a324803 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Nov 2021 20:29:12 -0500 Subject: [PATCH 055/231] locking/rwsem: Make handoff bit handling more consistent There are some inconsistencies in the way that the handoff bit is being handled in readers and writers that lead to a race condition. 
Firstly, when a queue head writer set the handoff bit, it will clear it when the writer is being killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader. The handoff bit will simply be inherited by the next waiter. Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially make the handoff bit set with empty wait queue. Worse, the situation in rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation, this leads to race condition where RWSEM_FLAG_HANDOFF can be double subtracted, corrupting ->count. To make the handoff bit handling more consistent and robust, extract out handoff bit clearing code into the new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section. The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty to avoid possible race condition. If the first waiter with handoff bit set is killed or interrupted to exit the slowpath without acquiring the lock, the next waiter will inherit the handoff bit. While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read. 
Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") Reported-by: Zhenhua Ma Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com --- kernel/locking/rwsem.c | 175 ++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 88 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..e039cf1605af 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. 
So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,17 +514,24 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); - if (list_empty(&sem->wait_list)) { - /* hit end of list above */ - adjustment -= RWSEM_FLAG_WAITERS; - } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. 
- */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; + oldcount = atomic_long_read(&sem->count); + if (list_empty(&sem->wait_list)) { + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ + adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. 
*/ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. 
+ */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -956,7 +1001,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1047,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1061,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? 
WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1119,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1137,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1148,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. 
- */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } From 14c24048841151548a3f4d9e218510c844c1b737 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Nov 2021 17:44:55 +0800 Subject: [PATCH 056/231] locking/rwsem: Optimize down_read_trylock() under highly contended case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found that a process with 10 thousand threads has encountered a regression problem from Linux-v4.14 to Linux-v5.4. It is a kind of workload which will sometimes concurrently allocate lots of memory in different threads. In this case, we will see down_read_trylock() as a high hotspot. Therefore, we suppose that rwsem has a regression at least since Linux-v5.4. In order to easily debug this problem, we wrote a simple benchmark to create a similar situation, like the following. 
```c++ #include <sys/mman.h> #include <sys/time.h> #include <sys/resource.h> #include <sched.h> #include <cstdio> #include <cassert> #include <thread> #include <vector> #include <chrono> volatile int mutex; void trigger(int cpu, char* ptr, std::size_t sz) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); assert(pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0); while (mutex); for (std::size_t i = 0; i < sz; i += 4096) { *ptr = '\0'; ptr += 4096; } } int main(int argc, char* argv[]) { std::size_t sz = 100; if (argc > 1) sz = atoi(argv[1]); auto nproc = std::thread::hardware_concurrency(); std::vector<std::thread> thr; sz <<= 30; auto* ptr = mmap(nullptr, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); assert(ptr != MAP_FAILED); char* cptr = static_cast<char*>(ptr); auto run = sz / nproc; run = (run >> 12) << 12; mutex = 1; for (auto i = 0U; i < nproc; ++i) { thr.emplace_back(std::thread([i, cptr, run]() { trigger(i, cptr, run); })); cptr += run; } rusage usage_start; getrusage(RUSAGE_SELF, &usage_start); auto start = std::chrono::system_clock::now(); mutex = 0; for (auto& t : thr) t.join(); rusage usage_end; getrusage(RUSAGE_SELF, &usage_end); auto end = std::chrono::system_clock::now(); timeval utime; timeval stime; timersub(&usage_end.ru_utime, &usage_start.ru_utime, &utime); timersub(&usage_end.ru_stime, &usage_start.ru_stime, &stime); printf("usr: %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("sys: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("real: %lu\n", std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count()); return 0; } ``` The above program simply creates `nproc` threads, each of which touches memory (triggering page faults) on a different CPU. Then we will see a similar profile in `perf top`. 25.55% [kernel] [k] down_read_trylock 14.78% [kernel] [k] handle_mm_fault 13.45% [kernel] [k] up_read 8.61% [kernel] [k] clear_page_erms 3.89% [kernel] [k] __do_page_fault The hottest instruction in down_read_trylock(), which accounts for about 92%, is the cmpxchg shown below. 
91.89 │ lock cmpxchg %rdx,(%rdi) Since the problem was found by migrating from Linux-v4.14 to Linux-v5.4, we easily found that the commit ddb20d1d3aed ("locking/rwsem: Optimize down_read_trylock()") caused the regression. The reason is that the commit assumes the rwsem is not contended at all. But that is not always true for the mmap lock, which can be contended by thousands of threads. So most threads need to run "cmpxchg" at least twice to acquire the lock. The overhead of an atomic operation is higher than that of non-atomic instructions, which caused the regression. By using the above benchmark, the real execution times on an x86-64 system before and after the patch were: Before Patch After Patch # of Threads real real reduced by ------------ ------ ------ ---------- 1 65,373 65,206 ~0.0% 4 15,467 15,378 ~0.5% 40 6,214 5,528 ~11.0% For the uncontended case, the new down_read_trylock() is the same as before. For the contended cases, the new down_read_trylock() is faster than before. The more contended, the faster. Signed-off-by: Muchun Song Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211118094455.9068-1-songmuchun@bytedance.com --- kernel/locking/rwsem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e039cf1605af..04a74d040a6d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1248,17 +1248,14 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - /* - * Optimize for the case when the rwsem is not locked at all. 
- */ - tmp = RWSEM_UNLOCKED_VALUE; - do { + tmp = atomic_long_read(&sem->count); + while (!(tmp & RWSEM_READ_FAILED_MASK)) { if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - tmp + RWSEM_READER_BIAS)) { + tmp + RWSEM_READER_BIAS)) { rwsem_set_reader_owned(sem); return 1; } - } while (!(tmp & RWSEM_READ_FAILED_MASK)); + } return 0; } From 73743c3b092277febbf69b250ce8ebbca0525aa2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 9 Nov 2021 13:22:32 +0100 Subject: [PATCH 057/231] perf: Ignore sigtrap for tracepoints destined for other tasks syzbot reported that the warning in perf_sigtrap() fires, saying that the event's task does not match current: | WARNING: CPU: 0 PID: 9090 at kernel/events/core.c:6446 perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | Modules linked in: | CPU: 0 PID: 9090 Comm: syz-executor.1 Not tainted 5.15.0-syzkaller #0 | Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 | RIP: 0010:perf_sigtrap kernel/events/core.c:6446 [inline] | RIP: 0010:perf_pending_event_disable kernel/events/core.c:6470 [inline] | RIP: 0010:perf_pending_event+0x40d/0x4b0 kernel/events/core.c:6513 | ... | Call Trace: | | irq_work_single+0x106/0x220 kernel/irq_work.c:211 | irq_work_run_list+0x6a/0x90 kernel/irq_work.c:242 | irq_work_run+0x4f/0xd0 kernel/irq_work.c:251 | __sysvec_irq_work+0x95/0x3d0 arch/x86/kernel/irq_work.c:22 | sysvec_irq_work+0x8e/0xc0 arch/x86/kernel/irq_work.c:17 | | | asm_sysvec_irq_work+0x12/0x20 arch/x86/include/asm/idtentry.h:664 | RIP: 0010:__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:152 [inline] | RIP: 0010:_raw_spin_unlock_irqrestore+0x38/0x70 kernel/locking/spinlock.c:194 | ... 
| coredump_task_exit kernel/exit.c:371 [inline] | do_exit+0x1865/0x25c0 kernel/exit.c:771 | do_group_exit+0xe7/0x290 kernel/exit.c:929 | get_signal+0x3b0/0x1ce0 kernel/signal.c:2820 | arch_do_signal_or_restart+0x2a9/0x1c40 arch/x86/kernel/signal.c:868 | handle_signal_work kernel/entry/common.c:148 [inline] | exit_to_user_mode_loop kernel/entry/common.c:172 [inline] | exit_to_user_mode_prepare+0x17d/0x290 kernel/entry/common.c:207 | __syscall_exit_to_user_mode_work kernel/entry/common.c:289 [inline] | syscall_exit_to_user_mode+0x19/0x60 kernel/entry/common.c:300 | do_syscall_64+0x42/0xb0 arch/x86/entry/common.c:86 | entry_SYSCALL_64_after_hwframe+0x44/0xae On x86 this shouldn't happen, which has arch_irq_work_raise(). The test program sets up a perf event with sigtrap set to fire on the 'sched_wakeup' tracepoint, which fired in ttwu_do_wakeup(). This happened because the 'sched_wakeup' tracepoint also takes a task argument passed on to perf_tp_event(), which is used to deliver the event to that other task. Since we cannot deliver synchronous signals to other tasks, skip an event if perf_tp_event() is targeted at another task and perf_event_attr::sigtrap is set, which will avoid ever entering perf_sigtrap() for such events. Fixes: 97ba62b27867 ("perf: Add support for SIGTRAP on perf events") Reported-by: syzbot+663359e32ce6f1a305ad@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YYpoCOBmC/kJWfmI@elver.google.com --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..30d94f68c5bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9759,6 +9759,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. 
*/ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } From 8a6cc0ded6d942e4a506c421c4d87a634bda6e75 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 22 Nov 2021 17:23:56 -0600 Subject: [PATCH 058/231] ASoC: Intel: soc-acpi: add entry for ESSX8336 on CML We have configurations for this codec on APL, GLK, JSL and TGL, somehow the information that some designs rely on CometLake was not shared. BugLink: https://github.com/thesofproject/linux/issues/3248 Fixes: 790049fb6623 ("ASoC: Intel: soc-acpi: apl/glk/tgl: add entry for devices based on ES8336 codec") Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20211122232356.23505-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/soc-acpi-intel-cml-match.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/soc/intel/common/soc-acpi-intel-cml-match.c b/sound/soc/intel/common/soc-acpi-intel-cml-match.c index b4eb0c97edf1..4eebc79d4b48 100644 --- a/sound/soc/intel/common/soc-acpi-intel-cml-match.c +++ b/sound/soc/intel/common/soc-acpi-intel-cml-match.c @@ -81,6 +81,12 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_cml_machines[] = { .sof_fw_filename = "sof-cml.ri", .sof_tplg_filename = "sof-cml-da7219-max98390.tplg", }, + { + .id = "ESSX8336", + .drv_name = "sof-essx8336", + .sof_fw_filename = "sof-cml.ri", + .sof_tplg_filename = "sof-cml-es8336.tplg", + }, {}, }; EXPORT_SYMBOL_GPL(snd_soc_acpi_intel_cml_machines); From 5a3ba99b62d8486de0316334e72ac620d4b94fdd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 23 Nov 2021 08:36:18 +0000 Subject: [PATCH 059/231] ipmi: msghandler: Make symbol 'remove_work_wq' static The sparse tool complains as follows: drivers/char/ipmi/ipmi_msghandler.c:194:25: warning: symbol 'remove_work_wq' was not declared. Should it be static? This symbol is not used outside of ipmi_msghandler.c, so marks it static. 
Fixes: 1d49eb91e86e ("ipmi: Move remove_work to dedicated workqueue") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Message-Id: <20211123083618.2366808-1-weiyongjun1@huawei.com> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 1ade72bfae0f..a2ec0171363a 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -191,7 +191,7 @@ struct ipmi_user { struct work_struct remove_work; }; -struct workqueue_struct *remove_work_wq; +static struct workqueue_struct *remove_work_wq; static struct ipmi_user *acquire_ipmi_user(struct ipmi_user *user, int *index) __acquires(user->release_barrier) From ae26c08e6c8071ba8febb0c7c0829da96c75248c Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 22 Nov 2021 17:22:54 -0600 Subject: [PATCH 060/231] ALSA: intel-dsp-config: add quirk for CML devices based on ES8336 codec We've added quirks for ESS8336 but missed CML, add quirks for both LP and H versions. 
BugLink: https://github.com/thesofproject/linux/issues/3248 Fixes: 9d36ceab9415 ("ALSA: intel-dsp-config: add quirk for APL/GLK/TGL devices based on ES8336 codec") Signed-off-by: Pierre-Louis Bossart Link: https://lore.kernel.org/r/20211122232254.23362-1-pierre-louis.bossart@linux.intel.com Signed-off-by: Takashi Iwai --- sound/hda/intel-dsp-config.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sound/hda/intel-dsp-config.c b/sound/hda/intel-dsp-config.c index 10a0bffc3cf6..4208fa8a4db5 100644 --- a/sound/hda/intel-dsp-config.c +++ b/sound/hda/intel-dsp-config.c @@ -252,6 +252,11 @@ static const struct config_entry config_table[] = { .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, .device = 0x02c8, }, + { + .flags = FLAG_SOF, + .device = 0x02c8, + .codec_hid = "ESSX8336", + }, /* Cometlake-H */ { .flags = FLAG_SOF, @@ -276,6 +281,11 @@ static const struct config_entry config_table[] = { .flags = FLAG_SOF | FLAG_SOF_ONLY_IF_DMIC_OR_SOUNDWIRE, .device = 0x06c8, }, + { + .flags = FLAG_SOF, + .device = 0x06c8, + .codec_hid = "ESSX8336", + }, #endif /* Icelake */ From cf0b0e3712f7af90006f8317ff27278094c2c128 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 19 Nov 2021 13:16:27 +1000 Subject: [PATCH 061/231] KVM: PPC: Book3S HV: Prevent POWER7/8 TLB flush flushing SLB The POWER9 ERAT flush instruction is a SLBIA with IH=7, which is a reserved value on POWER7/8. On POWER8 this invalidates the SLB entries above index 0, similarly to SLBIA IH=0. If the SLB entries are invalidated, and then the guest is bypassed, the host SLB does not get re-loaded, so the bolted entries above 0 will be lost. This can result in kernel stack access causing a SLB fault. Kernel stack access causing a SLB fault was responsible for the infamous mega bug (search "Fix SLB reload bug"). 
Although since commit 48e7b7695745 ("powerpc/64s/hash: Convert SLB miss handlers to C") that starts using the kernel stack in the SLB miss handler, it might only result in an infinite loop of SLB faults. In any case it's a bug. Fix this by only executing the instruction on >= POWER9 where IH=7 is defined not to invalidate the SLB. POWER7/8 don't require this ERAT flush. Fixes: 500871125920 ("KVM: PPC: Book3S HV: Invalidate ERAT when flushing guest TLB entries") Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: Nicholas Piggin Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20211119031627.577853-1-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_builtin.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index fcf4760a3a0e..70b7a8f97153 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -695,6 +695,7 @@ static void flush_guest_tlb(struct kvm *kvm) "r" (0) : "memory"); } asm volatile("ptesync": : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory"); } else { for (set = 0; set < kvm->arch.tlb_sets; ++set) { @@ -705,7 +706,9 @@ static void flush_guest_tlb(struct kvm *kvm) rb += PPC_BIT(51); /* increment set number */ } asm volatile("ptesync": : :"memory"); - asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); + // POWER9 congruence-class TLBIEL leaves ERAT. Flush it now. 
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) + asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory"); } } From 5bb60ea611db1e04814426ed4bd1c95d1487678e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 18 Nov 2021 10:39:53 +0100 Subject: [PATCH 062/231] powerpc/32: Fix hardlockup on vmap stack overflow Since the commit c118c7303ad5 ("powerpc/32: Fix vmap stack - Do not activate MMU before reading task struct") a vmap stack overflow results in a hard lockup. This is because emergency_ctx is still addressed with its virtual address although data MMU is not active anymore at that time. Fix it by using a physical address instead. Fixes: c118c7303ad5 ("powerpc/32: Fix vmap stack - Do not activate MMU before reading task struct") Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ce30364fb7ccda489272af4a1612b6aa147e1d23.1637227521.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_32.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/head_32.h b/arch/powerpc/kernel/head_32.h index 6b1ec9e3541b..349c4a820231 100644 --- a/arch/powerpc/kernel/head_32.h +++ b/arch/powerpc/kernel/head_32.h @@ -202,11 +202,11 @@ vmap_stack_overflow: mfspr r1, SPRN_SPRG_THREAD lwz r1, TASK_CPU - THREAD(r1) slwi r1, r1, 3 - addis r1, r1, emergency_ctx@ha + addis r1, r1, emergency_ctx-PAGE_OFFSET@ha #else - lis r1, emergency_ctx@ha + lis r1, emergency_ctx-PAGE_OFFSET@ha #endif - lwz r1, emergency_ctx@l(r1) + lwz r1, emergency_ctx-PAGE_OFFSET@l(r1) addi r1, r1, THREAD_SIZE - INT_FRAME_SIZE EXCEPTION_PROLOG_2 0 vmap_stack_overflow prepare_transfer_to_handler From c0f2077baa4113f38f008b8e912b9fb3ff8d43df Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Nov 2021 08:04:34 +0100 Subject: [PATCH 063/231] x86/boot: Mark prepare_command_line() __init Fix: WARNING: modpost: vmlinux.o(.text.unlikely+0x64d0): Section mismatch in reference \ from the 
function prepare_command_line() to the variable .init.data:command_line The function prepare_command_line() references the variable __initdata command_line. This is often because prepare_command_line lacks a __initdata annotation or the annotation of command_line is wrong. Apparently some toolchains make different inlining decisions. Reported-by: Stephen Rothwell Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YZySgpmBcNNM2qca@zn.tnic --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c410be738ae7..6a190c7f4d71 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -742,7 +742,7 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } -static char *prepare_command_line(void) +static char * __init prepare_command_line(void) { #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE From dce1ca0525bfdc8a69a9343bc714fbc19a2f04b3 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 23 Nov 2021 11:40:47 +0000 Subject: [PATCH 064/231] sched/scs: Reset task stack state in bringup_cpu() To hot unplug a CPU, the idle task on that CPU calls a few layers of C code before finally leaving the kernel. When KASAN is in use, poisoned shadow is left around for each of the active stack frames. When shadow call stacks (SCS) are in use, the task's saved SCS SP is left pointing at an arbitrary point within the task's shadow call stack. When a CPU is offlined then onlined back into the kernel, this stale state can adversely affect execution. Stale KASAN shadow can alias new stackframes and result in bogus KASAN warnings. A stale SCS SP is effectively a memory leak, and prevents a portion of the shadow call stack being used. Across a number of hotplug cycles the idle task's entire shadow call stack can become unusable. 
We previously fixed the KASAN issue in commit: e1b77c92981a5222 ("sched/kasan: remove stale KASAN poison after hotplug") ... by removing any stale KASAN stack poison immediately prior to onlining a CPU. Subsequently in commit: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") ... the refactoring left the KASAN and SCS cleanup in one-time idle thread initialization code rather than something invoked prior to each CPU being onlined, breaking both as above. We fixed SCS (but not KASAN) in commit: 63acd42c0d4942f7 ("sched/scs: Reset the shadow stack when idle_task_exit") ... but as this runs in the context of the idle task being offlined it's potentially fragile. To fix these consistently and more robustly, reset the SCS SP and KASAN shadow of a CPU's idle task immediately before we online that CPU in bringup_cpu(). This ensures the idle task always has a consistent state when it is running, and removes the need to do so when exiting an idle task. Whenever any thread is created, dup_task_struct() will give the task a stack which is free of KASAN shadow, and initialize the task's SCS SP, so there's no need to specially initialize either for idle thread within init_idle(), as this was only necessary to handle hotplug cycles. I've tested this on arm64 with: * gcc 11.1.0, defconfig +KASAN_INLINE, KASAN_STACK * clang 12.0.0, defconfig +KASAN_INLINE, KASAN_STACK, SHADOW_CALL_STACK ... 
offlining and onlining CPUS with: | while true; do | for C in /sys/devices/system/cpu/cpu*/online; do | echo 0 > $C; | echo 1 > $C; | done | done Fixes: f1a0a376ca0c4ef1 ("sched/core: Initialize the idle task with preemption disabled") Reported-by: Qian Cai Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Qian Cai Link: https://lore.kernel.org/lkml/20211115113310.35693-1-mark.rutland@arm.com/ --- kernel/cpu.c | 7 +++++++ kernel/sched/core.c | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..407a2568f35e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -587,6 +588,12 @@ static int bringup_cpu(unsigned int cpu) struct task_struct *idle = idle_thread_get(cpu); int ret; + /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3c9b0fda64ac..76f9deeaa942 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8619,9 +8619,6 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_IDLE | PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); - scs_task_reset(idle); - kasan_unpoison_task_stack(idle); - #ifdef CONFIG_SMP /* * It's possible that init_idle() gets called multiple times on a task, @@ -8777,7 +8774,6 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } - scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } From 872fc0b6bde8b2dd6891c740cd792d214255dca3 Mon Sep 17 00:00:00 2001 From: Lucas Tanure Date: Tue, 23 Nov 2021 16:31:39 +0000 Subject: [PATCH 065/231] ASoC: cs35l41: Set the max SPI speed for the whole device Higher speeds are only supported when PLL is enabled, but the current driver doesn't enable PLL outside of stream use cases, so better to set the lowest SPI speed accepted by the entire device. Move the current frequency set to the spi sub-driver so the whole device can benefit from that speed. spi-max-frequency property could be used, but ACPI systems don't support it, so by setting it in the spi sub-driver probe both Device Trees and ACPI systems are supported. 
Signed-off-by: Lucas Tanure Reviewed-by: Charles Keepax Link: https://lore.kernel.org/r/20211123163149.1530535-2-tanureal@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l41-spi.c | 32 +++----------------------------- sound/soc/codecs/cs35l41.c | 7 ------- sound/soc/codecs/cs35l41.h | 4 +--- 3 files changed, 4 insertions(+), 39 deletions(-) diff --git a/sound/soc/codecs/cs35l41-spi.c b/sound/soc/codecs/cs35l41-spi.c index 90a921f726c3..3fa99741779a 100644 --- a/sound/soc/codecs/cs35l41-spi.c +++ b/sound/soc/codecs/cs35l41-spi.c @@ -42,34 +42,6 @@ static const struct spi_device_id cs35l41_id_spi[] = { MODULE_DEVICE_TABLE(spi, cs35l41_id_spi); -static void cs35l41_spi_otp_setup(struct cs35l41_private *cs35l41, - bool is_pre_setup, unsigned int *freq) -{ - struct spi_device *spi; - u32 orig_spi_freq; - - spi = to_spi_device(cs35l41->dev); - - if (!spi) { - dev_err(cs35l41->dev, "%s: No SPI device\n", __func__); - return; - } - - if (is_pre_setup) { - orig_spi_freq = spi->max_speed_hz; - if (orig_spi_freq > CS35L41_SPI_MAX_FREQ_OTP) { - spi->max_speed_hz = CS35L41_SPI_MAX_FREQ_OTP; - spi_setup(spi); - } - *freq = orig_spi_freq; - } else { - if (spi->max_speed_hz != *freq) { - spi->max_speed_hz = *freq; - spi_setup(spi); - } - } -} - static int cs35l41_spi_probe(struct spi_device *spi) { const struct regmap_config *regmap_config = &cs35l41_regmap_spi; @@ -81,6 +53,9 @@ static int cs35l41_spi_probe(struct spi_device *spi) if (!cs35l41) return -ENOMEM; + spi->max_speed_hz = CS35L41_SPI_MAX_FREQ; + spi_setup(spi); + spi_set_drvdata(spi, cs35l41); cs35l41->regmap = devm_regmap_init_spi(spi, regmap_config); if (IS_ERR(cs35l41->regmap)) { @@ -91,7 +66,6 @@ static int cs35l41_spi_probe(struct spi_device *spi) cs35l41->dev = &spi->dev; cs35l41->irq = spi->irq; - cs35l41->otp_setup = cs35l41_spi_otp_setup; return cs35l41_probe(cs35l41, pdata); } diff --git a/sound/soc/codecs/cs35l41.c b/sound/soc/codecs/cs35l41.c index 9d0530dde996..9c4d481f7614 100644 
--- a/sound/soc/codecs/cs35l41.c +++ b/sound/soc/codecs/cs35l41.c @@ -302,7 +302,6 @@ static int cs35l41_otp_unpack(void *data) const struct cs35l41_otp_packed_element_t *otp_map; struct cs35l41_private *cs35l41 = data; int bit_offset, word_offset, ret, i; - unsigned int orig_spi_freq; unsigned int bit_sum = 8; u32 otp_val, otp_id_reg; u32 *otp_mem; @@ -326,9 +325,6 @@ static int cs35l41_otp_unpack(void *data) goto err_otp_unpack; } - if (cs35l41->otp_setup) - cs35l41->otp_setup(cs35l41, true, &orig_spi_freq); - ret = regmap_bulk_read(cs35l41->regmap, CS35L41_OTP_MEM0, otp_mem, CS35L41_OTP_SIZE_WORDS); if (ret < 0) { @@ -336,9 +332,6 @@ static int cs35l41_otp_unpack(void *data) goto err_otp_unpack; } - if (cs35l41->otp_setup) - cs35l41->otp_setup(cs35l41, false, &orig_spi_freq); - otp_map = otp_map_match->map; bit_offset = otp_map_match->bit_offset; diff --git a/sound/soc/codecs/cs35l41.h b/sound/soc/codecs/cs35l41.h index 6cffe8a55beb..48485b08a6f1 100644 --- a/sound/soc/codecs/cs35l41.h +++ b/sound/soc/codecs/cs35l41.h @@ -726,7 +726,7 @@ #define CS35L41_FS2_WINDOW_MASK 0x00FFF800 #define CS35L41_FS2_WINDOW_SHIFT 12 -#define CS35L41_SPI_MAX_FREQ_OTP 4000000 +#define CS35L41_SPI_MAX_FREQ 4000000 #define CS35L41_RX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE) #define CS35L41_TX_FORMATS (SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_S24_LE) @@ -764,8 +764,6 @@ struct cs35l41_private { int irq; /* GPIO for /RST */ struct gpio_desc *reset_gpio; - void (*otp_setup)(struct cs35l41_private *cs35l41, bool is_pre_setup, - unsigned int *freq); }; int cs35l41_probe(struct cs35l41_private *cs35l41, From 86f74ba3fef56dd1cee19b7a15ae27fc0da5bb61 Mon Sep 17 00:00:00 2001 From: Ranjani Sridharan Date: Tue, 23 Nov 2021 18:57:59 +0200 Subject: [PATCH 066/231] ASoC: SOF: hda: reset DAI widget before reconfiguring it It is not unusual for ALSA/ASoC hw_params callbacks to be invoked multiple times. 
Reset and free the DAI widget before reconfiguring it to keep the DAI widget use_count balanced. Fixes: 0acb48dd31e3 ("ASoC: SOF: Intel: hda: make sure DAI widget is set up before IPC") Signed-off-by: Ranjani Sridharan Reviewed-by: Paul Olaru Reviewed-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20211123165759.127884-1-kai.vehmanen@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/sof/intel/hda.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/sof/intel/hda.c b/sound/soc/sof/intel/hda.c index 568d351b7a4e..2c0d4d06ab36 100644 --- a/sound/soc/sof/intel/hda.c +++ b/sound/soc/sof/intel/hda.c @@ -58,6 +58,13 @@ int hda_ctrl_dai_widget_setup(struct snd_soc_dapm_widget *w) return -EINVAL; } + /* DAI already configured, reset it before reconfiguring it */ + if (sof_dai->configured) { + ret = hda_ctrl_dai_widget_free(w); + if (ret < 0) + return ret; + } + config = &sof_dai->dai_config[sof_dai->current_config]; /* From 83bb2c1a01d7127d5adc7d69d7aaa3f7072de2b4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 10:20:06 +0000 Subject: [PATCH 067/231] KVM: arm64: Save PSTATE early on exit In order to be able to use primitives such as vcpu_mode_is_32bit(), we need to synchronize the guest PSTATE. However, this is currently done deep into the bowels of the world-switch code, and we do have helpers evaluating this much earlier (__vgic_v3_perform_cpuif_access and handle_aarch32_guest, for example). Move the saving of the guest pstate into the early fixups, which cures the first issue. The second one will be addressed separately. 
Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 6 ++++++ arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 7 ++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 7a0af1d39303..d79fd101615f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -429,6 +429,12 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) */ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) { + /* + * Save PSTATE early so that we can evaluate the vcpu mode + * early on. + */ + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index de7e14c862e6..7ecca8b07851 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) { ctxt->regs.pc = read_sysreg_el2(SYS_ELR); - ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); + /* + * Guest PSTATE gets saved at guest fixup time in all + * cases. We still need to handle the nVHE host side here. 
+ */ + if (!has_vhe() && ctxt->__hyp_running_vcpu) + ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR); if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2); From 7183b2b5ae6b8d77a37069566d77cf2a74060f7e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Nov 2021 12:39:35 +0000 Subject: [PATCH 068/231] KVM: arm64: Move pkvm's special 32bit handling into a generic infrastructure Protected KVM is trying to turn AArch32 exceptions into an illegal exception entry. Unfortunately, it does that in a way that is a bit abrupt, and too early for PSTATE to be available. Instead, move it to the fixup code, which is a more reasonable place for it. This will also be useful for the NV code. Reviewed-by: Fuad Tabba Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 8 ++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 8 +------- arch/arm64/kvm/hyp/vhe/switch.c | 4 ++++ 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index d79fd101615f..96c5f3fb7838 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -403,6 +403,8 @@ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *); static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu); +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code); + /* * Allow the hypervisor to handle the exit with an exit handler if it has one. * @@ -435,6 +437,12 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) */ vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + /* + * Check whether we want to repaint the state one way or + * another. 
+ */ + early_exit_filter(vcpu, exit_code); + if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index c0e3fed26d93..d13115a12434 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -233,7 +233,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) * Returns false if the guest ran in AArch32 when it shouldn't have, and * thus should exit to the host, or true if a the guest run loop can continue. */ -static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code) +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) { struct kvm *kvm = kern_hyp_va(vcpu->kvm); @@ -248,10 +248,7 @@ static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code) vcpu->arch.target = -1; *exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT); *exit_code |= ARM_EXCEPTION_IL; - return false; } - - return true; } /* Switch to the guest for legacy non-VHE systems */ @@ -316,9 +313,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) /* Jump in the fire! */ exit_code = __guest_enter(vcpu); - if (unlikely(!handle_aarch32_guest(vcpu, &exit_code))) - break; - /* And we're baaack! 
*/ } while (fixup_guest_exit(vcpu, &exit_code)); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 5a2cb5d9bc4b..fbb26b93c347 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -112,6 +112,10 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu) return hyp_exit_handlers; } +static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code) +{ +} + /* Switch to the guest for VHE systems running in EL2 */ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { From ebb75b1b43d3e2bafc4d33eb4b1ae9c8d2759771 Mon Sep 17 00:00:00 2001 From: Deren Wu Date: Mon, 22 Nov 2021 23:10:27 +0800 Subject: [PATCH 069/231] mt76: fix timestamp check in tx_status Should keep SKBs only if timeout timestamp is still after jiffies. Otherwise, report tx status and drop it directly. Fixes: bd1e3e7b693c ("mt76: introduce packet_id idr") Signed-off-by: Deren Wu Acked-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/7e3784949c0b29a00465966b89fdb0192bd0298e.1637593492.git.deren.wu@mediatek.com --- drivers/net/wireless/mediatek/mt76/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mediatek/mt76/tx.c b/drivers/net/wireless/mediatek/mt76/tx.c index 11719ef034d8..6b8c9dc80542 100644 --- a/drivers/net/wireless/mediatek/mt76/tx.c +++ b/drivers/net/wireless/mediatek/mt76/tx.c @@ -173,7 +173,7 @@ mt76_tx_status_skb_get(struct mt76_dev *dev, struct mt76_wcid *wcid, int pktid, if (!(cb->flags & MT_TX_CB_DMA_DONE)) continue; - if (!time_is_after_jiffies(cb->jiffies + + if (time_is_after_jiffies(cb->jiffies + MT_TX_STATUS_SKB_TIMEOUT)) continue; } From 2a9e9857473bfc5721092ff274bc1e371e5a0d2f Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 22 Nov 2021 18:34:03 +0100 Subject: [PATCH 070/231] mt76: fix possible pktid leak Fix a possible idr pkt-id leak if the packet is dropped on tx side Fixes: bd1e3e7b693c ("mt76: introduce 
packet_id idr") Signed-off-by: Lorenzo Bianconi Acked-by: Felix Fietkau Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/a560caffcc24452fb48af53904bbe5c45ea5db93.1637602268.git.lorenzo@kernel.org --- .../wireless/mediatek/mt76/mt7615/pci_mac.c | 3 +-- .../wireless/mediatek/mt76/mt7615/usb_sdio.c | 23 +++++++++++-------- .../wireless/mediatek/mt76/mt76x02_usb_core.c | 8 ++++++- .../net/wireless/mediatek/mt76/mt7915/mac.c | 15 ++++++------ .../wireless/mediatek/mt76/mt7921/sdio_mac.c | 16 ++++++++----- 5 files changed, 38 insertions(+), 27 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c index 5ee52cd70a4b..d1806f198aed 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/pci_mac.c @@ -143,8 +143,6 @@ int mt7615_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, if (!wcid) wcid = &dev->mt76.global_wcid; - pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb); - if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && msta) { struct mt7615_phy *phy = &dev->phy; @@ -164,6 +162,7 @@ int mt7615_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, if (id < 0) return id; + pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb); mt7615_mac_write_txwi(dev, txwi_ptr, tx_info->skb, wcid, sta, pid, key, false); diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c index bd2939ebcbf4..bfe6c1579dc1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c @@ -43,17 +43,11 @@ EXPORT_SYMBOL_GPL(mt7663_usb_sdio_reg_map); static void mt7663_usb_sdio_write_txwi(struct mt7615_dev *dev, struct mt76_wcid *wcid, enum mt76_txq_id qid, struct ieee80211_sta *sta, - struct sk_buff *skb) + int pid, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *key = 
info->control.hw_key; __le32 *txwi; - int pid; - - if (!wcid) - wcid = &dev->mt76.global_wcid; - - pid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); txwi = (__le32 *)(skb->data - MT_USB_TXD_SIZE); memset(txwi, 0, MT_USB_TXD_SIZE); @@ -195,9 +189,12 @@ int mt7663_usb_sdio_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, struct sk_buff *skb = tx_info->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct mt7615_sta *msta; - int pad; + int pad, err, pktid; msta = wcid ? container_of(wcid, struct mt7615_sta, wcid) : NULL; + if (!wcid) + wcid = &dev->mt76.global_wcid; + if ((info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) && msta && !msta->rate_probe) { /* request to configure sampling rate */ @@ -207,7 +204,8 @@ int mt7663_usb_sdio_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, spin_unlock_bh(&dev->mt76.lock); } - mt7663_usb_sdio_write_txwi(dev, wcid, qid, sta, skb); + pktid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); + mt7663_usb_sdio_write_txwi(dev, wcid, qid, sta, pktid, skb); if (mt76_is_usb(mdev)) { u32 len = skb->len; @@ -217,7 +215,12 @@ int mt7663_usb_sdio_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, pad = round_up(skb->len, 4) - skb->len; } - return mt76_skb_adjust_pad(skb, pad); + err = mt76_skb_adjust_pad(skb, pad); + if (err) + /* Release pktid in case of error. 
*/ + idr_remove(&wcid->pktid, pktid); + + return err; } EXPORT_SYMBOL_GPL(mt7663_usb_sdio_tx_prepare_skb); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c index efd70ddc2fd1..2c6c03809b20 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_usb_core.c @@ -72,6 +72,7 @@ int mt76x02u_tx_prepare_skb(struct mt76_dev *mdev, void *data, bool ampdu = IEEE80211_SKB_CB(tx_info->skb)->flags & IEEE80211_TX_CTL_AMPDU; enum mt76_qsel qsel; u32 flags; + int err; mt76_insert_hdr_pad(tx_info->skb); @@ -106,7 +107,12 @@ int mt76x02u_tx_prepare_skb(struct mt76_dev *mdev, void *data, ewma_pktlen_add(&msta->pktlen, tx_info->skb->len); } - return mt76x02u_skb_dma_info(tx_info->skb, WLAN_PORT, flags); + err = mt76x02u_skb_dma_info(tx_info->skb, WLAN_PORT, flags); + if (err && wcid) + /* Release pktid in case of error. */ + idr_remove(&wcid->pktid, pid); + + return err; } EXPORT_SYMBOL_GPL(mt76x02u_tx_prepare_skb); diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c index 5fcf35f2d9fb..809dc18e5083 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/mac.c @@ -1151,8 +1151,14 @@ int mt7915_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, } } - pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb); + t = (struct mt76_txwi_cache *)(txwi + mdev->drv->txwi_size); + t->skb = tx_info->skb; + id = mt76_token_consume(mdev, &t); + if (id < 0) + return id; + + pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb); mt7915_mac_write_txwi(dev, txwi_ptr, tx_info->skb, wcid, pid, key, false); @@ -1178,13 +1184,6 @@ int mt7915_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, txp->bss_idx = mvif->idx; } - t = (struct mt76_txwi_cache *)(txwi + mdev->drv->txwi_size); - t->skb = tx_info->skb; - - id = mt76_token_consume(mdev, &t); - if (id < 0) - 
return id; - txp->token = cpu_to_le16(id); if (test_bit(MT_WCID_FLAG_4ADDR, &wcid->flags)) txp->rept_wds_wcid = cpu_to_le16(wcid->idx); diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c index 137f86a6dbf8..85b3d88f8ecc 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c @@ -142,14 +142,12 @@ out: static void mt7921s_write_txwi(struct mt7921_dev *dev, struct mt76_wcid *wcid, enum mt76_txq_id qid, struct ieee80211_sta *sta, - struct sk_buff *skb) + int pid, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *key = info->control.hw_key; __le32 *txwi; - int pid; - pid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); txwi = (__le32 *)(skb->data - MT_SDIO_TXD_SIZE); memset(txwi, 0, MT_SDIO_TXD_SIZE); mt7921_mac_write_txwi(dev, txwi, skb, wcid, key, pid, false); @@ -164,7 +162,7 @@ int mt7921s_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, struct mt7921_dev *dev = container_of(mdev, struct mt7921_dev, mt76); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_info->skb); struct sk_buff *skb = tx_info->skb; - int pad; + int err, pad, pktid; if (unlikely(tx_info->skb->len <= ETH_HLEN)) return -EINVAL; @@ -181,12 +179,18 @@ int mt7921s_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, } } - mt7921s_write_txwi(dev, wcid, qid, sta, skb); + pktid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); + mt7921s_write_txwi(dev, wcid, qid, sta, pktid, skb); mt7921_skb_add_sdio_hdr(skb, MT7921_SDIO_DATA); pad = round_up(skb->len, 4) - skb->len; - return mt76_skb_adjust_pad(skb, pad); + err = mt76_skb_adjust_pad(skb, pad); + if (err) + /* Release pktid in case of error. 
*/ + idr_remove(&wcid->pktid, pktid); + + return err; } void mt7921s_tx_complete_skb(struct mt76_dev *mdev, struct mt76_queue_entry *e) From a1de97fe296c52eafc6590a3506f4bbd44ecb19a Mon Sep 17 00:00:00 2001 From: Yang Xu Date: Wed, 24 Nov 2021 10:06:02 -0800 Subject: [PATCH 071/231] xfs: Fix the free logic of state in xfs_attr_node_hasname When testing xfstests xfs/126 on latest upstream kernel, it will hang on some machines. Adding a getxattr operation after xattr corrupted, I can reproduce it 100%. The deadlock is as below: [983.923403] task:setfattr state:D stack: 0 pid:17639 ppid: 14687 flags:0x00000080 [ 983.923405] Call Trace: [ 983.923410] __schedule+0x2c4/0x700 [ 983.923412] schedule+0x37/0xa0 [ 983.923414] schedule_timeout+0x274/0x300 [ 983.923416] __down+0x9b/0xf0 [ 983.923451] ? xfs_buf_find.isra.29+0x3c8/0x5f0 [xfs] [ 983.923453] down+0x3b/0x50 [ 983.923471] xfs_buf_lock+0x33/0xf0 [xfs] [ 983.923490] xfs_buf_find.isra.29+0x3c8/0x5f0 [xfs] [ 983.923508] xfs_buf_get_map+0x4c/0x320 [xfs] [ 983.923525] xfs_buf_read_map+0x53/0x310 [xfs] [ 983.923541] ? xfs_da_read_buf+0xcf/0x120 [xfs] [ 983.923560] xfs_trans_read_buf_map+0x1cf/0x360 [xfs] [ 983.923575] ? xfs_da_read_buf+0xcf/0x120 [xfs] [ 983.923590] xfs_da_read_buf+0xcf/0x120 [xfs] [ 983.923606] xfs_da3_node_read+0x1f/0x40 [xfs] [ 983.923621] xfs_da3_node_lookup_int+0x69/0x4a0 [xfs] [ 983.923624] ? kmem_cache_alloc+0x12e/0x270 [ 983.923637] xfs_attr_node_hasname+0x6e/0xa0 [xfs] [ 983.923651] xfs_has_attr+0x6e/0xd0 [xfs] [ 983.923664] xfs_attr_set+0x273/0x320 [xfs] [ 983.923683] xfs_xattr_set+0x87/0xd0 [xfs] [ 983.923686] __vfs_removexattr+0x4d/0x60 [ 983.923688] __vfs_removexattr_locked+0xac/0x130 [ 983.923689] vfs_removexattr+0x4e/0xf0 [ 983.923690] removexattr+0x4d/0x80 [ 983.923693] ? __check_object_size+0xa8/0x16b [ 983.923695] ? strncpy_from_user+0x47/0x1a0 [ 983.923696] ? getname_flags+0x6a/0x1e0 [ 983.923697] ? _cond_resched+0x15/0x30 [ 983.923699] ? __sb_start_write+0x1e/0x70 [ 983.923700] ? 
mnt_want_write+0x28/0x50 [ 983.923701] path_removexattr+0x9b/0xb0 [ 983.923702] __x64_sys_removexattr+0x17/0x20 [ 983.923704] do_syscall_64+0x5b/0x1a0 [ 983.923705] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 983.923707] RIP: 0033:0x7f080f10ee1b When getxattr calls xfs_attr_node_get function, xfs_da3_node_lookup_int fails with EFSCORRUPTED in xfs_attr_node_hasname because we have use blocktrash to random it in xfs/126. So it free state in internal and xfs_attr_node_get doesn't do xfs_buf_trans release job. Then subsequent removexattr will hang because of it. This bug was introduced by kernel commit 07120f1abdff ("xfs: Add xfs_has_attr and subroutines"). It adds xfs_attr_node_hasname helper and said caller will be responsible for freeing the state in this case. But xfs_attr_node_hasname will free state itself instead of caller if xfs_da3_node_lookup_int fails. Fix this bug by moving the step of free state into caller. Also, use "goto error/out" instead of returning error directly in xfs_attr_node_addname_find_attr and xfs_attr_node_removename_setup function because we should free state ourselves. Fixes: 07120f1abdff ("xfs: Add xfs_has_attr and subroutines") Signed-off-by: Yang Xu Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_attr.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index fbc9d816882c..23523b802539 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1077,21 +1077,18 @@ xfs_attr_node_hasname( state = xfs_da_state_alloc(args); if (statep != NULL) - *statep = NULL; + *statep = state; /* * Search to see if name exists, and get back a pointer to it. 
*/ error = xfs_da3_node_lookup_int(state, &retval); - if (error) { - xfs_da_state_free(state); - return error; - } + if (error) + retval = error; - if (statep != NULL) - *statep = state; - else + if (!statep) xfs_da_state_free(state); + return retval; } @@ -1112,7 +1109,7 @@ xfs_attr_node_addname_find_attr( */ retval = xfs_attr_node_hasname(args, &dac->da_state); if (retval != -ENOATTR && retval != -EEXIST) - return retval; + goto error; if (retval == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) goto error; @@ -1337,7 +1334,7 @@ int xfs_attr_node_removename_setup( error = xfs_attr_node_hasname(args, state); if (error != -EEXIST) - return error; + goto out; error = 0; ASSERT((*state)->path.blk[(*state)->path.active - 1].bp != NULL); From 1090427bf18f9835b3ccbd36edf43f2509444e27 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Nov 2021 10:06:02 -0800 Subject: [PATCH 072/231] xfs: remove xfs_inew_wait With the removal of xfs_dqrele_all_inodes, xfs_inew_wait and all the infrastructure used to wake the XFS_INEW bit waitqueue are unused. Reported-by: kernel test robot Fixes: 777eb1fa857e ("xfs: remove xfs_dqrele_all_inodes") Signed-off-by: Christoph Hellwig Reviewed-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/xfs/xfs_icache.c | 21 --------------------- fs/xfs/xfs_inode.h | 4 +--- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e1472004170e..da4af2142a2b 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -289,22 +289,6 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } -static inline void -xfs_inew_wait( - struct xfs_inode *ip) -{ - wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); - DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); - - do { - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - if (!xfs_iflags_test(ip, XFS_INEW)) - break; - schedule(); - } while (true); - finish_wait(wq, &wait.wq_entry); -} - /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -368,18 +352,13 @@ xfs_iget_recycle( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); error = xfs_reinit_inode(mp, inode); if (error) { - bool wake; - /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. 
*/ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); - if (wake) - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e635a3d64cba..c447bf04205a 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -231,8 +231,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define __XFS_INEW_BIT 3 /* inode has just been allocated */ -#define XFS_INEW (1 << __XFS_INEW_BIT) +#define XFS_INEW (1 << 3) /* inode has just been allocated */ #define XFS_IPRESERVE_DM_FIELDS (1 << 4) /* has legacy DMAPI fields set */ #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ @@ -492,7 +491,6 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); - wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) From 5ad448ce2976f829d95dcae5e6e91f6686b0e4de Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 24 Nov 2021 10:15:47 -0800 Subject: [PATCH 073/231] iomap: iomap_read_inline_data cleanup Change iomap_read_inline_data to return 0 or an error code; this simplifies the callers. Add a description. Signed-off-by: Andreas Gruenbacher Reviewed-by: Christoph Hellwig [djwong: document the return value of iomap_read_inline_data explicitly] Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. 
Wong --- fs/iomap/buffered-io.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index fe10d8a30f6b..71a36ae120ee 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -205,7 +205,16 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static loff_t iomap_read_inline_data(const struct iomap_iter *iter, +/** + * iomap_read_inline_data - copy inline data into the page cache + * @iter: iteration structure + * @page: page to copy to + * + * Copy the inline data in @iter into @page and zero out the rest of the page. + * Only a single IOMAP_INLINE extent is allowed at the end of each file. + * Returns zero for success to complete the read, or the usual negative errno. + */ +static int iomap_read_inline_data(const struct iomap_iter *iter, struct page *page) { const struct iomap *iomap = iomap_iter_srcmap(iter); @@ -214,7 +223,7 @@ static loff_t iomap_read_inline_data(const struct iomap_iter *iter, void *addr; if (PageUptodate(page)) - return PAGE_SIZE - poff; + return 0; if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) return -EIO; @@ -231,7 +240,7 @@ static loff_t iomap_read_inline_data(const struct iomap_iter *iter, memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); - return PAGE_SIZE - poff; + return 0; } static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, @@ -256,13 +265,8 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - loff_t ret = iomap_read_inline_data(iter, page); - - if (ret < 0) - return ret; - return 0; - } + if (iomap->type == IOMAP_INLINE) + return iomap_read_inline_data(iter, page); /* zero post-eof blocks as the page may be mapped */ iop = iomap_page_create(iter->inode, page); @@ -587,15 +591,10 @@ static int __iomap_write_begin(const struct 
iomap_iter *iter, loff_t pos, static int iomap_write_begin_inline(const struct iomap_iter *iter, struct page *page) { - int ret; - /* needs more work for the tailpacking case; disable for now */ if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) return -EIO; - ret = iomap_read_inline_data(iter, page); - if (ret < 0) - return ret; - return 0; + return iomap_read_inline_data(iter, page); } static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, From 6318cb887548c70778d10c0fcb7134b4454ab8a6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:02 -0500 Subject: [PATCH 074/231] Revert "virtio-scsi: don't let virtio core to validate used buffer length" This reverts commit c57911ebfbfe745cb95da2bcf547c5bae000590f. Attempts to validate length in the core did not work out. We'll drop them for now, so revert the dependent changes in drivers. Signed-off-by: Michael S. Tsirkin --- drivers/scsi/virtio_scsi.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 19f7d7b90625..28e1d98ae102 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -977,7 +977,6 @@ static unsigned int features[] = { static struct virtio_driver virtio_scsi_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From 2b17d9f84884a37f1324be5526c6600e97a47fbe Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:09 -0500 Subject: [PATCH 075/231] Revert "virtio-blk: don't let virtio core to validate used length" This reverts commit a40392edf1b2c7822bc0ce68413106661a9d4232. Attempts to validate length in the core did not work out. We'll drop them, so revert the dependent changes in drivers. Signed-off-by: Michael S. 
Tsirkin --- drivers/block/virtio_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 97bf051a50ce..1a1b1189225d 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1049,7 +1049,6 @@ static struct virtio_driver virtio_blk = { .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From fcfb65f8a922c7dd25a2e9913601dae979ce6560 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:13 -0500 Subject: [PATCH 076/231] Revert "virtio-net: don't let virtio core to validate used length" This reverts commit 816625c13652cef5b2c49082d652875da6f2ad7a. Attempts to validate length in the core did not work out. We'll drop them, so revert the dependent changes in drivers. Signed-off-by: Michael S. Tsirkin --- drivers/net/virtio_net.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 1771d6e5224f..55db6a336f7e 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3423,7 +3423,6 @@ static struct virtio_driver virtio_net_driver = { .feature_table_size = ARRAY_SIZE(features), .feature_table_legacy = features_legacy, .feature_table_size_legacy = ARRAY_SIZE(features_legacy), - .suppress_used_validation = true, .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, From f124034faa911ed534bf8c4881ad98dbbde2a966 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 24 Nov 2021 18:44:17 -0500 Subject: [PATCH 077/231] Revert "virtio_ring: validate used buffer length" This reverts commit 939779f5152d161b34f612af29e7dc1ac4472fcf. 
Attempts to validate length in the core did not work out: there turn out to exist multiple broken devices, and in particular legacy devices are known to be broken in this respect. We have ideas for handling this better in the next version but for now let's revert to a known good state to make sure drivers work for people. Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_ring.c | 60 ------------------------------------ include/linux/virtio.h | 2 -- 2 files changed, 62 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 00f64f2f8b72..6d2614e34470 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -14,9 +14,6 @@ #include #include -static bool force_used_validation = false; -module_param(force_used_validation, bool, 0444); - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -185,9 +182,6 @@ struct vring_virtqueue { } packed; }; - /* Per-descriptor in buffer length */ - u32 *buflen; - /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -496,7 +490,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; - u32 buflen = 0; START_USE(vq); @@ -578,7 +571,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, indirect); - buflen += sg->length; } } /* Last one doesn't continue. */ @@ -618,10 +610,6 @@ static inline int virtqueue_add_split(struct virtqueue *_vq, else vq->split.desc_state[head].indir_desc = ctx; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[head] = buflen; - /* Put entry in available array (but don't update avail->idx until they * do sync). 
*/ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); @@ -796,11 +784,6 @@ static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[i])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[i]); - return NULL; - } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; @@ -1079,7 +1062,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, unsigned int i, n, err_idx; u16 head, id; dma_addr_t addr; - u32 buflen = 0; head = vq->packed.next_avail_idx; desc = alloc_indirect_packed(total_sg, gfp); @@ -1109,8 +1091,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); i++; - if (n >= out_sgs) - buflen += sg->length; } } @@ -1164,10 +1144,6 @@ static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - vq->num_added += 1; pr_debug("Added buffer head %i to %p\n", head, vq); @@ -1203,7 +1179,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; - u32 buflen = 0; START_USE(vq); @@ -1283,8 +1258,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } - if (n >= out_sgs) - buflen += sg->length; } } @@ -1304,10 +1277,6 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, vq->packed.desc_state[id].indir_desc = ctx; vq->packed.desc_state[id].last = prev; - /* Store in buffer length if necessary */ - if (vq->buflen) - vq->buflen[id] = buflen; - /* * A driver MUST NOT make the first descriptor in the list * available before all 
subsequent descriptors comprising @@ -1494,11 +1463,6 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } - if (vq->buflen && unlikely(*len > vq->buflen[id])) { - BAD_RING(vq, "used len %d is larger than in buflen %u\n", - *len, vq->buflen[id]); - return NULL; - } /* detach_buf_packed clears data, so grab it now. */ ret = vq->packed.desc_state[id].data; @@ -1704,7 +1668,6 @@ static struct virtqueue *vring_create_virtqueue_packed( struct vring_virtqueue *vq; struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; @@ -1794,15 +1757,6 @@ static struct virtqueue *vring_create_virtqueue_packed( if (!vq->packed.desc_extra) goto err_desc_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* No callback? Tell other side not to bother us. 
*/ if (!callback) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; @@ -1815,8 +1769,6 @@ static struct virtqueue *vring_create_virtqueue_packed( spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->packed.desc_extra); err_desc_extra: kfree(vq->packed.desc_state); err_desc_state: @@ -2224,7 +2176,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, void (*callback)(struct virtqueue *), const char *name) { - struct virtio_driver *drv = drv_to_virtio(vdev->dev.driver); struct vring_virtqueue *vq; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) @@ -2284,15 +2235,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, if (!vq->split.desc_extra) goto err_extra; - if (!drv->suppress_used_validation || force_used_validation) { - vq->buflen = kmalloc_array(vring.num, sizeof(*vq->buflen), - GFP_KERNEL); - if (!vq->buflen) - goto err_buflen; - } else { - vq->buflen = NULL; - } - /* Put everything in free lists. */ vq->free_head = 0; memset(vq->split.desc_state, 0, vring.num * @@ -2303,8 +2245,6 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, spin_unlock(&vdev->vqs_list_lock); return &vq->vq; -err_buflen: - kfree(vq->split.desc_extra); err_extra: kfree(vq->split.desc_state); err_state: diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 44d0e09da2d9..41edbc01ffa4 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -152,7 +152,6 @@ size_t virtio_max_dma_size(struct virtio_device *vdev); * @feature_table_size: number of entries in the feature table array. * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. - * @suppress_used_validation: set to not have core validate used length * @probe: the function to call when a device is found. Returns 0 or -errno. * @scan: optional function to call after successful probe; intended * for virtio-scsi to invoke a scan. 
@@ -169,7 +168,6 @@ struct virtio_driver { unsigned int feature_table_size; const unsigned int *feature_table_legacy; unsigned int feature_table_size_legacy; - bool suppress_used_validation; int (*validate)(struct virtio_device *dev); int (*probe)(struct virtio_device *dev); void (*scan)(struct virtio_device *dev); From 49d8c5ffad07ca014cfae72a1b9b8c52b6ad9cb8 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 22 Nov 2021 17:35:24 +0100 Subject: [PATCH 078/231] vhost/vsock: fix incorrect used length reported to the guest The "used length" reported by calling vhost_add_used() must be the number of bytes written by the device (using "in" buffers). In vhost_vsock_handle_tx_kick() the device only reads the guest buffers (they are all "out" buffers), without writing anything, so we must pass 0 as "used length" to comply virtio spec. Fixes: 433fc58e6bf2 ("VSOCK: Introduce vhost_vsock.ko") Cc: stable@vger.kernel.org Reported-by: Halil Pasic Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211122163525.294024-2-sgarzare@redhat.com Signed-off-by: Michael S. 
Tsirkin Reviewed-by: Stefan Hajnoczi Reviewed-by: Halil Pasic --- drivers/vhost/vsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 938aefbc75ec..4e3b95af7ee4 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -554,7 +554,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) virtio_transport_free_pkt(pkt); len += sizeof(pkt->hdr); - vhost_add_used(vq, head, len); + vhost_add_used(vq, head, 0); total_len += len; added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); From 11708ff92c1dba9aaa59168c46c5317677595942 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 22 Nov 2021 17:35:25 +0100 Subject: [PATCH 079/231] vhost/vsock: cleanup removing `len` variable We can increment `total_len` directly and remove `len` since it is no longer used for vhost_add_used(). Signed-off-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211122163525.294024-3-sgarzare@redhat.com Signed-off-by: Michael S. Tsirkin Reviewed-by: Stefan Hajnoczi --- drivers/vhost/vsock.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 4e3b95af7ee4..d6ca1c7ad513 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -511,8 +511,6 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) vhost_disable_notify(&vsock->dev, vq); do { - u32 len; - if (!vhost_vsock_more_replies(vsock)) { /* Stop tx until the device processes already * pending replies. 
Leave tx virtqueue @@ -540,7 +538,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) continue; } - len = pkt->len; + total_len += sizeof(pkt->hdr) + pkt->len; /* Deliver to monitoring devices all received packets */ virtio_transport_deliver_tap_pkt(pkt); @@ -553,9 +551,7 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) else virtio_transport_free_pkt(pkt); - len += sizeof(pkt->hdr); vhost_add_used(vq, head, 0); - total_len += len; added = true; } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); From 0466a39bd0b6c462338f10d18076703d14a552de Mon Sep 17 00:00:00 2001 From: Ye Guojin Date: Wed, 17 Nov 2021 06:39:55 +0000 Subject: [PATCH 080/231] virtio-blk: modify the value type of num in virtio_queue_rq() This was found by coccicheck: ./drivers/block/virtio_blk.c, 334, 14-17, WARNING Unsigned expression compared with zero num < 0 Reported-by: Zeal Robot Signed-off-by: Ye Guojin Link: https://lore.kernel.org/r/20211117063955.160777-1-ye.guojin@zte.com.cn Signed-off-by: Michael S. 
Tsirkin Fixes: 02746e26c39e ("virtio-blk: avoid preallocating big SGL for data") Reviewed-by: Stefano Garzarella Reviewed-by: Max Gurtovoy Reviewed-by: Stefan Hajnoczi --- drivers/block/virtio_blk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1a1b1189225d..6ae38776e30e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -316,7 +316,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; - unsigned int num; + int num; int qid = hctx->queue_num; bool notify = false; blk_status_t status; From ea8f17e44fa7d54fae287ccbe30ce269afb5ee42 Mon Sep 17 00:00:00 2001 From: Wu Zongyong Date: Mon, 15 Nov 2021 11:16:42 +0800 Subject: [PATCH 081/231] vhost-vdpa: clean irqs before reseting vdpa device Vdpa devices should be reset after unseting irqs of virtqueues, or we will get errors when killing qemu process: >> pi_update_irte: failed to update PI IRTE >> irq bypass consumer (token 0000000065102a43) unregistration fails: -22 Signed-off-by: Wu Zongyong Link: https://lore.kernel.org/r/a2cb60cf73be9da5c4e6399242117d8818f975ae.1636946171.git.wuzongyong@linux.alibaba.com Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/vhost/vdpa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 01c59ce7e250..29cced1cd277 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -1014,12 +1014,12 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) mutex_lock(&d->mutex); filep->private_data = NULL; + vhost_vdpa_clean_irq(v); vhost_vdpa_reset(v); vhost_dev_stop(&v->vdev); vhost_vdpa_iotlb_free(v); vhost_vdpa_free_domain(v); vhost_vdpa_config_put(v); - vhost_vdpa_clean_irq(v); vhost_dev_cleanup(&v->vdev); kfree(v->vdev.vqs); mutex_unlock(&d->mutex); From bb93ce4b150dde79f58e34103cbd1fe829796649 Mon Sep 17 00:00:00 2001 From: Longpeng Date: Wed, 24 Nov 2021 09:52:15 +0800 Subject: [PATCH 082/231] vdpa_sim: avoid putting an uninitialized iova_domain The system will crash if we put an uninitialized iova_domain, this could happen when an error occurs before initializing the iova_domain in vdpasim_create(). BUG: kernel NULL pointer dereference, address: 0000000000000000 ... RIP: 0010:__cpuhp_state_remove_instance+0x96/0x1c0 ... Call Trace: put_iova_domain+0x29/0x220 vdpasim_free+0xd1/0x120 [vdpa_sim] vdpa_release_dev+0x21/0x40 [vdpa] device_release+0x33/0x90 kobject_release+0x63/0x160 vdpasim_create+0x127/0x2a0 [vdpa_sim] vdpasim_net_dev_add+0x7d/0xfe [vdpa_sim_net] vdpa_nl_cmd_dev_add_set_doit+0xe1/0x1a0 [vdpa] genl_family_rcv_msg_doit+0x112/0x140 genl_rcv_msg+0xdf/0x1d0 ... So we must make sure the iova_domain is already initialized before put it. In addition, we may get the following warning in this case: WARNING: ... drivers/iommu/iova.c:344 iova_cache_put+0x58/0x70 So we must make sure the iova_cache_put() is invoked only if the iova_cache_get() is already invoked. Let's fix it together. 
Cc: stable@vger.kernel.org Fixes: 4080fc106750 ("vdpa_sim: use iova module to allocate IOVA addresses") Signed-off-by: Longpeng Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://lore.kernel.org/r/20211124015215.119-1-longpeng2@huawei.com Signed-off-by: Michael S. Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 5f484fff8dbe..41b0cd17fcba 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -591,8 +591,11 @@ static void vdpasim_free(struct vdpa_device *vdpa) vringh_kiov_cleanup(&vdpasim->vqs[i].in_iov); } - put_iova_domain(&vdpasim->iova); - iova_cache_put(); + if (vdpa_get_dma_dev(vdpa)) { + put_iova_domain(&vdpasim->iova); + iova_cache_put(); + } + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); From f8fbfd85f5c95fff477a7c19f576725945891d0c Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 7 Nov 2021 16:22:57 +0100 Subject: [PATCH 083/231] ksmbd: Fix an error handling path in 'smb2_sess_setup()' All the error handling paths of 'smb2_sess_setup()' end to 'out_err'. All but the new error handling path added by the commit given in the Fixes tag below. Fix this error handling path and branch to 'out_err' as well. 
Fixes: 0d994cd482ee ("ksmbd: add buffer validation in session setup") Cc: stable@vger.kernel.org # v5.15 Acked-by: Namjae Jeon Signed-off-by: Christophe JAILLET Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 121f8e8c70ac..7d2e8599dc27 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -1697,8 +1697,10 @@ int smb2_sess_setup(struct ksmbd_work *work) negblob_off = le16_to_cpu(req->SecurityBufferOffset); negblob_len = le16_to_cpu(req->SecurityBufferLength); if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) || - negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) - return -EINVAL; + negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) { + rc = -EINVAL; + goto out_err; + } negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + negblob_off); From 2d239f0f6ad0cffc4622a6b35d54aa0a123cc764 Mon Sep 17 00:00:00 2001 From: Salvatore Bonaccorso Date: Fri, 19 Nov 2021 22:12:14 +0100 Subject: [PATCH 084/231] docs: filesystem: cifs: ksmbd: Fix small layout issues In some sentences there were missing spaces between words. Fix wording in item to show which prints are enabled and add a space between the cat command and its argument.
Cc: Sergey Senozhatsky Cc: Steve French CC: Hyunchul Lee Cc: linux-cifs@vger.kernel.org Acked-by: Namjae Jeon Signed-off-by: Salvatore Bonaccorso Signed-off-by: Steve French --- Documentation/filesystems/cifs/ksmbd.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/cifs/ksmbd.rst b/Documentation/filesystems/cifs/ksmbd.rst index a1326157d53f..b0d354fd8066 100644 --- a/Documentation/filesystems/cifs/ksmbd.rst +++ b/Documentation/filesystems/cifs/ksmbd.rst @@ -50,11 +50,11 @@ ksmbd.mountd (user space daemon) -------------------------------- ksmbd.mountd is userspace process to, transfer user account and password that -are registered using ksmbd.adduser(part of utils for user space). Further it +are registered using ksmbd.adduser (part of utils for user space). Further it allows sharing information parameters that parsed from smb.conf to ksmbd in kernel. For the execution part it has a daemon which is continuously running and connected to the kernel interface using netlink socket, it waits for the -requests(dcerpc and share/user info). It handles RPC calls (at a minimum few +requests (dcerpc and share/user info). It handles RPC calls (at a minimum few dozen) that are most important for file server from NetShareEnum and NetServerGetInfo. Complete DCE/RPC response is prepared from the user space and passed over to the associated kernel thread for the client. @@ -154,11 +154,11 @@ Each layer 1. Enable all component prints # sudo ksmbd.control -d "all" -2. Enable one of components(smb, auth, vfs, oplock, ipc, conn, rdma) +2. Enable one of components (smb, auth, vfs, oplock, ipc, conn, rdma) # sudo ksmbd.control -d "smb" -3. Show what prints are enable. - # cat/sys/class/ksmbd-control/debug +3. Show what prints are enabled. + # cat /sys/class/ksmbd-control/debug [smb] auth vfs oplock ipc conn [rdma] 4. 
Disable prints: From 8e537d1465e7401f352a6e0a728a93f8cad5294a Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 21 Nov 2021 07:48:45 +0900 Subject: [PATCH 085/231] ksmbd: downgrade addition info error msg to debug in smb2_get_info_sec() During file transfers from a Windows client, this error message floods the log. The flood degrades performance and can mislead users into thinking the server has a problem. Fixes: e294f78d3478 ("ksmbd: allow PROTECTED_DACL_SECINFO and UNPROTECTED_DACL_SECINFO addition information in smb2 set info security") Cc: stable@vger.kernel.org # v5.15 Acked-by: Hyunchul Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 7d2e8599dc27..a513ea523250 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -5070,7 +5070,7 @@ static int smb2_get_info_sec(struct ksmbd_work *work, if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | PROTECTED_DACL_SECINFO | UNPROTECTED_DACL_SECINFO)) { - pr_err("Unsupported addition info: 0x%x)\n", + ksmbd_debug(SMB, "Unsupported addition info: 0x%x)\n", addition_info); pntsd->revision = cpu_to_le16(1); From 1ec72153ff434ce75bace3044dc89a23a05d7064 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 21 Nov 2021 11:32:39 +0900 Subject: [PATCH 086/231] ksmbd: contain default data stream even if xattr is empty If xattr is not supported, as on exfat or fat, the ksmbd server doesn't include the default data stream in the FILE_STREAM_INFORMATION response. This causes ppt or doc file update issues when the local filesystem is one of these. This patch moves the goto statement so that the default data stream is included.
Fixes: 9f6323311c70 ("ksmbd: add default data stream name in FILE_STREAM_INFORMATION") Cc: stable@vger.kernel.org # v5.15 Acked-by: Hyunchul Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index a513ea523250..370a32b93087 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -4459,6 +4459,12 @@ static void get_file_stream_info(struct ksmbd_work *work, &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + goto out; + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (xattr_list_len < 0) { goto out; @@ -4467,12 +4473,6 @@ static void get_file_stream_info(struct ksmbd_work *work, goto out; } - buf_free_len = - smb2_calc_max_out_buf_len(work, 8, - le32_to_cpu(req->OutputBufferLength)); - if (buf_free_len < 0) - goto out; - while (idx < xattr_list_len) { stream_name = xattr_list + idx; streamlen = strlen(stream_name); @@ -4516,6 +4516,7 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->NextEntryOffset = cpu_to_le32(next); } +out: if (!S_ISDIR(stat.mode) && buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) { file_info = (struct smb2_file_stream_info *) @@ -4524,14 +4525,13 @@ static void get_file_stream_info(struct ksmbd_work *work, "::$DATA", 7, conn->local_nls, 0); streamlen *= 2; file_info->StreamNameLength = cpu_to_le32(streamlen); - file_info->StreamSize = 0; - file_info->StreamAllocationSize = 0; + file_info->StreamSize = cpu_to_le64(stat.size); + file_info->StreamAllocationSize = cpu_to_le64(stat.blocks << 9); nbytes += sizeof(struct smb2_file_stream_info) + streamlen; } /* last entry offset should be 0 */ file_info->NextEntryOffset = 0; -out: kvfree(xattr_list); rsp->OutputBufferLength = cpu_to_le32(nbytes); From 
178ca6f85aa3231094467691f5ea1ff2f398aa8d Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 24 Nov 2021 10:23:02 +0900 Subject: [PATCH 087/231] ksmbd: fix memleak in get_file_stream_info() Fix memleak in get_file_stream_info() Fixes: 34061d6b76a4 ("ksmbd: validate OutputBufferLength of QUERY_DIR, QUERY_INFO, IOCTL requests") Cc: stable@vger.kernel.org # v5.15 Reported-by: Coverity Scan Acked-by: Hyunchul Lee Signed-off-by: Namjae Jeon Signed-off-by: Steve French --- fs/ksmbd/smb2pdu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index 370a32b93087..49c9da37315c 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -4498,8 +4498,10 @@ static void get_file_stream_info(struct ksmbd_work *work, ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); next = sizeof(struct smb2_file_stream_info) + streamlen * 2; - if (next > buf_free_len) + if (next > buf_free_len) { + kfree(stream_buf); break; + } file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, From fbf3bce458214bb971d3d571515b3b129eac290b Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Fri, 19 Nov 2021 17:50:52 +0000 Subject: [PATCH 088/231] MIPS: boot/compressed/: add __ashldi3 to target for ZSTD compression Just like before with __bswapdi2(), for MIPS pre-boot when CONFIG_KERNEL_ZSTD=y the decompressor function will use __ashldi3(), so the object file should be added to the target object file. 
Fixes these build errors: mipsel-linux-ld: arch/mips/boot/compressed/decompress.o: in function `FSE_buildDTable_internal': decompress.c:(.text.FSE_buildDTable_internal+0x48): undefined reference to `__ashldi3' mipsel-linux-ld: arch/mips/boot/compressed/decompress.o: in function `FSE_decompress_wksp_body_default': decompress.c:(.text.FSE_decompress_wksp_body_default+0xa8): undefined reference to `__ashldi3' mipsel-linux-ld: arch/mips/boot/compressed/decompress.o: in function `ZSTD_getFrameHeader_advanced': decompress.c:(.text.ZSTD_getFrameHeader_advanced+0x134): undefined reference to `__ashldi3' Signed-off-by: Paul Cercueil Reviewed-by: Randy Dunlap Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/compressed/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 2861a05c2e0c..f27cf31b4140 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -52,7 +52,7 @@ endif vmlinuzobjs-$(CONFIG_KERNEL_XZ) += $(obj)/ashldi3.o -vmlinuzobjs-$(CONFIG_KERNEL_ZSTD) += $(obj)/bswapdi.o +vmlinuzobjs-$(CONFIG_KERNEL_ZSTD) += $(obj)/bswapdi.o $(obj)/ashldi3.o targets := $(notdir $(vmlinuzobjs-y)) From c33fdfbabb6c930454df017f3cd3507dc1a87d09 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 24 Nov 2021 13:03:23 -0800 Subject: [PATCH 089/231] ipmi: fix oob access due to uninit smi_msg type We're hitting OOB accesses in handle_ipmb_direct_rcv_rsp() (memcpy of size -1) after user space generates a message. Looks like the message is incorrectly assumed to be of the new IPMB type, because type is never set and message is allocated with kmalloc() not kzalloc(). 
Fixes: 059747c245f0 ("ipmi: Add support for IPMB direct messages") Signed-off-by: Jakub Kicinski Message-Id: <20211124210323.1950976-1-kuba@kernel.org> Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index a2ec0171363a..7d7df17d8b3d 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -5033,6 +5033,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void) if (rv) { rv->done = free_smi_msg; rv->user_data = NULL; + rv->type = IPMI_SMI_MSG_TYPE_NORMAL; atomic_inc(&smi_msg_inuse_count); } return rv; From 1cab5bd69eb1f995ced2d7576cb15f8a8941fd85 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 25 Nov 2021 19:39:32 +0800 Subject: [PATCH 090/231] MIPS: Fix using smp_processor_id() in preemptible in show_cpuinfo() There exists the following issue under DEBUG_PREEMPT: BUG: using smp_processor_id() in preemptible [00000000] code: systemd/1 caller is show_cpuinfo+0x460/0xea0 ... Call Trace: [] show_stack+0x94/0x128 [] dump_stack_lvl+0x94/0xd8 [] check_preemption_disabled+0x104/0x110 [] show_cpuinfo+0x460/0xea0 [] seq_read_iter+0xfc/0x4f8 [] new_sync_read+0x110/0x1b8 [] vfs_read+0x1b4/0x1d0 [] ksys_read+0xd0/0x110 [] syscall_common+0x34/0x58 We can see the following call trace: show_cpuinfo() cpu_has_fpu current_cpu_data smp_processor_id() $ addr2line -f -e vmlinux 0xffffffff802209c8 show_cpuinfo arch/mips/kernel/proc.c:188 $ head -188 arch/mips/kernel/proc.c | tail -1 if (cpu_has_fpu) arch/mips/include/asm/cpu-features.h # define cpu_has_fpu (current_cpu_data.options & MIPS_CPU_FPU) arch/mips/include/asm/cpu-info.h #define current_cpu_data cpu_data[smp_processor_id()] Based on the above analysis, fix the issue by using raw_cpu_has_fpu which calls raw_smp_processor_id() in show_cpuinfo(). 
Fixes: 626bfa037299 ("MIPS: kernel: proc: add CPU option reporting") Signed-off-by: Tiezhu Yang Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/proc.c b/arch/mips/kernel/proc.c index 376a6e2676e9..9f47a889b047 100644 --- a/arch/mips/kernel/proc.c +++ b/arch/mips/kernel/proc.c @@ -185,7 +185,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_puts(m, " tx39_cache"); if (cpu_has_octeon_cache) seq_puts(m, " octeon_cache"); - if (cpu_has_fpu) + if (raw_cpu_has_fpu) seq_puts(m, " fpu"); if (cpu_has_32fpr) seq_puts(m, " 32fpr"); From 7db5e9e9e5e6c10d7d26f8df7f8fd8841cb15ee7 Mon Sep 17 00:00:00 2001 From: Huang Pei Date: Thu, 25 Nov 2021 18:59:49 +0800 Subject: [PATCH 091/231] MIPS: loongson64: fix FTLB configuration It turns out that 'decode_configs' -> 'set_ftlb_enable' is called under c->cputype unset, which leaves FTLB disabled on BOTH 3A2000 and 3A3000 Fix it by calling "decode_configs" after c->cputype is initialized Fixes: da1bd29742b1 ("MIPS: Loongson64: Probe CPU features via CPUCFG") Signed-off-by: Huang Pei Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/cpu-probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index ac0e2cfc6d57..24a529c6c4be 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -1734,8 +1734,6 @@ static inline void decode_cpucfg(struct cpuinfo_mips *c) static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) { - decode_configs(c); - /* All Loongson processors covered here define ExcCode 16 as GSExc. 
*/ c->options |= MIPS_CPU_GSEXCEX; @@ -1796,6 +1794,8 @@ static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) panic("Unknown Loongson Processor ID!"); break; } + + decode_configs(c); } #else static inline void cpu_probe_loongson(struct cpuinfo_mips *c, unsigned int cpu) { } From 1f80d15020d7f130194821feb1432b67648c632d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 25 Nov 2021 15:20:14 +0000 Subject: [PATCH 092/231] KVM: arm64: Avoid setting the upper 32 bits of TCR_EL2 and CPTR_EL2 to 1 Having a signed (1 << 31) constant for TCR_EL2_RES1 and CPTR_EL2_TCPAC causes the upper 32-bit to be set to 1 when assigning them to a 64-bit variable. Bit 32 in TCR_EL2 is no longer RES0 in ARMv8.7: with FEAT_LPA2 it changes the meaning of bits 49:48 and 9:8 in the stage 1 EL2 page table entries. As a result of the sign-extension, a non-VHE kernel can no longer boot on a model with ARMv8.7 enabled. CPTR_EL2 still has the top 32 bits RES0 but we should preempt any future problems Make these top bit constants unsigned as per commit df655b75c43f ("arm64: KVM: Avoid setting the upper 32 bits of VTCR_EL2 to 1"). 
Signed-off-by: Catalin Marinas Reported-by: Chris January Cc: Cc: Will Deacon Cc: Marc Zyngier Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20211125152014.2806582-1-catalin.marinas@arm.com --- arch/arm64/include/asm/kvm_arm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index a39fcf318c77..01d47c5886dc 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -91,7 +91,7 @@ #define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) /* TCR_EL2 Registers bits */ -#define TCR_EL2_RES1 ((1 << 31) | (1 << 23)) +#define TCR_EL2_RES1 ((1U << 31) | (1 << 23)) #define TCR_EL2_TBI (1 << 20) #define TCR_EL2_PS_SHIFT 16 #define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT) @@ -276,7 +276,7 @@ #define CPTR_EL2_TFP_SHIFT 10 /* Hyp Coprocessor Trap Register */ -#define CPTR_EL2_TCPAC (1 << 31) +#define CPTR_EL2_TCPAC (1U << 31) #define CPTR_EL2_TAM (1 << 30) #define CPTR_EL2_TTA (1 << 20) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) From 41ce097f714401e6ad8f3f5eb30d7f91b0b5e495 Mon Sep 17 00:00:00 2001 From: Huang Pei Date: Thu, 25 Nov 2021 18:59:48 +0800 Subject: [PATCH 093/231] MIPS: use 3-level pgtable for 64KB page size on MIPS_VA_BITS_48 It hangup when booting Loongson 3A1000 with BOTH CONFIG_PAGE_SIZE_64KB and CONFIG_MIPS_VA_BITS_48, that it turn out to use 2-level pgtable instead of 3-level. 
64KB page size with 2-level pgtable only covers 42 bits VA, use 3-level pgtable to cover all 48 bits VA(55 bits) Fixes: 1e321fa917fb ("MIPS64: Support of at least 48 bits of SEGBITS) Signed-off-by: Huang Pei Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index de60ad190057..0215dc1529e9 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -3097,7 +3097,7 @@ config STACKTRACE_SUPPORT config PGTABLE_LEVELS int default 4 if PAGE_SIZE_4KB && MIPS_VA_BITS_48 - default 3 if 64BIT && !PAGE_SIZE_64KB + default 3 if 64BIT && (!PAGE_SIZE_64KB || MIPS_VA_BITS_48) default 2 config MIPS_AUTO_PFN_OFFSET From d2c12f56fa97df216e71437b218ffbeeb4dd46aa Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 25 Nov 2021 08:47:27 -0600 Subject: [PATCH 094/231] ipmi: fix IPMI_SMI_MSG_TYPE_IPMB_DIRECT response length checking A couple of issues: The tested data sizes are wrong; during the design that changed and this got missed. The formatting of the response couldn't use the normal one, it has to be an IPMB formatted response. Reported-by: Jakub Kicinski Fixes: 059747c245f0 ("ipmi: Add support for IPMB direct messages") Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 7d7df17d8b3d..99ea6d9b3716 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -4457,13 +4457,24 @@ return_unspecified: msg->rsp[2] = IPMI_ERR_UNSPECIFIED; msg->rsp_size = 3; } else if (msg->type == IPMI_SMI_MSG_TYPE_IPMB_DIRECT) { - /* commands must have at least 3 bytes, responses 4. */ - if (is_cmd && (msg->rsp_size < 3)) { + /* commands must have at least 4 bytes, responses 5.
*/ + if (is_cmd && (msg->rsp_size < 4)) { ipmi_inc_stat(intf, invalid_commands); goto out; } - if (!is_cmd && (msg->rsp_size < 4)) - goto return_unspecified; + if (!is_cmd && (msg->rsp_size < 5)) { + ipmi_inc_stat(intf, invalid_ipmb_responses); + /* Construct a valid error response. */ + msg->rsp[0] = msg->data[0] & 0xfc; /* NetFN */ + msg->rsp[0] |= (1 << 2); /* Make it a response */ + msg->rsp[0] |= msg->data[2] & 3; /* rqLUN */ + msg->rsp[1] = msg->data[1]; /* Addr */ + msg->rsp[2] = msg->data[2] & 0xfc; /* rqSeq */ + msg->rsp[2] |= msg->data[0] & 0x3; /* rsLUN */ + msg->rsp[3] = msg->data[3]; /* Cmd */ + msg->rsp[4] = IPMI_ERR_UNSPECIFIED; + msg->rsp_size = 5; + } } else if ((msg->data_size >= 2) && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2)) && (msg->data[1] == IPMI_SEND_MSG_CMD) From c03a487a83fddbca1ef6cb5b97a69cd3e390e233 Mon Sep 17 00:00:00 2001 From: Corey Minyard Date: Thu, 25 Nov 2021 11:23:20 -0600 Subject: [PATCH 095/231] ipmi:ipmb: Fix unknown command response More missed changes, the response back to another system sending a command that had no user to handle it wasn't formatted properly. Signed-off-by: Corey Minyard --- drivers/char/ipmi/ipmi_msghandler.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 99ea6d9b3716..c837d5416e0e 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -3920,9 +3920,11 @@ static int handle_ipmb_direct_rcv_cmd(struct ipmi_smi *intf, /* We didn't find a user, deliver an error response. 
*/ ipmi_inc_stat(intf, unhandled_commands); - msg->data[0] = ((netfn + 1) << 2) | (msg->rsp[4] & 0x3); - msg->data[1] = msg->rsp[2]; - msg->data[2] = msg->rsp[4] & ~0x3; + msg->data[0] = (netfn + 1) << 2; + msg->data[0] |= msg->rsp[2] & 0x3; /* rqLUN */ + msg->data[1] = msg->rsp[1]; /* Addr */ + msg->data[2] = msg->rsp[2] & ~0x3; /* rqSeq */ + msg->data[2] |= msg->rsp[0] & 0x3; /* rsLUN */ msg->data[3] = cmd; msg->data[4] = IPMI_INVALID_CMD_COMPLETION_CODE; msg->data_size = 5; From 8503fea6761de32b72585001ac94e5f81ce8ca44 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 18:20:16 -0500 Subject: [PATCH 096/231] KVM: VMX: do not use uninitialized gfn_to_hva_cache An uninitialized gfn_to_hva_cache has ghc->len == 0, which causes the accessors to croak very loudly. While a BUG_ON is definitely _too_ loud and a bug on its own, there is indeed an issue of using the caches in such a way that they could not have been initialized, because ghc->gpa == 0 might match and thus kvm_gfn_to_hva_cache_init would not be called. For the vmcs12_cache, the solution is simply to invoke kvm_gfn_to_hva_cache_init unconditionally: we already know that the cache does not match the current VMCS pointer. For the shadow_vmcs12_cache, there is no similar condition that checks the VMCS link pointer, so invalidate the cache on VMXON. 
Fixes: cee66664dcd6 ("KVM: nVMX: Use a gfn_to_hva_cache for vmptrld") Acked-by: David Woodhouse Reported-by: syzbot+7b7db8bb4db6fd5e157b@syzkaller.appspotmail.com Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 1e2f66951566..315fa456d368 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4857,6 +4857,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) if (!vmx->nested.cached_vmcs12) goto out_cached_vmcs12; + vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); if (!vmx->nested.cached_shadow_vmcs12) goto out_cached_shadow_vmcs12; @@ -5289,8 +5290,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; struct vmcs_hdr hdr; - if (ghc->gpa != vmptr && - kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { /* * Reads from an unbacked page return all 1s, * which means that the 32 bits located at the From 5f25e71e311478f9bb0a8ef49e7d8b95316491d7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 18:24:01 -0500 Subject: [PATCH 097/231] KVM: downgrade two BUG_ONs to WARN_ON_ONCE This is not an unrecoverable situation. Users of kvm_read_guest_offset_cached and kvm_write_guest_offset_cached must expect the read/write to fail, and therefore it is possible to just return early with an error value. 
Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6c5083f2eb50..72c4e6b39389 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2931,7 +2931,8 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset; - BUG_ON(len + offset > ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) @@ -2968,7 +2969,8 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, int r; gpa_t gpa = ghc->gpa + offset; - BUG_ON(len + offset > ghc->len); + if (WARN_ON_ONCE(len + offset > ghc->len)) + return -EINVAL; if (slots->generation != ghc->generation) { if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) From 78311a514099932cd8434d5d2194aa94e56ab67c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 17 Nov 2021 07:35:44 -0500 Subject: [PATCH 098/231] KVM: x86: ignore APICv if LAPIC is not enabled Synchronize the two calls to kvm_x86_sync_pir_to_irr. The one in the reenter-guest fast path invoked the callback unconditionally even if LAPIC is present but disabled. In this case, there are no interrupts to deliver, and therefore posted interrupts can be ignored. 
Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a403d92833f..441f4769173e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9849,7 +9849,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (vcpu->arch.apicv_active) + if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) static_call(kvm_x86_sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { From 4916ea8b06a594d9d4e0cf5aa434e9ebd56baafb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Nov 2021 05:23:20 -0500 Subject: [PATCH 099/231] selftests: fix check for circular KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM leaves the source VM in a dead state, so migrating back to the original source VM fails the ioctl. Adjust the test. Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 5ba325cd64bf..a66b9be30239 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -89,7 +89,7 @@ static void test_sev_migrate_from(bool es) { struct kvm_vm *src_vm; struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS]; - int i; + int i, ret; src_vm = sev_vm_create(es); for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) @@ -102,7 +102,10 @@ static void test_sev_migrate_from(bool es) sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd); /* Migrate the guest back to the original VM. 
*/ - sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd); + ret = __sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd); + TEST_ASSERT(ret == -1 && errno == EIO, + "VM that was migrated from should be dead. ret %d, errno: %d\n", ret, + errno); kvm_vm_free(src_vm); for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) From 826bff439ff8dd8670a313c4a8d378fca0d5df3e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 17 Nov 2021 10:49:24 -0500 Subject: [PATCH 100/231] selftests: sev_migrate_tests: free all VMs Ensure that the ASID are freed promptly, which becomes more important when more tests are added to this file. Cc: Peter Gonda Cc: Sean Christopherson Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index a66b9be30239..0cd7e2eaa895 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -149,6 +149,8 @@ static void test_sev_migrate_locking(void) for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) pthread_join(pt[i], NULL); + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + kvm_vm_free(input[i].vm); } static void test_sev_migrate_parameters(void) @@ -165,7 +167,6 @@ static void test_sev_migrate_parameters(void) sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); - ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd); TEST_ASSERT( ret == -1 && errno == EINVAL, @@ -194,6 +195,12 @@ static void test_sev_migrate_parameters(void) TEST_ASSERT(ret == -1 && errno == EINVAL, "Migrations require SEV enabled. 
ret %d, errno: %d\n", ret, errno); + + kvm_vm_free(sev_vm); + kvm_vm_free(sev_es_vm); + kvm_vm_free(sev_es_vm_no_vmsa); + kvm_vm_free(vm_no_vcpu); + kvm_vm_free(vm_no_sev); } int main(int argc, char *argv[]) From 30d7c5d60a886e3c89633ccf0ea4865276a759fe Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 18 Nov 2021 04:41:34 -0500 Subject: [PATCH 101/231] KVM: SEV: expose KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM capability The capability, albeit present, was never exposed via KVM_CHECK_EXTENSION. Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Cc: Peter Gonda Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 441f4769173e..30c4d72bf717 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4133,6 +4133,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SGX_ATTRIBUTE: #endif case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: case KVM_CAP_SREGS2: case KVM_CAP_EXIT_ON_EMULATION_FAILURE: case KVM_CAP_VCPU_ATTRIBUTES: From 2b4a5a5d56881ece3c66b9a9a8943a6f41bd7349 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Nov 2021 01:49:43 +0000 Subject: [PATCH 102/231] KVM: nVMX: Flush current VPID (L1 vs. L2) for KVM_REQ_TLB_FLUSH_GUEST Flush the current VPID when handling KVM_REQ_TLB_FLUSH_GUEST instead of always flushing vpid01. Any TLB flush that is triggered when L2 is active is scoped to L2's VPID (if it has one), e.g. if L2 toggles CR4.PGE and L1 doesn't intercept PGE writes, then KVM's emulation of the TLB flush needs to be applied to L2's VPID. 
Reported-by: Lai Jiangshan Fixes: 07ffaf343e34 ("KVM: nVMX: Sync all PGDs on nested transition with shadow paging") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211125014944.536398-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ba66c171d951..18971cfadd4f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2918,6 +2918,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) } } +static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) + return nested_get_vpid02(vcpu); + return to_vmx(vcpu)->vpid; +} + static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -2930,31 +2937,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) if (enable_ept) ept_sync_context(construct_eptp(vcpu, root_hpa, mmu->shadow_root_level)); - else if (!is_guest_mode(vcpu)) - vpid_sync_context(to_vmx(vcpu)->vpid); else - vpid_sync_context(nested_get_vpid02(vcpu)); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) { /* - * vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in + * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in * vmx_flush_tlb_guest() for an explanation of why this is ok. */ - vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr); + vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr); } static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) { /* - * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 - * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit - * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a + * vpid couldn't be allocated for this vCPU. 
VM-Enter and VM-Exit are + * required to flush GVA->{G,H}PA mappings from the TLB if vpid is * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), * i.e. no explicit INVVPID is necessary. */ - vpid_sync_context(to_vmx(vcpu)->vpid); + vpid_sync_context(vmx_get_current_vpid(vcpu)); } void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu) From 40e5f9080472b614eeedcc5ba678289cd98d70df Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Nov 2021 01:49:43 +0000 Subject: [PATCH 103/231] KVM: nVMX: Abide to KVM_REQ_TLB_FLUSH_GUEST request on nested vmentry/vmexit Like KVM_REQ_TLB_FLUSH_CURRENT, the GUEST variant needs to be serviced at nested transitions, as KVM doesn't track requests for L1 vs L2. E.g. if there's a pending flush when a nested VM-Exit occurs, then the flush was requested in the context of L2 and needs to be handled before switching to L1, otherwise the flush for L2 would effectively be lost. Opportunistically add a helper to handle CURRENT and GUEST as a pair, the logic for when they need to be serviced is identical as both requests are tied to L1 vs. L2, the only difference is the scope of the flush.
Reported-by: Lai Jiangshan Fixes: 07ffaf343e34 ("KVM: nVMX: Sync all PGDs on nested transition with shadow paging") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211125014944.536398-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 8 +++----- arch/x86/kvm/x86.c | 28 ++++++++++++++++++++++++---- arch/x86/kvm/x86.h | 7 +------ 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 315fa456d368..8e55aaef33ee 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3344,8 +3344,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, }; u32 failed_index; - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); @@ -4502,9 +4501,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, (void)nested_get_evmcs_page(vcpu); } - /* Service the TLB flush request for L2 before switching to L1. */ - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); + /* Service pending TLB flush requests for L2 before switching to L1. */ + kvm_service_local_tlb_flush_requests(vcpu); /* * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 30c4d72bf717..028151c309c9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3258,6 +3258,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_tlb_flush_guest)(vcpu); } + +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + static_call(kvm_x86_tlb_flush_current)(vcpu); +} + +/* + * Service "local" TLB flush requests, which are specific to the current MMU + * context. 
In addition to the generic event handling in vcpu_enter_guest(), + * TLB flushes that are targeted at an MMU context also need to be serviced + * prior before nested VM-Enter/VM-Exit. + */ +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); + static void record_steal_time(struct kvm_vcpu *vcpu) { struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; @@ -9649,10 +9672,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* Flushing all ASIDs flushes the current ASID... */ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } - if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) - kvm_vcpu_flush_tlb_current(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) - kvm_vcpu_flush_tlb_guest(vcpu); + kvm_service_local_tlb_flush_requests(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 997669ae9caa..4abcd8d9836d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val, #define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL +void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu); int kvm_check_nested_events(struct kvm_vcpu *vcpu); static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) @@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } -static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) -{ - ++vcpu->stat.tlb_flush; - static_call(kvm_x86_tlb_flush_current)(vcpu); -} - static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); From 
712494de96f35f3e146b36b752c2afe0fdc0f0cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Nov 2021 01:49:44 +0000 Subject: [PATCH 104/231] KVM: nVMX: Emulate guest TLB flush on nested VM-Enter with new vpid12 Fully emulate a guest TLB flush on nested VM-Enter which changes vpid12, i.e. L2's VPID, instead of simply doing INVVPID to flush real hardware's TLB entries for vpid02. From L1's perspective, changing L2's VPID is effectively a TLB flush unless "hardware" has previously cached entries for the new vpid12. Because KVM tracks only a single vpid12, KVM doesn't know if the new vpid12 has been used in the past and so must treat it as a brand new, never been used VPID, i.e. must assume that the new vpid12 represents a TLB flush from L1's perspective. For example, if L1 and L2 share a CR3, the first VM-Enter to L2 (with a VPID) is effectively a TLB flush as hardware/KVM has never seen vpid12 and thus can't have cached entries in the TLB for vpid12. Reported-by: Lai Jiangshan Fixes: 5c614b3583e7 ("KVM: nVMX: nested VPID emulation") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211125014944.536398-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8e55aaef33ee..64f2828035c2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, WARN_ON(!enable_vpid); /* - * If VPID is enabled and used by vmc12, but L2 does not have a unique - * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate - * a VPID for L2, flush the current context as the effective ASID is - * common to both L1 and L2. 
- * - * Defer the flush so that it runs after vmcs02.EPTP has been set by - * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid - * redundant flushes further down the nested pipeline. - * - * If a TLB flush isn't required due to any of the above, and vpid12 is - * changing then the new "virtual" VPID (vpid12) will reuse the same - * "real" VPID (vpid02), and so needs to be flushed. There's no direct - * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for - * all nested vCPUs. Remember, a flush on VM-Enter does not invalidate - * guest-physical mappings, so there is no need to sync the nEPT MMU. + * VPID is enabled and in use by vmcs12. If vpid12 is changing, then + * emulate a guest TLB flush as KVM does not track vpid12 history nor + * is the VPID incorporated into the MMU context. I.e. KVM must assume + * that the new vpid12 has never been used and thus represents a new + * guest ASID that cannot have entries in the TLB. */ - if (!nested_has_guest_tlb_tag(vcpu)) { - kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } else if (is_vmenter && - vmcs12->virtual_processor_id != vmx->nested.last_vpid) { + if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - vpid_sync_context(nested_get_vpid02(vcpu)); + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); + return; } + + /* + * If VPID is enabled, used by vmc12, and vpid12 is not changing but + * does not have a unique TLB tag (ASID), i.e. EPT is disabled and + * KVM was unable to allocate a VPID for L2, flush the current context + * as the effective ASID is common to both L1 and L2. 
+ */ + if (!nested_has_guest_tlb_tag(vcpu)) + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) From 6c1186430a808f97e2052bd5d9eff12c5d5defb0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 22 Nov 2021 18:58:17 +0100 Subject: [PATCH 105/231] KVM: selftests: Avoid KVM_SET_CPUID2 after KVM_RUN in hyperv_features test hyperv_features's sole purpose is to test access to various Hyper-V MSRs and hypercalls with different CPUID data. As KVM_SET_CPUID2 after KVM_RUN is deprecated and soon-to-be forbidden, avoid it by re-creating test VM for each sub-test. Signed-off-by: Vitaly Kuznetsov Message-Id: <20211122175818.608220-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/hyperv_features.c | 140 +++++++++--------- 1 file changed, 71 insertions(+), 69 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_features.c b/tools/testing/selftests/kvm/x86_64/hyperv_features.c index 91d88aaa9899..672915ce73d8 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_features.c @@ -165,10 +165,10 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid, vcpu_set_cpuid(vm, VCPU_ID, cpuid); } -static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, - struct kvm_cpuid2 *best) +static void guest_test_msrs_access(void) { struct kvm_run *run; + struct kvm_vm *vm; struct ucall uc; int stage = 0, r; struct kvm_cpuid_entry2 feat = { @@ -180,11 +180,34 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, struct kvm_cpuid_entry2 dbg = { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }; - struct kvm_enable_cap cap = {0}; - - run = vcpu_state(vm, VCPU_ID); + struct kvm_cpuid2 *best; + vm_vaddr_t msr_gva; + struct kvm_enable_cap cap = { + .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, + .args = {1} + }; + struct msr_data *msr; while (true) { + vm = 
vm_create_default(VCPU_ID, 0, guest_msr); + + msr_gva = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); + msr = addr_gva2hva(vm, msr_gva); + + vcpu_args_set(vm, VCPU_ID, 1, msr_gva); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); + + run = vcpu_state(vm, VCPU_ID); + switch (stage) { case 0: /* @@ -315,6 +338,7 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, * capability enabled and guest visible CPUID bit unset. */ cap.cap = KVM_CAP_HYPERV_SYNIC2; + cap.args[0] = 0; vcpu_enable_cap(vm, VCPU_ID, &cap); break; case 22: @@ -461,9 +485,9 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)\n", - uc.args[1], stage); + TEST_ASSERT(uc.args[1] == 0, + "Unexpected stage: %ld (0 expected)\n", + uc.args[1]); break; case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], @@ -474,13 +498,14 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr, } stage++; + kvm_vm_free(vm); } } -static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall, - void *input, void *output, struct kvm_cpuid2 *best) +static void guest_test_hcalls_access(void) { struct kvm_run *run; + struct kvm_vm *vm; struct ucall uc; int stage = 0, r; struct kvm_cpuid_entry2 feat = { @@ -493,10 +518,38 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall struct kvm_cpuid_entry2 dbg = { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }; - - run = vcpu_state(vm, VCPU_ID); + struct kvm_enable_cap cap = { + .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, + .args = {1} + }; + vm_vaddr_t hcall_page, hcall_params; + 
struct hcall_data *hcall; + struct kvm_cpuid2 *best; while (true) { + vm = vm_create_default(VCPU_ID, 0, guest_hcall); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vm, VCPU_ID); + vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); + + /* Hypercall input/output */ + hcall_page = vm_vaddr_alloc_pages(vm, 2); + hcall = addr_gva2hva(vm, hcall_page); + memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); + + hcall_params = vm_vaddr_alloc_page(vm); + memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); + + vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); + vcpu_enable_cap(vm, VCPU_ID, &cap); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + best = kvm_get_supported_hv_cpuid(); + + run = vcpu_state(vm, VCPU_ID); + switch (stage) { case 0: hcall->control = 0xdeadbeef; @@ -606,9 +659,9 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall switch (get_ucall(vm, VCPU_ID, &uc)) { case UCALL_SYNC: - TEST_ASSERT(uc.args[1] == stage, - "Unexpected stage: %ld (%d expected)\n", - uc.args[1], stage); + TEST_ASSERT(uc.args[1] == 0, + "Unexpected stage: %ld (0 expected)\n", + uc.args[1]); break; case UCALL_ABORT: TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], @@ -619,66 +672,15 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall } stage++; + kvm_vm_free(vm); } } int main(void) { - struct kvm_cpuid2 *best; - struct kvm_vm *vm; - vm_vaddr_t msr_gva, hcall_page, hcall_params; - struct kvm_enable_cap cap = { - .cap = KVM_CAP_HYPERV_ENFORCE_CPUID, - .args = {1} - }; - - /* Test MSRs */ - vm = vm_create_default(VCPU_ID, 0, guest_msr); - - msr_gva = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize()); - vcpu_args_set(vm, VCPU_ID, 1, msr_gva); - vcpu_enable_cap(vm, VCPU_ID, &cap); - - vcpu_set_hv_cpuid(vm, VCPU_ID); - - best = kvm_get_supported_hv_cpuid(); - - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); - 
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler); - pr_info("Testing access to Hyper-V specific MSRs\n"); - guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva), - best); - kvm_vm_free(vm); - - /* Test hypercalls */ - vm = vm_create_default(VCPU_ID, 0, guest_hcall); - - vm_init_descriptor_tables(vm); - vcpu_init_descriptor_tables(vm, VCPU_ID); - vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler); - - /* Hypercall input/output */ - hcall_page = vm_vaddr_alloc_pages(vm, 2); - memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize()); - - hcall_params = vm_vaddr_alloc_page(vm); - memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize()); - - vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params); - vcpu_enable_cap(vm, VCPU_ID, &cap); - - vcpu_set_hv_cpuid(vm, VCPU_ID); - - best = kvm_get_supported_hv_cpuid(); + guest_test_msrs_access(); pr_info("Testing access to Hyper-V hypercalls\n"); - guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params), - addr_gva2hva(vm, hcall_page), - addr_gva2hva(vm, hcall_page) + getpagesize(), - best); - - kvm_vm_free(vm); + guest_test_hcalls_access(); } From feb627e8d6f69c9a319fe279710959efb3eba873 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 22 Nov 2021 18:58:18 +0100 Subject: [PATCH 106/231] KVM: x86: Forbid KVM_SET_CPUID{,2} after KVM_RUN Commit 63f5a1909f9e ("KVM: x86: Alert userspace that KVM_SET_CPUID{,2} after KVM_RUN is broken") officially deprecated KVM_SET_CPUID{,2} ioctls after first successful KVM_RUN and promissed to make this sequence forbiden in 5.16. It's time to fulfil the promise. 
Signed-off-by: Vitaly Kuznetsov Message-Id: <20211122175818.608220-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 28 +++++++++++----------------- arch/x86/kvm/x86.c | 19 +++++++++++++++++++ 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c839ee1282c..0c44581721b0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5025,6 +5025,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) /* * Invalidate all MMU roles to force them to reinitialize as CPUID * information is factored into reserved bit calculations. + * + * Correctly handling multiple vCPU models with respect to paging and + * physical address properties) in a single VM would require tracking + * all relevant CPUID information in kvm_mmu_page_role. That is very + * undesirable as it would increase the memory requirements for + * gfn_track (see struct kvm_mmu_page_role comments). For now that + * problem is swept under the rug; KVM's CPUID API is horrific and + * it's all but impossible to solve it without introducing a new API. */ vcpu->arch.root_mmu.mmu_role.ext.valid = 0; vcpu->arch.guest_mmu.mmu_role.ext.valid = 0; @@ -5032,24 +5040,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); /* - * KVM does not correctly handle changing guest CPUID after KVM_RUN, as - * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't - * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page - * faults due to reusing SPs/SPTEs. Alert userspace, but otherwise - * sweep the problem under the rug. - * - * KVM's horrific CPUID ABI makes the problem all but impossible to - * solve, as correctly handling multiple vCPU models (with respect to - * paging and physical address properties) in a single VM would require - * tracking all relevant CPUID information in kvm_mmu_page_role. 
That - * is very undesirable as it would double the memory requirements for - * gfn_track (see struct kvm_mmu_page_role comments), and in practice - * no sane VMM mucks with the core vCPU model on the fly. + * Changing guest CPUID after KVM_RUN is forbidden, see the comment in + * kvm_arch_vcpu_ioctl(). */ - if (vcpu->arch.last_vmentry_cpu != -1) { - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n"); - pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n"); - } + KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm); } void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 028151c309c9..817898eab7c3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5148,6 +5148,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; + /* + * KVM does not correctly handle changing guest CPUID after KVM_RUN, as + * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't + * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page + * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with + * the core vCPU model on the fly, so fail. + */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; @@ -5158,6 +5169,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_cpuid2 __user *cpuid_arg = argp; struct kvm_cpuid2 cpuid; + /* + * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in + * KVM_SET_CPUID case above. 
+ */ + r = -EINVAL; + if (vcpu->arch.last_vmentry_cpu != -1) + goto out; + r = -EFAULT; if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; From 908fa88e420f30dde6d80f092795a18ec72ca6d3 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 23 Nov 2021 14:59:53 +0100 Subject: [PATCH 107/231] KVM: selftests: Make sure kvm_create_max_vcpus test won't hit RLIMIT_NOFILE With the elevated 'KVM_CAP_MAX_VCPUS' value kvm_create_max_vcpus test may hit RLIMIT_NOFILE limits: # ./kvm_create_max_vcpus KVM_CAP_MAX_VCPU_ID: 4096 KVM_CAP_MAX_VCPUS: 1024 Testing creating 1024 vCPUs, with IDs 0...1023. /dev/kvm not available (errno: 24), skipping test Adjust RLIMIT_NOFILE limits to make sure KVM_CAP_MAX_VCPUS fds can be opened. Note, raising hard limit ('rlim_max') requires CAP_SYS_RESOURCE capability which is generally not needed to run kvm selftests (but without raising the limit the test is doomed to fail anyway). Signed-off-by: Vitaly Kuznetsov Message-Id: <20211123135953.667434-1-vkuznets@redhat.com> [Skip the test if the hard limit cannot be raised. - Paolo] Reviewed-by: Sean Christopherson Tested-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- .../selftests/kvm/kvm_create_max_vcpus.c | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c index f968dfd4ee88..aed9dc3ca1e9 100644 --- a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "test_util.h" @@ -40,10 +41,39 @@ int main(int argc, char *argv[]) { int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + /* + * Number of file descriptors required, KVM_CAP_MAX_VCPUS for vCPU fds + + * an arbitrary number for everything else.
+ */ + int nr_fds_wanted = kvm_max_vcpus + 100; + struct rlimit rl; pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); + /* + * Check that we're allowed to open nr_fds_wanted file descriptors and + * try raising the limits if needed. + */ + TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!"); + + if (rl.rlim_cur < nr_fds_wanted) { + rl.rlim_cur = nr_fds_wanted; + if (rl.rlim_max < nr_fds_wanted) { + int old_rlim_max = rl.rlim_max; + rl.rlim_max = nr_fds_wanted; + + int r = setrlimit(RLIMIT_NOFILE, &rl); + if (r < 0) { + printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n", + old_rlim_max, nr_fds_wanted); + exit(KSFT_SKIP); + } + } else { + TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!"); + } + } + /* * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID From 12ec33a705749e18d9588b0a0e69e02821371156 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:43 +0800 Subject: [PATCH 108/231] KVM: X86: Fix when shadow_root_level=5 && guest root_level<4 If the is an L1 with nNPT in 32bit, the shadow walk starts with pae_root. 
Fixes: a717a780fc4e ("KVM: x86/mmu: Support shadowing NPT when 5-level paging is enabled in host) Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-2-jiangshanlai@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0c44581721b0..d7ae369ec8c2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2173,10 +2173,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato iterator->shadow_addr = root; iterator->level = vcpu->arch.mmu->shadow_root_level; - if (iterator->level == PT64_ROOT_4LEVEL && + if (iterator->level >= PT64_ROOT_4LEVEL && vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL && !vcpu->arch.mmu->direct_map) - --iterator->level; + iterator->level = PT32E_ROOT_LEVEL; if (iterator->level == PT32E_ROOT_LEVEL) { /* From 05b29633c7a956d5675f5fbba70db0d26aa5e73e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 24 Nov 2021 20:20:46 +0800 Subject: [PATCH 109/231] KVM: X86: Use vcpu->arch.walk_mmu for kvm_mmu_invlpg() INVLPG operates on guest virtual address, which are represented by vcpu->arch.walk_mmu. In nested virtualization scenarios, kvm_mmu_invlpg() was using the wrong MMU structure; if L2's invlpg were emulated by L0 (in practice, it hardly happen) when nested two-dimensional paging is enabled, the call to ->tlb_flush_gva() would be skipped and the hardware TLB entry would not be invalidated. 
Signed-off-by: Lai Jiangshan Message-Id: <20211124122055.64424-5-jiangshanlai@gmail.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d7ae369ec8c2..5942e9c6dd6e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5363,7 +5363,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); From 617a89484debcd4e7999796d693cf0b77d2519de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Nov 2021 14:38:14 +0000 Subject: [PATCH 110/231] io_uring: fail cancellation for EXITING tasks WARNING: CPU: 1 PID: 20 at fs/io_uring.c:6269 io_try_cancel_userdata+0x3c5/0x640 fs/io_uring.c:6269 CPU: 1 PID: 20 Comm: kworker/1:0 Not tainted 5.16.0-rc1-syzkaller #0 Workqueue: events io_fallback_req_func RIP: 0010:io_try_cancel_userdata+0x3c5/0x640 fs/io_uring.c:6269 Call Trace: io_req_task_link_timeout+0x6b/0x1e0 fs/io_uring.c:6886 io_fallback_req_func+0xf9/0x1ae fs/io_uring.c:1334 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445 kthread+0x405/0x4f0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:295 We need original task's context to do cancellations, so if it's dying and the callback is executed in a fallback mode, fail the cancellation attempt. 
Fixes: 89b263f6d56e6 ("io_uring: run linked timeouts from task_work") Cc: stable@kernel.org # 5.15+ Reported-by: syzbot+ab0cfe96c2b3cd1c1153@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4c41c5f379c6941ad5a07cd48cb66ed62199cf7e.1637937097.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a4c508a1e0cf..7dd112d44adf 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6882,10 +6882,11 @@ static inline struct file *io_file_get(struct io_ring_ctx *ctx, static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { struct io_kiocb *prev = req->timeout.prev; - int ret; + int ret = -ENOENT; if (prev) { - ret = io_try_cancel_userdata(req, prev->user_data); + if (!(req->task->flags & PF_EXITING)) + ret = io_try_cancel_userdata(req, prev->user_data); io_req_complete_post(req, ret ?: -ETIME, 0); io_put_req(prev); } else { From 6af3f48bf6156a7f02e91aca64e2927c4bebda03 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 26 Nov 2021 14:38:15 +0000 Subject: [PATCH 111/231] io_uring: fix link traversal locking WARNING: inconsistent lock state 5.16.0-rc2-syzkaller #0 Not tainted inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage. ffff888078e11418 (&ctx->timeout_lock ){?.+.}-{2:2} , at: io_timeout_fn+0x6f/0x360 fs/io_uring.c:5943 {HARDIRQ-ON-W} state was registered at: [...] 
spin_unlock_irq include/linux/spinlock.h:399 [inline] __io_poll_remove_one fs/io_uring.c:5669 [inline] __io_poll_remove_one fs/io_uring.c:5654 [inline] io_poll_remove_one+0x236/0x870 fs/io_uring.c:5680 io_poll_remove_all+0x1af/0x235 fs/io_uring.c:5709 io_ring_ctx_wait_and_kill+0x1cc/0x322 fs/io_uring.c:9534 io_uring_release+0x42/0x46 fs/io_uring.c:9554 __fput+0x286/0x9f0 fs/file_table.c:280 task_work_run+0xdd/0x1a0 kernel/task_work.c:164 exit_task_work include/linux/task_work.h:32 [inline] do_exit+0xc14/0x2b40 kernel/exit.c:832 674ee8e1b4a41 ("io_uring: correct link-list traversal locking") fixed a data race but introduced a possible deadlock and inconsistency in irq states. E.g. io_poll_remove_all() spin_lock_irq(timeout_lock) io_poll_remove_one() spin_lock/unlock_irq(poll_lock); spin_unlock_irq(timeout_lock) Another type of problem is freeing a request while holding ->timeout_lock, which may lead to a deadlock in io_commit_cqring() -> io_flush_timeouts() and other places. Having 3 nested locks is also too ugly. Add io_match_task_safe(), which would briefly take and release timeout_lock for race prevention inside, so the actual request cancellation / free / etc. code doesn't have it taken. 
Reported-by: syzbot+ff49a3059d49b0ca0eec@syzkaller.appspotmail.com Reported-by: syzbot+847f02ec20a6609a328b@syzkaller.appspotmail.com Reported-by: syzbot+3368aadcd30425ceb53b@syzkaller.appspotmail.com Reported-by: syzbot+51ce8887cdef77c9ac83@syzkaller.appspotmail.com Reported-by: syzbot+3cb756a49d2f394a9ee3@syzkaller.appspotmail.com Fixes: 674ee8e1b4a41 ("io_uring: correct link-list traversal locking") Cc: stable@kernel.org # 5.15+ Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/397f7ebf3f4171f1abe41f708ac1ecb5766f0b68.1637937097.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 60 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7dd112d44adf..75841b919dce 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1278,6 +1278,7 @@ static void io_refs_resurrect(struct percpu_ref *ref, struct completion *compl) static bool io_match_task(struct io_kiocb *head, struct task_struct *task, bool cancel_all) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *req; @@ -1293,6 +1294,44 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static bool io_match_linked(struct io_kiocb *head) +{ + struct io_kiocb *req; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* + * As io_match_task() but protected against racing with linked timeouts. + * User must not hold timeout_lock. 
+ */ +static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) +{ + bool matched; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + if (head->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = head->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irq(&ctx->timeout_lock); + matched = io_match_linked(head); + spin_unlock_irq(&ctx->timeout_lock); + } else { + matched = io_match_linked(head); + } + return matched; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; @@ -5699,17 +5738,15 @@ static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, int posted = 0, i; spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list; list = &ctx->cancel_hash[i]; hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task(req, tsk, cancel_all)) + if (io_match_task_safe(req, tsk, cancel_all)) posted += io_poll_remove_one(req); } } - spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); if (posted) @@ -9565,19 +9602,8 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - bool ret; - if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - struct io_ring_ctx *ctx = req->ctx; - - /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); - ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock_irq(&ctx->timeout_lock); - } else { - ret = io_match_task(req, cancel->task, cancel->all); - } - return ret; + return io_match_task_safe(req, cancel->task, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, @@ -9588,14 +9614,12 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, LIST_HEAD(list); 
spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; From 98b26a0e766724957b48301e3a6f7093a142d54b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Nov 2021 09:53:23 -0700 Subject: [PATCH 112/231] block: call rq_qos_done() before ref check in batch completions We need to call rq_qos_done() regardless of whether or not we're freeing the request or not, as the reference count doesn't cover the IO completion tracking. Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Reported-by: Shinichiro Kawasaki Reported-by: Kenneth R. Crudup Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8799fa73ef34..8874a63ae952 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -860,13 +860,14 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) if (iob->need_ts) __blk_mq_end_request_acct(rq, now); + rq_qos_done(rq->q, rq); + WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!refcount_dec_and_test(&rq->ref)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); - rq_qos_done(rq->q, rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) From d422f40163087408b56290156ba233fc5ada53e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 26 Nov 2021 09:57:32 -0700 Subject: [PATCH 113/231] zram: only make zram_wb_devops for CONFIG_ZRAM_WRITEBACK If writeback isn't configured, then we get the following warning when compiling zram: drivers/block/zram/zram_drv.c:1824:45: warning: unused variable 'zram_wb_devops' [-Wunused-const-variable] Make sure we only define the 
block_device_operations if that option is enabled. Link: https://lore.kernel.org/lkml/202111261614.gCJMqcyh-lkp@intel.com/ Reported-by: kernel test robot Signed-off-by: Jens Axboe --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 08d7953ec5f1..25071126995b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1853,12 +1853,14 @@ static const struct block_device_operations zram_devops = { .owner = THIS_MODULE }; +#ifdef CONFIG_ZRAM_WRITEBACK static const struct block_device_operations zram_wb_devops = { .open = zram_open, .submit_bio = zram_submit_bio, .swap_slot_free_notify = zram_slot_free_notify, .owner = THIS_MODULE }; +#endif static DEVICE_ATTR_WO(compact); static DEVICE_ATTR_RW(disksize); From 6cb206508b621a9a0a2c35b60540e399225c8243 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 13:35:26 -0500 Subject: [PATCH 114/231] tracing: Check pid filtering when creating events When pid filtering is activated in an instance, all of the events trace files for that instance has the PID_FILTER flag set. This determines whether or not pid filtering needs to be done on the event, otherwise the event is executed as normal. If pid filtering is enabled when an event is created (via a dynamic event or modules), its flag is not updated to reflect the current state, and the events are not filtered properly. 
Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4021b9a79f93..f8965fd50d3b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2678,12 +2678,24 @@ static struct trace_event_file * trace_create_new_event(struct trace_event_call *call, struct trace_array *tr) { + struct trace_pid_list *no_pid_list; + struct trace_pid_list *pid_list; struct trace_event_file *file; + unsigned int first; file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return NULL; + pid_list = rcu_dereference_protected(tr->filtered_pids, + lockdep_is_held(&event_mutex)); + no_pid_list = rcu_dereference_protected(tr->filtered_no_pids, + lockdep_is_held(&event_mutex)); + + if (!trace_pid_list_first(pid_list, &first) || + !trace_pid_list_first(pid_list, &first)) + file->flags |= EVENT_FILE_FL_PID_FILTER; + file->event_call = call; file->tr = tr; atomic_set(&file->sm_ref, 0); From 21e96a2035db43fc72f7023c4577a63ca606de86 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 23 Nov 2021 11:55:06 +0100 Subject: [PATCH 115/231] iommu/vt-d: Remove unused PASID_DISABLED The macro is unused after commit 00ecd5401349a so it can be removed. 
Reported-by: Linus Torvalds Fixes: 00ecd5401349a ("iommu/vt-d: Clean up unused PASID updating functions") Signed-off-by: Joerg Roedel Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20211123105507.7654-2-joro@8bytes.org --- arch/x86/include/asm/fpu/api.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 6053674f9132..c2767a6a387e 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -102,12 +102,6 @@ extern void switch_fpu_return(void); */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); -/* - * Tasks that are not using SVA have mm->pasid set to zero to note that they - * will not have the valid bit set in MSR_IA32_PASID while they are running. - */ -#define PASID_DISABLED 0 - /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); From 717e88aad37befedfd531378b632e794e24e9afb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 23 Nov 2021 11:55:07 +0100 Subject: [PATCH 116/231] iommu/amd: Clarify AMD IOMMUv2 initialization messages The messages printed on the initialization of the AMD IOMMUv2 driver have caused some confusion in the past. Clarify the messages to lower the confusion in the future. 
Cc: stable@vger.kernel.org Signed-off-by: Joerg Roedel Link: https://lore.kernel.org/r/20211123105507.7654-3-joro@8bytes.org --- drivers/iommu/amd/iommu_v2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 13cbeb997cc1..58da08cc3d01 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -929,10 +929,8 @@ static int __init amd_iommu_v2_init(void) { int ret; - pr_info("AMD IOMMUv2 driver by Joerg Roedel \n"); - if (!amd_iommu_v2_supported()) { - pr_info("AMD IOMMUv2 functionality not available on this system\n"); + pr_info("AMD IOMMUv2 functionality not available on this system - This is not a bug.\n"); /* * Load anyway to provide the symbols to other modules * which may use AMD IOMMUv2 optionally. @@ -947,6 +945,8 @@ static int __init amd_iommu_v2_init(void) amd_iommu_register_ppr_notifier(&ppr_nb); + pr_info("AMD IOMMUv2 loaded and initialized\n"); + return 0; out: From f7ff3cff3527ff1e70cad8d2fe7c0c7b6f83120a Mon Sep 17 00:00:00 2001 From: Alex Bee Date: Wed, 24 Nov 2021 03:13:25 +0100 Subject: [PATCH 117/231] iommu/rockchip: Fix PAGE_DESC_HI_MASKs for RK3568 With the submission of iommu driver for RK3568 a subtle bug was introduced: PAGE_DESC_HI_MASK1 and PAGE_DESC_HI_MASK2 have to be the other way arround - that leads to random errors, especially when addresses beyond 32 bit are used. Fix it. 
Fixes: c55356c534aa ("iommu: rockchip: Add support for iommu v2") Signed-off-by: Alex Bee Tested-by: Peter Geis Reviewed-by: Heiko Stuebner Tested-by: Dan Johansen Reviewed-by: Benjamin Gaignard Link: https://lore.kernel.org/r/20211124021325.858139-1-knaerzche@gmail.com Signed-off-by: Joerg Roedel --- drivers/iommu/rockchip-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 5cb260820eda..7f23ad61c094 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -200,8 +200,8 @@ static inline phys_addr_t rk_dte_pt_address(u32 dte) #define DTE_HI_MASK2 GENMASK(7, 4) #define DTE_HI_SHIFT1 24 /* shift bit 8 to bit 32 */ #define DTE_HI_SHIFT2 32 /* shift bit 4 to bit 36 */ -#define PAGE_DESC_HI_MASK1 GENMASK_ULL(39, 36) -#define PAGE_DESC_HI_MASK2 GENMASK_ULL(35, 32) +#define PAGE_DESC_HI_MASK1 GENMASK_ULL(35, 32) +#define PAGE_DESC_HI_MASK2 GENMASK_ULL(39, 36) static inline phys_addr_t rk_dte_pt_address_v2(u32 dte) { From 4e5973dd2725bb30c3db622f7d73f7a5864ce718 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 26 Nov 2021 21:55:55 +0800 Subject: [PATCH 118/231] iommu/vt-d: Fix an unbalanced rcu_read_lock/rcu_read_unlock() If we return -EOPNOTSUPP, the rcu lock remains lock. This is spurious. Go through the end of the function instead. This way, the missing 'rcu_read_unlock()' is called. 
Fixes: 7afd7f6aa21a ("iommu/vt-d: Check FL and SL capability sanity in scalable mode") Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/40cc077ca5f543614eab2a10e84d29dd190273f6.1636217517.git.christophe.jaillet@wanadoo.fr Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211126135556.397932-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/cap_audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/cap_audit.c b/drivers/iommu/intel/cap_audit.c index b39d223926a4..71596fc62822 100644 --- a/drivers/iommu/intel/cap_audit.c +++ b/drivers/iommu/intel/cap_audit.c @@ -144,6 +144,7 @@ static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) { struct dmar_drhd_unit *d; struct intel_iommu *i; + int rc = 0; rcu_read_lock(); if (list_empty(&dmar_drhd_units)) @@ -169,11 +170,11 @@ static int cap_audit_static(struct intel_iommu *iommu, enum cap_audit_type type) */ if (intel_cap_smts_sanity() && !intel_cap_flts_sanity() && !intel_cap_slts_sanity()) - return -EOPNOTSUPP; + rc = -EOPNOTSUPP; out: rcu_read_unlock(); - return 0; + return rc; } int intel_cap_audit(enum cap_audit_type type, struct intel_iommu *iommu) From 86dc40c7ea9c22f64571e0e45f695de73a0e2644 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 26 Nov 2021 21:55:56 +0800 Subject: [PATCH 119/231] iommu/vt-d: Fix unmap_pages support When supporting only the .map and .unmap callbacks of iommu_ops, the IOMMU driver can make assumptions about the size and alignment used for mappings based on the driver provided pgsize_bitmap. VT-d previously used essentially PAGE_MASK for this bitmap as any power of two mapping was acceptably filled by native page sizes. However, with the .map_pages and .unmap_pages interface we're now getting page-size and count arguments. 
If we simply combine these as (page-size * count) and make use of the previous map/unmap functions internally, any size and alignment assumptions are very different. As an example, a given vfio device assignment VM will often create a 4MB mapping at IOVA pfn [0x3fe00 - 0x401ff]. On a system that does not support IOMMU super pages, the unmap_pages interface will ask to unmap 1024 4KB pages at the base IOVA. dma_pte_clear_level() will recurse down to level 2 of the page table where the first half of the pfn range exactly matches the entire pte level. We clear the pte, increment the pfn by the level size, but (oops) the next pte is on a new page, so we exit the loop and pop back up a level. When we then update the pfn based on that higher level, we seem to assume that the previous pfn value was at the start of the level. In this case the level size is 256K pfns, which we add to the base pfn and get a result of 0x7fe00, which is clearly greater than 0x401ff, so we're done. Meanwhile we never cleared the ptes for the remainder of the range. When the VM remaps this range, we're overwriting valid ptes and the VT-d driver complains loudly, as reported by the user report linked below. The fix for this seems relatively simple: if each iteration of the loop in dma_pte_clear_level() is assumed to clear to the end of the level pte page, then our next pfn should be calculated from level_pfn rather than our working pfn. 
Fixes: 3f34f1259776 ("iommu/vt-d: Implement map/unmap_pages() iommu_ops callback") Reported-by: Ajay Garg Signed-off-by: Alex Williamson Tested-by: Giovanni Cabiddu Link: https://lore.kernel.org/all/20211002124012.18186-1-ajaygargnsit@gmail.com/ Link: https://lore.kernel.org/r/163659074748.1617923.12716161410774184024.stgit@omen Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20211126135556.397932-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel --- drivers/iommu/intel/iommu.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0bde0c8b4126..b6a8f3282411 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1339,13 +1339,11 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, pte = &pte[pfn_level_offset(pfn, level)]; do { - unsigned long level_pfn; + unsigned long level_pfn = pfn & level_mask(level); if (!dma_pte_present(pte)) goto next; - level_pfn = pfn & level_mask(level); - /* If range covers entire pagetable, free it */ if (start_pfn <= level_pfn && last_pfn >= level_pfn + level_size(level) - 1) { @@ -1366,7 +1364,7 @@ static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level, freelist); } next: - pfn += level_size(level); + pfn = level_pfn + level_size(level); } while (!first_pte_in_page(++pte) && pfn <= last_pfn); if (first_pte) From a55f224ff5f238013de8762c4287117e47b86e22 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 26 Nov 2021 17:34:42 -0500 Subject: [PATCH 120/231] tracing: Fix pid filtering when triggers are attached If a event is filtered by pid and a trigger that requires processing of the event to happen is a attached to the event, the discard portion does not take the pid filtering into account, and the event will then be recorded when it should not have been. 
Cc: stable@vger.kernel.org Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6b60ab9475ed..38715aa6cfdf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1366,14 +1366,26 @@ __event_trigger_test_discard(struct trace_event_file *file, if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, buffer, entry, event); - if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || - (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && - !filter_match_preds(file->filter, entry))) { - __trace_event_discard_commit(buffer, event); - return true; - } + if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | + EVENT_FILE_FL_FILTERED | + EVENT_FILE_FL_PID_FILTER)))) + return false; + + if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) + goto discard; + + if (file->flags & EVENT_FILE_FL_FILTERED && + !filter_match_preds(file->filter, entry)) + goto discard; + + if ((file->flags & EVENT_FILE_FL_PID_FILTER) && + trace_event_ignore_this_pid(file)) + goto discard; return false; + discard: + __trace_event_discard_commit(buffer, event); + return true; } /** From 1d0254e6b47e73222fd3d6ae95cccbaafe5b3ecf Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 22 Nov 2021 10:47:37 +0800 Subject: [PATCH 121/231] io_uring: fix soft lockup when call __io_remove_buffers I got issue as follows: [ 567.094140] __io_remove_buffers: [1]start ctx=0xffff8881067bf000 bgid=65533 buf=0xffff8881fefe1680 [ 594.360799] watchdog: BUG: soft lockup - CPU#2 stuck for 26s! 
[kworker/u32:5:108] [ 594.364987] Modules linked in: [ 594.365405] irq event stamp: 604180238 [ 594.365906] hardirqs last enabled at (604180237): [] _raw_spin_unlock_irqrestore+0x2d/0x50 [ 594.367181] hardirqs last disabled at (604180238): [] sysvec_apic_timer_interrupt+0xb/0xc0 [ 594.368420] softirqs last enabled at (569080666): [] __do_softirq+0x654/0xa9e [ 594.369551] softirqs last disabled at (569080575): [] irq_exit_rcu+0x1ca/0x250 [ 594.370692] CPU: 2 PID: 108 Comm: kworker/u32:5 Tainted: G L 5.15.0-next-20211112+ #88 [ 594.371891] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 [ 594.373604] Workqueue: events_unbound io_ring_exit_work [ 594.374303] RIP: 0010:_raw_spin_unlock_irqrestore+0x33/0x50 [ 594.375037] Code: 48 83 c7 18 53 48 89 f3 48 8b 74 24 10 e8 55 f5 55 fd 48 89 ef e8 ed a7 56 fd 80 e7 02 74 06 e8 43 13 7b fd fb bf 01 00 00 00 f8 78 474 [ 594.377433] RSP: 0018:ffff888101587a70 EFLAGS: 00000202 [ 594.378120] RAX: 0000000024030f0d RBX: 0000000000000246 RCX: 1ffffffff2f09106 [ 594.379053] RDX: 0000000000000000 RSI: ffffffff9449f0e0 RDI: 0000000000000001 [ 594.379991] RBP: ffffffff9586cdc0 R08: 0000000000000001 R09: fffffbfff2effcab [ 594.380923] R10: ffffffff977fe557 R11: fffffbfff2effcaa R12: ffff8881b8f3def0 [ 594.381858] R13: 0000000000000246 R14: ffff888153a8b070 R15: 0000000000000000 [ 594.382787] FS: 0000000000000000(0000) GS:ffff888399c00000(0000) knlGS:0000000000000000 [ 594.383851] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 594.384602] CR2: 00007fcbe71d2000 CR3: 00000000b4216000 CR4: 00000000000006e0 [ 594.385540] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 594.386474] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 594.387403] Call Trace: [ 594.387738] [ 594.388042] find_and_remove_object+0x118/0x160 [ 594.389321] delete_object_full+0xc/0x20 [ 594.389852] kfree+0x193/0x470 [ 594.390275] 
__io_remove_buffers.part.0+0xed/0x147 [ 594.390931] io_ring_ctx_free+0x342/0x6a2 [ 594.392159] io_ring_exit_work+0x41e/0x486 [ 594.396419] process_one_work+0x906/0x15a0 [ 594.399185] worker_thread+0x8b/0xd80 [ 594.400259] kthread+0x3bf/0x4a0 [ 594.401847] ret_from_fork+0x22/0x30 [ 594.402343] Message from syslogd@localhost at Nov 13 09:09:54 ... kernel:watchdog: BUG: soft lockup - CPU#2 stuck for 26s! [kworker/u32:5:108] [ 596.793660] __io_remove_buffers: [2099199]start ctx=0xffff8881067bf000 bgid=65533 buf=0xffff8881fefe1680 We can reproduce this issue by follow syzkaller log: r0 = syz_io_uring_setup(0x401, &(0x7f0000000300), &(0x7f0000003000/0x2000)=nil, &(0x7f0000ff8000/0x4000)=nil, &(0x7f0000000280)=0x0, &(0x7f0000000380)=0x0) sendmsg$ETHTOOL_MSG_FEATURES_SET(0xffffffffffffffff, &(0x7f0000003080)={0x0, 0x0, &(0x7f0000003040)={&(0x7f0000000040)=ANY=[], 0x18}}, 0x0) syz_io_uring_submit(r1, r2, &(0x7f0000000240)=@IORING_OP_PROVIDE_BUFFERS={0x1f, 0x5, 0x0, 0x401, 0x1, 0x0, 0x100, 0x0, 0x1, {0xfffd}}, 0x0) io_uring_enter(r0, 0x3a2d, 0x0, 0x0, 0x0, 0x0) The reason above issue is 'buf->list' has 2,100,000 nodes, occupied cpu lead to soft lockup. To solve this issue, we need add schedule point when do while loop in '__io_remove_buffers'. After add schedule point we do regression, get follow data. 
[ 240.141864] __io_remove_buffers: [1]start ctx=0xffff888170603000 bgid=65533 buf=0xffff8881116fcb00 [ 268.408260] __io_remove_buffers: [1]start ctx=0xffff8881b92d2000 bgid=65533 buf=0xffff888130c83180 [ 275.899234] __io_remove_buffers: [2099199]start ctx=0xffff888170603000 bgid=65533 buf=0xffff8881116fcb00 [ 296.741404] __io_remove_buffers: [1]start ctx=0xffff8881b659c000 bgid=65533 buf=0xffff8881010fe380 [ 305.090059] __io_remove_buffers: [2099199]start ctx=0xffff8881b92d2000 bgid=65533 buf=0xffff888130c83180 [ 325.415746] __io_remove_buffers: [1]start ctx=0xffff8881b92d1000 bgid=65533 buf=0xffff8881a17d8f00 [ 333.160318] __io_remove_buffers: [2099199]start ctx=0xffff8881b659c000 bgid=65533 buf=0xffff8881010fe380 ... Fixes:8bab4c09f24e("io_uring: allow conditional reschedule for intensive iterators") Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20211122024737.2198530-1-yebin10@huawei.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 75841b919dce..8a2b73cba06b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4366,6 +4366,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf, kfree(nxt); if (++i == nbufs) return i; + cond_resched(); } i++; kfree(buf); @@ -9295,10 +9296,8 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer *buf; unsigned long index; - xa_for_each(&ctx->io_buffers, index, buf) { + xa_for_each(&ctx->io_buffers, index, buf) __io_remove_buffers(ctx, buf, index, -1U); - cond_resched(); - } } static void io_req_caches_free(struct io_ring_ctx *ctx) From f6223ff799666235a80d05f8137b73e5580077b9 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 18 Nov 2021 09:59:07 +0800 Subject: [PATCH 122/231] io_uring: Fix undefined-behaviour in io_issue_sqe We got issue as follows: ================================================================================ UBSAN: Undefined behaviour in 
./include/linux/ktime.h:42:14 signed integer overflow: -4966321760114568020 * 1000000000 cannot be represented in type 'long long int' CPU: 1 PID: 2186 Comm: syz-executor.2 Not tainted 4.19.90+ #12 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x3f0 arch/arm64/kernel/time.c:78 show_stack+0x28/0x38 arch/arm64/kernel/traps.c:158 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x170/0x1dc lib/dump_stack.c:118 ubsan_epilogue+0x18/0xb4 lib/ubsan.c:161 handle_overflow+0x188/0x1dc lib/ubsan.c:192 __ubsan_handle_mul_overflow+0x34/0x44 lib/ubsan.c:213 ktime_set include/linux/ktime.h:42 [inline] timespec64_to_ktime include/linux/ktime.h:78 [inline] io_timeout fs/io_uring.c:5153 [inline] io_issue_sqe+0x42c8/0x4550 fs/io_uring.c:5599 __io_queue_sqe+0x1b0/0xbc0 fs/io_uring.c:5988 io_queue_sqe+0x1ac/0x248 fs/io_uring.c:6067 io_submit_sqe fs/io_uring.c:6137 [inline] io_submit_sqes+0xed8/0x1c88 fs/io_uring.c:6331 __do_sys_io_uring_enter fs/io_uring.c:8170 [inline] __se_sys_io_uring_enter fs/io_uring.c:8129 [inline] __arm64_sys_io_uring_enter+0x490/0x980 fs/io_uring.c:8129 invoke_syscall arch/arm64/kernel/syscall.c:53 [inline] el0_svc_common+0x374/0x570 arch/arm64/kernel/syscall.c:121 el0_svc_handler+0x190/0x260 arch/arm64/kernel/syscall.c:190 el0_svc+0x10/0x218 arch/arm64/kernel/entry.S:1017 ================================================================================ As ktime_set only judge 'secs' if big than KTIME_SEC_MAX, but if we pass negative value maybe lead to overflow. To address this issue, we must check if 'sec' is negative. 
Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20211118015907.844807-1-yebin10@huawei.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 8a2b73cba06b..c4f217613f56 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6196,6 +6196,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; + if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) + return -EINVAL; + data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); From 27ff768fa21ca3286fcc87c3f38ac67d1a2cbe2d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Sat, 27 Nov 2021 16:45:26 -0500 Subject: [PATCH 123/231] tracing: Test the 'Do not trace this pid' case in create event When creating a new event (via a module, kprobe, eprobe, etc), the descriptors that are created must add flags for pid filtering if an instance has pid filtering enabled, as the flags are used at the time the event is executed to know if pid filtering should be done or not. The "Only trace this pid" case was added, but a cut and paste error made that case checked twice, instead of checking the "Trace all but this pid" case. 
Link: https://lore.kernel.org/all/202111280401.qC0z99JB-lkp@intel.com/ Fixes: 6cb206508b62 ("tracing: Check pid filtering when creating events") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f8965fd50d3b..92be9cb1d7d4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2693,7 +2693,7 @@ trace_create_new_event(struct trace_event_call *call, lockdep_is_held(&event_mutex)); if (!trace_pid_list_first(pid_list, &first) || - !trace_pid_list_first(pid_list, &first)) + !trace_pid_list_first(no_pid_list, &first)) file->flags |= EVENT_FILE_FL_PID_FILTER; file->event_call = call; From 1f0e290cc5fd818d002e0a83b0ea8eceb8f2c515 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 27 Nov 2021 07:44:40 -0800 Subject: [PATCH 124/231] arch: Add generic Kconfig option indicating page size smaller than 64k NTFS_RW and VMXNET3 require a page size smaller than 64kB. Add generic Kconfig option for use outside architecture code to avoid architecture specific Kconfig options in that code. Suggested-by: Michael Ellerman Signed-off-by: Guenter Roeck Cc: Anton Altaparmakov Signed-off-by: Linus Torvalds --- arch/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 26b8ed11639d..d3c4ab249e9c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -991,6 +991,16 @@ config HAVE_ARCH_COMPAT_MMAP_BASES and vice-versa 32-bit applications to call 64-bit mmap(). Required for applications doing different bitness syscalls. 
+config PAGE_SIZE_LESS_THAN_64KB + def_bool y + depends on !ARM64_64K_PAGES + depends on !IA64_PAGE_SIZE_64KB + depends on !PAGE_SIZE_64KB + depends on !PARISC_PAGE_SIZE_64KB + depends on !PPC_64K_PAGES + depends on !PPC_256K_PAGES + depends on !PAGE_SIZE_256KB + # This allows to use a set of generic functions to determine mmap base # address by giving priority to top-down scheme only if the process # is not in legacy mode (compat task, unlimited stack size or From 4eec7faf6775263d9e450ae7ee5bc4101d4a0bc9 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 27 Nov 2021 07:44:41 -0800 Subject: [PATCH 125/231] fs: ntfs: Limit NTFS_RW to page sizes smaller than 64k NTFS_RW code allocates page size dependent arrays on the stack. This results in build failures if the page size is 64k or larger. fs/ntfs/aops.c: In function 'ntfs_write_mst_block': fs/ntfs/aops.c:1311:1: error: the frame size of 2240 bytes is larger than 2048 bytes Since commit f22969a66041 ("powerpc/64s: Default to 64K pages for 64 bit book3s") this affects ppc:allmodconfig builds, but other architectures supporting page sizes of 64k or larger are also affected. Increasing the maximum frame size for affected architectures just to silence this error does not really help. The frame size would have to be set to a really large value for 256k pages. Also, a large frame size could potentially result in stack overruns in this code and elsewhere and is therefore not desirable. Make NTFS_RW dependent on page sizes smaller than 64k instead. Signed-off-by: Guenter Roeck Cc: Anton Altaparmakov Signed-off-by: Linus Torvalds --- fs/ntfs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig index 1667a7e590d8..f93e69a61283 100644 --- a/fs/ntfs/Kconfig +++ b/fs/ntfs/Kconfig @@ -52,6 +52,7 @@ config NTFS_DEBUG config NTFS_RW bool "NTFS write support" depends on NTFS_FS + depends on PAGE_SIZE_LESS_THAN_64KB help This enables the partial, but safe, write support in the NTFS driver. 
From 00169a9245f841ec666c70959bfd1dcacce74324 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sat, 27 Nov 2021 07:44:42 -0800 Subject: [PATCH 126/231] vmxnet3: Use generic Kconfig option for page size limit Use the architecture independent Kconfig option PAGE_SIZE_LESS_THAN_64KB to indicate that VMXNET3 requires a page size smaller than 64kB. Signed-off-by: Guenter Roeck Signed-off-by: Linus Torvalds --- drivers/net/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 10506a4b66ef..6cccc3dc00bc 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -567,9 +567,7 @@ config XEN_NETDEV_BACKEND config VMXNET3 tristate "VMware VMXNET3 ethernet driver" depends on PCI && INET - depends on !(PAGE_SIZE_64KB || ARM64_64K_PAGES || \ - IA64_PAGE_SIZE_64KB || PARISC_PAGE_SIZE_64KB || \ - PPC_64K_PAGES) + depends on PAGE_SIZE_LESS_THAN_64KB help This driver supports VMware's vmxnet3 virtual ethernet NIC. To compile this driver as a module, choose M here: the From d58071a8a76d779eedab38033ae4c821c30295a5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 28 Nov 2021 14:09:19 -0800 Subject: [PATCH 127/231] Linux 5.16-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 72b0c3d5cbad..0a6ecc8bb2d2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Gobble Gobble # *DOCUMENTATION* From 65cc4ad62a9ed47c0b4fcd7af667d97d7c29f19d Mon Sep 17 00:00:00 2001 From: Stefan Binding Date: Sun, 28 Nov 2021 11:55:58 +0000 Subject: [PATCH 128/231] ALSA: hda/cs8409: Set PMSG_ON earlier inside cs8409 driver For cs8409, it is required to run Jack Detect on resume. Jack Detect on cs8409+cs42l42 requires an interrupt from cs42l42 to be sent to cs8409 which is propagated to the driver via an unsolicited event. 
However, the hda_codec drops unsolicited events if the power_state is not set to PMSG_ON. Which is set at the end of the resume call. This means there is a race condition between setting power_state to PMSG_ON and receiving the interrupt. To solve this, we can add an API to set the power_state earlier and call that before we start Jack Detect. This does not cause issues, since we know inside our driver that we are already initialized, and ready to handle the unsolicited events. Signed-off-by: Stefan Binding Signed-off-by: Vitaly Rodionov Cc: # v5.15+ Link: https://lore.kernel.org/r/20211128115558.71683-1-vitalyr@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_local.h | 9 +++++++++ sound/pci/hda/patch_cs8409.c | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h index ea8ab8b43337..d22c96eb2f8f 100644 --- a/sound/pci/hda/hda_local.h +++ b/sound/pci/hda/hda_local.h @@ -438,6 +438,15 @@ int snd_hda_codec_set_pin_target(struct hda_codec *codec, hda_nid_t nid, #define for_each_hda_codec_node(nid, codec) \ for ((nid) = (codec)->core.start_nid; (nid) < (codec)->core.end_nid; (nid)++) +/* Set the codec power_state flag to indicate to allow unsol event handling; + * see hda_codec_unsol_event() in hda_bind.c. Calling this might confuse the + * state tracking, so use with care. 
+ */ +static inline void snd_hda_codec_allow_unsol_events(struct hda_codec *codec) +{ + codec->core.dev.power.power_state = PMSG_ON; +} + /* * get widget capabilities */ diff --git a/sound/pci/hda/patch_cs8409.c b/sound/pci/hda/patch_cs8409.c index 31ff11ab868e..039b9f2f8e94 100644 --- a/sound/pci/hda/patch_cs8409.c +++ b/sound/pci/hda/patch_cs8409.c @@ -750,6 +750,11 @@ static void cs42l42_resume(struct sub_codec *cs42l42) if (cs42l42->full_scale_vol) cs8409_i2c_write(cs42l42, 0x2001, 0x01); + /* we have to explicitly allow unsol event handling even during the + * resume phase so that the jack event is processed properly + */ + snd_hda_codec_allow_unsol_events(cs42l42->codec); + cs42l42_enable_jack_detect(cs42l42); } From ed53ae75693096f1c10b4561edd31a07b631bd72 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Nov 2021 15:10:03 +0100 Subject: [PATCH 129/231] rt2x00: do not mark device gone on EPROTO errors during start As reported by Exuvo, it is possible that we have lots of EPROTO errors during device start i.e. firmware load. But after that device works correctly. Hence marking device gone by few EPROTO errors done by commit e383c70474db ("rt2x00: check number of EPROTO errors") caused regression - Exuvo's device stopped working after kernel update. To fix this, disable the check during device start. 
Link: https://lore.kernel.org/linux-wireless/bff7d309-a816-6a75-51b6-5928ef4f7a8c@exuvo.se/ Reported-and-tested-by: Exuvo Fixes: e383c70474db ("rt2x00: check number of EPROTO errors") Cc: stable@vger.kernel.org Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20211111141003.GA134627@wp.pl --- drivers/net/wireless/ralink/rt2x00/rt2x00usb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c b/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c index e4473a551241..74c3d8cb3100 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00usb.c @@ -25,6 +25,9 @@ static bool rt2x00usb_check_usb_error(struct rt2x00_dev *rt2x00dev, int status) if (status == -ENODEV || status == -ENOENT) return true; + if (!test_bit(DEVICE_STATE_STARTED, &rt2x00dev->flags)) + return false; + if (status == -EPROTO || status == -ETIMEDOUT) rt2x00dev->num_proto_errs++; else From f8e7dfd6fdabb831846ab1970a875746559d491b Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Fri, 26 Nov 2021 16:51:15 +0100 Subject: [PATCH 130/231] net: stmmac: Avoid DMA_CHAN_CONTROL write if no Split Header support The driver assumes that split headers can be enabled/disabled without stopping/starting the device, so it writes DMA_CHAN_CONTROL from stmmac_set_features(). However, on my system (IP v5.10a without Split Header support), simply writing DMA_CHAN_CONTROL when DMA is running (for example, with the commands below) leads to a TX watchdog timeout. host$ socat TCP-LISTEN:1024,fork,reuseaddr - & device$ ethtool -K eth0 tso off device$ ethtool -K eth0 tso on device$ dd if=/dev/zero bs=1M count=10 | socat - TCP4:host:1024 Note that since my IP is configured without Split Header support, the driver always just reads and writes the same value to the DMA_CHAN_CONTROL register. 
I don't have access to any platforms with Split Header support so I don't know if these writes to the DMA_CHAN_CONTROL while DMA is running actually work properly on such systems. I could not find anything in the databook that says that DMA_CHAN_CONTROL should not be written when the DMA is running. But on systems without Split Header support, there is in any case no need to call enable_sph() in stmmac_set_features() at all since SPH can never be toggled, so we can avoid the watchdog timeout there by skipping this call. Fixes: 8c6fc097a2f4acf ("net: stmmac: gmac4+: Add Split Header support") Signed-off-by: Vincent Whitchurch Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 748195697e5a..da8306f60730 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5540,8 +5540,6 @@ static int stmmac_set_features(struct net_device *netdev, netdev_features_t features) { struct stmmac_priv *priv = netdev_priv(netdev); - bool sph_en; - u32 chan; /* Keep the COE Type in case of csum is supporting */ if (features & NETIF_F_RXCSUM) @@ -5553,10 +5551,13 @@ static int stmmac_set_features(struct net_device *netdev, */ stmmac_rx_ipc(priv, priv->hw); - sph_en = (priv->hw->rx_csum > 0) && priv->sph; + if (priv->sph_cap) { + bool sph_en = (priv->hw->rx_csum > 0) && priv->sph; + u32 chan; - for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) - stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan); + for (chan = 0; chan < priv->plat->rx_queues_to_use; chan++) + stmmac_enable_sph(priv, priv->ioaddr, sph_en, chan); + } return 0; } From dacb5d8875cc6cd3a553363b4d6f06760fcbe70c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 26 Nov 2021 19:34:21 +0100 Subject: [PATCH 131/231] tcp: fix page frag 
corruption on page fault Steffen reported a TCP stream corruption for HTTP requests served by the apache web-server using a cifs mount-point and memory mapping the relevant file. The root cause is quite similar to the one addressed by commit 20eb4f29b602 ("net: fix sk_page_frag() recursion from memory reclaim"). Here the nested access to the task page frag is caused by a page fault on the (mmapped) user-space memory buffer coming from the cifs file. The page fault handler performs an smb transaction on a different socket, inside the same process context. Since sk->sk_allocation for such socket does not prevent the usage for the task_frag, the nested allocation modifies "under the hood" the page frag in use by the outer sendmsg call, corrupting the stream. The overall relevant stack trace looks like the following: httpd 78268 [001] 3461630.850950: probe:tcp_sendmsg_locked: ffffffff91461d91 tcp_sendmsg_locked+0x1 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139814e sock_sendmsg+0x3e ffffffffc06dfe1d smb_send_kvec+0x28 [...] 
ffffffffc06cfaf8 cifs_readpages+0x213 ffffffff90e83c4b read_pages+0x6b ffffffff90e83f31 __do_page_cache_readahead+0x1c1 ffffffff90e79e98 filemap_fault+0x788 ffffffff90eb0458 __do_fault+0x38 ffffffff90eb5280 do_fault+0x1a0 ffffffff90eb7c84 __handle_mm_fault+0x4d4 ffffffff90eb8093 handle_mm_fault+0xc3 ffffffff90c74f6d __do_page_fault+0x1ed ffffffff90c75277 do_page_fault+0x37 ffffffff9160111e page_fault+0x1e ffffffff9109e7b5 copyin+0x25 ffffffff9109eb40 _copy_from_iter_full+0xe0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462370 tcp_sendmsg_locked+0x5e0 ffffffff91462b57 tcp_sendmsg+0x27 ffffffff9139815c sock_sendmsg+0x4c ffffffff913981f7 sock_write_iter+0x97 ffffffff90f2cc56 do_iter_readv_writev+0x156 ffffffff90f2dff0 do_iter_write+0x80 ffffffff90f2e1c3 vfs_writev+0xa3 ffffffff90f2e27c do_writev+0x5c ffffffff90c042bb do_syscall_64+0x5b ffffffff916000ad entry_SYSCALL_64_after_hwframe+0x65 The cifs filesystem rightfully sets sk_allocations to GFP_NOFS, we can avoid the nesting using the sk page frag for allocation lacking the __GFP_FS flag. Do not define an additional mm-helper for that, as this is strictly tied to the sk page frag usage. v1 -> v2: - use a stricted sk_page_frag() check instead of reordering the code (Eric) Reported-by: Steffen Froemer Fixes: 5640f7685831 ("net: use a per task frag allocator") Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/net/sock.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index b32906e1ab55..715cdb4b2b79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2430,19 +2430,22 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * @sk: socket * * Use the per task page_frag instead of the per socket one for - * optimization when we know that we're in the normal context and owns + * optimization when we know that we're in process context and own * everything that's associated with %current. * - * gfpflags_allow_blocking() isn't enough here as direct reclaim may nest - * inside other socket operations and end up recursing into sk_page_frag() - * while it's already in use. + * Both direct reclaim and page faults can nest inside other + * socket operations and end up recursing into sk_page_frag() + * while it's already in use: explicitly avoid task page_frag + * usage if the caller is potentially doing any of them. + * This assumes that page fault handlers use the GFP_NOFS flags. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (gfpflags_normal_context(sk->sk_allocation)) + if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + (__GFP_DIRECT_RECLAIM | __GFP_FS)) return &current->task_frag; return &sk->sk_frag; From 1e89ad864d035001835ccf02acea7b1d3dc41819 Mon Sep 17 00:00:00 2001 From: Luiz Angelo Daros de Luca Date: Fri, 26 Nov 2021 17:13:55 -0300 Subject: [PATCH 132/231] net: dsa: realtek-smi: fix indirect reg access for ports>3 This switch family can have up to 8 UTP ports {0..7}. However, INDIRECT_ACCESS_ADDRESS_PHYNUM_MASK was using 2 bits instead of 3, dropping the most significant bit during indirect register reads and writes. 
Reading or writing ports 4, 5, 6, and 7 registers was actually manipulating, respectively, ports 0, 1, 2, and 3 registers. This is not sufficient but necessary to support any variant with more than 4 UTP ports, like RTL8367S. rtl8365mb_phy_{read,write} will now returns -EINVAL if phy is greater than 7. Fixes: 4af2950c50c8 ("net: dsa: realtek-smi: add rtl8365mb subdriver for RTL8365MB-VC") Signed-off-by: Luiz Angelo Daros de Luca Signed-off-by: David S. Miller --- drivers/net/dsa/rtl8365mb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/rtl8365mb.c b/drivers/net/dsa/rtl8365mb.c index baaae97283c5..078ca4cd7160 100644 --- a/drivers/net/dsa/rtl8365mb.c +++ b/drivers/net/dsa/rtl8365mb.c @@ -107,6 +107,7 @@ #define RTL8365MB_LEARN_LIMIT_MAX_8365MB_VC 2112 /* Family-specific data and limits */ +#define RTL8365MB_PHYADDRMAX 7 #define RTL8365MB_NUM_PHYREGS 32 #define RTL8365MB_PHYREGMAX (RTL8365MB_NUM_PHYREGS - 1) #define RTL8365MB_MAX_NUM_PORTS (RTL8365MB_CPU_PORT_NUM_8365MB_VC + 1) @@ -176,7 +177,7 @@ #define RTL8365MB_INDIRECT_ACCESS_STATUS_REG 0x1F01 #define RTL8365MB_INDIRECT_ACCESS_ADDRESS_REG 0x1F02 #define RTL8365MB_INDIRECT_ACCESS_ADDRESS_OCPADR_5_1_MASK GENMASK(4, 0) -#define RTL8365MB_INDIRECT_ACCESS_ADDRESS_PHYNUM_MASK GENMASK(6, 5) +#define RTL8365MB_INDIRECT_ACCESS_ADDRESS_PHYNUM_MASK GENMASK(7, 5) #define RTL8365MB_INDIRECT_ACCESS_ADDRESS_OCPADR_9_6_MASK GENMASK(11, 8) #define RTL8365MB_PHY_BASE 0x2000 #define RTL8365MB_INDIRECT_ACCESS_WRITE_DATA_REG 0x1F03 @@ -679,6 +680,9 @@ static int rtl8365mb_phy_read(struct realtek_smi *smi, int phy, int regnum) u16 val; int ret; + if (phy > RTL8365MB_PHYADDRMAX) + return -EINVAL; + if (regnum > RTL8365MB_PHYREGMAX) return -EINVAL; @@ -704,6 +708,9 @@ static int rtl8365mb_phy_write(struct realtek_smi *smi, int phy, int regnum, u32 ocp_addr; int ret; + if (phy > RTL8365MB_PHYADDRMAX) + return -EINVAL; + if (regnum > RTL8365MB_PHYREGMAX) return -EINVAL; From 
49989adc38f8693fb6e9f019904dd00c1d1db5ac Mon Sep 17 00:00:00 2001 From: Ole Ernst Date: Sat, 27 Nov 2021 10:05:45 +0100 Subject: [PATCH 133/231] USB: NO_LPM quirk Lenovo Powered USB-C Travel Hub This is another branded 8153 device that doesn't work well with LPM: r8152 2-2.1:1.0 enp0s13f0u2u1: Stop submitting intr, status -71 Disable LPM to resolve the issue. Signed-off-by: Ole Ernst Signed-off-by: David S. Miller --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 8239fe7129dd..019351c0b52c 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -434,6 +434,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1532, 0x0116), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Lenovo Powered USB-C Travel Hub (4X90S92381, RTL8153 GigE) */ + { USB_DEVICE(0x17ef, 0x721e), .driver_info = USB_QUIRK_NO_LPM }, + /* Lenovo ThinkCenter A630Z TI024Gen3 usb-audio */ { USB_DEVICE(0x17ef, 0xa012), .driver_info = USB_QUIRK_DISCONNECT_SUSPEND }, From 817b653160db9852d5a0498a31f047e18ce27e5b Mon Sep 17 00:00:00 2001 From: Sven Schuchmann Date: Sat, 27 Nov 2021 11:47:07 +0100 Subject: [PATCH 134/231] net: usb: lan78xx: lan78xx_phy_init(): use PHY_POLL instead of "0" if no IRQ is available On most systems request for IRQ 0 will fail, phylib will print an error message and fall back to polling. To fix this set the phydev->irq to PHY_POLL if no IRQ is available. Fixes: cc89c323a30e ("lan78xx: Use irq_domain for phy interrupt from USB Int. EP") Reviewed-by: Andrew Lunn Signed-off-by: Sven Schuchmann Signed-off-by: David S. 
Miller --- drivers/net/usb/lan78xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index f20376c1ef3f..8cd265fc1fd9 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2228,7 +2228,7 @@ static int lan78xx_phy_init(struct lan78xx_net *dev) if (dev->domain_data.phyirq > 0) phydev->irq = dev->domain_data.phyirq; else - phydev->irq = 0; + phydev->irq = PHY_POLL; netdev_dbg(dev->net, "phydev->irq = %d\n", phydev->irq); /* set to AUTOMDIX */ From 7d4741eacdefa5f0475431645b56baf00784df1f Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 29 Nov 2021 15:15:05 +0900 Subject: [PATCH 135/231] net: mpls: Fix notifications when deleting a device There are various problems related to netlink notifications for mpls route changes in response to interfaces being deleted: * delete interface of only nexthop DELROUTE notification is missing RTA_OIF attribute * delete interface of non-last nexthop NEWROUTE notification is missing entirely * delete interface of last nexthop DELROUTE notification is missing nexthop All of these problems stem from the fact that existing routes are modified in-place before sending a notification. Restructure mpls_ifdown() to avoid changing the route in the DELROUTE cases and to create a copy in the NEWROUTE case. Fixes: f8efb73c97e2 ("mpls: multipath route support") Signed-off-by: Benjamin Poirier Signed-off-by: David S. 
Miller --- net/mpls/af_mpls.c | 68 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index ffeb2df8be7a..6e587feb705c 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1491,22 +1491,52 @@ static void mpls_dev_destroy_rcu(struct rcu_head *head) kfree(mdev); } -static void mpls_ifdown(struct net_device *dev, int event) +static int mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - u8 alive, deleted; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { struct mpls_route *rt = rtnl_dereference(platform_label[index]); + bool nh_del = false; + u8 alive = 0; if (!rt) continue; - alive = 0; - deleted = 0; + if (event == NETDEV_UNREGISTER) { + u8 deleted = 0; + + for_nexthops(rt) { + struct net_device *nh_dev = + rtnl_dereference(nh->nh_dev); + + if (!nh_dev || nh_dev == dev) + deleted++; + if (nh_dev == dev) + nh_del = true; + } endfor_nexthops(rt); + + /* if there are no more nexthops, delete the route */ + if (deleted == rt->rt_nhn) { + mpls_route_update(net, index, NULL, NULL); + continue; + } + + if (nh_del) { + size_t size = sizeof(*rt) + rt->rt_nhn * + rt->rt_nh_size; + struct mpls_route *orig = rt; + + rt = kmalloc(size, GFP_KERNEL); + if (!rt) + return -ENOMEM; + memcpy(rt, orig, size); + } + } + change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; @@ -1530,16 +1560,15 @@ static void mpls_ifdown(struct net_device *dev, int event) next: if (!(nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) alive++; - if (!rtnl_dereference(nh->nh_dev)) - deleted++; } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); - /* if there are no more nexthops, delete the route */ - if (event == NETDEV_UNREGISTER && deleted == rt->rt_nhn) - mpls_route_update(net, index, NULL, NULL); + if (nh_del) + 
mpls_route_update(net, index, rt, NULL); } + + return 0; } static void mpls_ifup(struct net_device *dev, unsigned int flags) @@ -1597,8 +1626,12 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, return NOTIFY_OK; switch (event) { + int err; + case NETDEV_DOWN: - mpls_ifdown(dev, event); + err = mpls_ifdown(dev, event); + if (err) + return notifier_from_errno(err); break; case NETDEV_UP: flags = dev_get_flags(dev); @@ -1609,13 +1642,18 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGE: flags = dev_get_flags(dev); - if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) { mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); - else - mpls_ifdown(dev, event); + } else { + err = mpls_ifdown(dev, event); + if (err) + return notifier_from_errno(err); + } break; case NETDEV_UNREGISTER: - mpls_ifdown(dev, event); + err = mpls_ifdown(dev, event); + if (err) + return notifier_from_errno(err); mdev = mpls_dev_get(dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); @@ -1626,8 +1664,6 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, case NETDEV_CHANGENAME: mdev = mpls_dev_get(dev); if (mdev) { - int err; - mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) From 189168181bb67825a14e8083d1503cfdc2891ebf Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 29 Nov 2021 15:15:06 +0900 Subject: [PATCH 136/231] net: mpls: Remove rcu protection from nh_dev Following the previous commit, nh_dev can no longer be accessed and modified concurrently. Signed-off-by: Benjamin Poirier Signed-off-by: David S. 
Miller --- net/mpls/af_mpls.c | 39 +++++++++++++++------------------------ net/mpls/internal.h | 2 +- 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 6e587feb705c..0c7bde1c14a6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -409,7 +409,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, goto err; /* Find the output device */ - out_dev = rcu_dereference(nh->nh_dev); + out_dev = nh->nh_dev; if (!mpls_output_possible(out_dev)) goto tx_err; @@ -698,7 +698,7 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, (dev->addr_len != nh->nh_via_alen)) goto errout; - RCU_INIT_POINTER(nh->nh_dev, dev); + nh->nh_dev = dev; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; @@ -1510,12 +1510,9 @@ static int mpls_ifdown(struct net_device *dev, int event) u8 deleted = 0; for_nexthops(rt) { - struct net_device *nh_dev = - rtnl_dereference(nh->nh_dev); - - if (!nh_dev || nh_dev == dev) + if (!nh->nh_dev || nh->nh_dev == dev) deleted++; - if (nh_dev == dev) + if (nh->nh_dev == dev) nh_del = true; } endfor_nexthops(rt); @@ -1540,7 +1537,7 @@ static int mpls_ifdown(struct net_device *dev, int event) change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; - if (rtnl_dereference(nh->nh_dev) != dev) + if (nh->nh_dev != dev) goto next; switch (event) { @@ -1553,7 +1550,7 @@ static int mpls_ifdown(struct net_device *dev, int event) break; } if (event == NETDEV_UNREGISTER) - RCU_INIT_POINTER(nh->nh_dev, NULL); + nh->nh_dev = NULL; if (nh->nh_flags != nh_flags) WRITE_ONCE(nh->nh_flags, nh_flags); @@ -1588,14 +1585,12 @@ static void mpls_ifup(struct net_device *dev, unsigned int flags) alive = 0; change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; - struct net_device *nh_dev = - rtnl_dereference(nh->nh_dev); if (!(nh_flags & flags)) { alive++; continue; } - if (nh_dev != dev) + if (nh->nh_dev != dev) continue; alive++; nh_flags &= ~flags; @@ -2030,7 +2025,7 @@ 
static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; - dev = rtnl_dereference(nh->nh_dev); + dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (nh->nh_flags & RTNH_F_LINKDOWN) @@ -2048,7 +2043,7 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; for_nexthops(rt) { - dev = rtnl_dereference(nh->nh_dev); + dev = nh->nh_dev; if (!dev) continue; @@ -2159,18 +2154,14 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, static bool mpls_rt_uses_dev(struct mpls_route *rt, const struct net_device *dev) { - struct net_device *nh_dev; - if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; - nh_dev = rtnl_dereference(nh->nh_dev); - if (dev == nh_dev) + if (nh->nh_dev == dev) return true; } else { for_nexthops(rt) { - nh_dev = rtnl_dereference(nh->nh_dev); - if (nh_dev == dev) + if (nh->nh_dev == dev) return true; } endfor_nexthops(rt); } @@ -2258,7 +2249,7 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) size_t nhsize = 0; for_nexthops(rt) { - if (!rtnl_dereference(nh->nh_dev)) + if (!nh->nh_dev) continue; nhsize += nla_total_size(sizeof(struct rtnexthop)); /* RTA_VIA */ @@ -2504,7 +2495,7 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh), nh->nh_via_alen)) goto nla_put_failure; - dev = rtnl_dereference(nh->nh_dev); + dev = nh->nh_dev; if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; @@ -2543,7 +2534,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; - RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo); + rt0->rt_nh->nh_dev = lo; rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = 
MPLS_TTL_PROP_DEFAULT; @@ -2557,7 +2548,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; - RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo); + rt2->rt_nh->nh_dev = lo; rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 838cdfc10e47..893df00b77b6 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -87,7 +87,7 @@ enum mpls_payload_type { }; struct mpls_nh { /* next hop label forwarding entry */ - struct net_device __rcu *nh_dev; + struct net_device *nh_dev; /* nh_flags is accessed under RCU in the packet path; it is * modified handling netdev events with rtnl lock held From 5961060692f8b17cd2080620a3d27b95d2ae05ca Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 29 Nov 2021 17:32:12 +0800 Subject: [PATCH 137/231] net/tls: Fix authentication failure in CCM mode When the TLS cipher suite uses CCM mode, including AES CCM and SM4 CCM, the first byte of the B0 block is flags, and the real IV starts from the second byte. The XOR operation of the IV and rec_seq should skip this byte, that is, add the iv_offset. Fixes: f295b3ae9f59 ("net/tls: Add support of AES128-CCM based ciphers") Signed-off-by: Tianjia Zhang Cc: Vakul Garg Cc: stable@vger.kernel.org # v5.2+ Signed-off-by: David S. 
Miller --- net/tls/tls_sw.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index d3e7ff90889e..dfe623a4e72f 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -521,7 +521,7 @@ static int tls_do_encryption(struct sock *sk, memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, prot->iv_size + prot->salt_size); - xor_iv_with_seq(prot, rec->iv_data, tls_ctx->tx.rec_seq); + xor_iv_with_seq(prot, rec->iv_data + iv_offset, tls_ctx->tx.rec_seq); sge->offset += prot->prepend_size; sge->length -= prot->prepend_size; @@ -1499,7 +1499,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, else memcpy(iv + iv_offset, tls_ctx->rx.iv, prot->salt_size); - xor_iv_with_seq(prot, iv, tls_ctx->rx.rec_seq); + xor_iv_with_seq(prot, iv + iv_offset, tls_ctx->rx.rec_seq); /* Prepare AAD */ tls_make_aad(aad, rxm->full_len - prot->overhead_size + From d8519565447078f141c58ba4193d820f2cdf1914 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 29 Nov 2021 10:16:52 +0800 Subject: [PATCH 138/231] mctp: test: fix skb free in test device tx In our test device, we're currently freeing skbs in the transmit path with kfree(), rather than kfree_skb(). This change uses the correct kfree_skb() instead. Fixes: ded21b722995 ("mctp: Add test utils") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Jeremy Kerr Signed-off-by: David S. 
Miller --- net/mctp/test/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index cc6b8803aa9d..7b7918702592 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -12,7 +12,7 @@ static netdev_tx_t mctp_test_dev_tx(struct sk_buff *skb, struct net_device *ndev) { - kfree(skb); + kfree_skb(skb); return NETDEV_TX_OK; } From 2191b1dfef7d45f44b5008d2148676d9f2c82874 Mon Sep 17 00:00:00 2001 From: Erik Ekman Date: Sun, 28 Nov 2021 13:37:11 +0100 Subject: [PATCH 139/231] net/mlx4_en: Update reported link modes for 1/10G When link modes were initially added in commit 2c762679435dc ("net/mlx4_en: Use PTYS register to query ethtool settings") and later updated for the new ethtool API in commit 3d8f7cc78d0eb ("net: mlx4: use new ETHTOOL_G/SSETTINGS API") the only 1/10G non-baseT link modes configured were 1000baseKX, 10000baseKX4 and 10000baseKR. It looks like these got picked to represent other modes since nothing better was available. Switch to using more specific link modes added in commit 5711a98221443 ("net: ethtool: add support for 1000BaseX and missing 10G link modes"). Tested with MCX311A-XCAT connected via DAC. 
Before: % sudo ethtool enp3s0 Settings for enp3s0: Supported ports: [ FIBRE ] Supported link modes: 1000baseKX/Full 10000baseKR/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: 1000baseKX/Full 10000baseKR/Full Advertised pause frame use: Symmetric Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 10000Mb/s Duplex: Full Auto-negotiation: off Port: Direct Attach Copper PHYAD: 0 Transceiver: internal Supports Wake-on: d Wake-on: d Current message level: 0x00000014 (20) link ifdown Link detected: yes With this change: % sudo ethtool enp3s0 Settings for enp3s0: Supported ports: [ FIBRE ] Supported link modes: 1000baseX/Full 10000baseCR/Full 10000baseSR/Full Supported pause frame use: Symmetric Receive-only Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: 1000baseX/Full 10000baseCR/Full 10000baseSR/Full Advertised pause frame use: Symmetric Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 10000Mb/s Duplex: Full Auto-negotiation: off Port: Direct Attach Copper PHYAD: 0 Transceiver: internal Supports Wake-on: d Wake-on: d Current message level: 0x00000014 (20) link ifdown Link detected: yes Tested-by: Michael Stapelberg Signed-off-by: Erik Ekman Reviewed-by: Tariq Toukan Signed-off-by: David S. 
Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 066d79e4ecfc..10238bedd694 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -670,7 +670,7 @@ void __init mlx4_en_init_ptys2ethtool_map(void) MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_T, SPEED_1000, ETHTOOL_LINK_MODE_1000baseT_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_CX_SGMII, SPEED_1000, - ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); + ETHTOOL_LINK_MODE_1000baseX_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_1000BASE_KX, SPEED_1000, ETHTOOL_LINK_MODE_1000baseKX_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_T, SPEED_10000, @@ -682,9 +682,9 @@ void __init mlx4_en_init_ptys2ethtool_map(void) MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_KR, SPEED_10000, ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_CR, SPEED_10000, - ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + ETHTOOL_LINK_MODE_10000baseCR_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_10GBASE_SR, SPEED_10000, - ETHTOOL_LINK_MODE_10000baseKR_Full_BIT); + ETHTOOL_LINK_MODE_10000baseSR_Full_BIT); MLX4_BUILD_PTYS2ETHTOOL_CONFIG(MLX4_20GBASE_KR2, SPEED_20000, ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT, ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT); From aa1dcb5646fdf34a15763facf4bf5e482a2814ca Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Mon, 29 Nov 2021 05:28:23 -0800 Subject: [PATCH 140/231] atlantic: Increase delay for fw transactions The max waiting period (of 1 ms) while reading the data from FW shared buffer is too small for certain types of data (e.g., stats). There's a chance that FW could be updating buffer at the same time and driver would be unsuccessful in reading data. Firmware manual recommends to have 1 sec timeout to fix this issue. 
Fixes: 5cfd54d7dc186 ("net: atlantic: minimal A2 fw_ops") Signed-off-by: Dmitry Bogdanov Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c index dd259c8f2f4f..b0e4119b9883 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c @@ -84,7 +84,7 @@ static int hw_atl2_shared_buffer_read_block(struct aq_hw_s *self, if (cnt > AQ_A2_FW_READ_TRY_MAX) return -ETIME; if (tid1.transaction_cnt_a != tid1.transaction_cnt_b) - udelay(1); + mdelay(1); } while (tid1.transaction_cnt_a != tid1.transaction_cnt_b); hw_atl2_mif_shared_buf_read(self, offset, (u32 *)data, dwords); @@ -339,8 +339,11 @@ static int aq_a2_fw_update_stats(struct aq_hw_s *self) { struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv; struct statistics_s stats; + int err; - hw_atl2_shared_buffer_read_safe(self, stats, &stats); + err = hw_atl2_shared_buffer_read_safe(self, stats, &stats); + if (err) + return err; #define AQ_SDELTA(_N_, _F_) (self->curr_stats._N_ += \ stats.msm._F_ - priv->last_stats.msm._F_) From aa685acd98eae25d5351e30288d6cfb65b9c80a5 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Mon, 29 Nov 2021 05:28:24 -0800 Subject: [PATCH 141/231] atlatnic: enable Nbase-t speeds with base-t When 2.5G is advertised, N-Base should be advertised against the T-base caps. N5G is out of use in baseline code and driver should treat both 5G and N5G (and also 2.5G and N2.5G) equally from user perspective. Fixes: 5cfd54d7dc186 ("net: atlantic: minimal A2 fw_ops") Signed-off-by: Nikita Danilov Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- .../ethernet/aquantia/atlantic/aq_common.h | 25 +++++++++---------- .../atlantic/hw_atl/hw_atl_utils_fw2x.c | 3 --- .../atlantic/hw_atl2/hw_atl2_utils_fw.c | 4 +-- 3 files changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_common.h b/drivers/net/ethernet/aquantia/atlantic/aq_common.h index 23b2d390fcdd..4ad8f36fcade 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_common.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_common.h @@ -53,20 +53,19 @@ #define AQ_NIC_RATE_10G BIT(0) #define AQ_NIC_RATE_5G BIT(1) -#define AQ_NIC_RATE_5GSR BIT(2) -#define AQ_NIC_RATE_2G5 BIT(3) -#define AQ_NIC_RATE_1G BIT(4) -#define AQ_NIC_RATE_100M BIT(5) -#define AQ_NIC_RATE_10M BIT(6) -#define AQ_NIC_RATE_1G_HALF BIT(7) -#define AQ_NIC_RATE_100M_HALF BIT(8) -#define AQ_NIC_RATE_10M_HALF BIT(9) +#define AQ_NIC_RATE_2G5 BIT(2) +#define AQ_NIC_RATE_1G BIT(3) +#define AQ_NIC_RATE_100M BIT(4) +#define AQ_NIC_RATE_10M BIT(5) +#define AQ_NIC_RATE_1G_HALF BIT(6) +#define AQ_NIC_RATE_100M_HALF BIT(7) +#define AQ_NIC_RATE_10M_HALF BIT(8) -#define AQ_NIC_RATE_EEE_10G BIT(10) -#define AQ_NIC_RATE_EEE_5G BIT(11) -#define AQ_NIC_RATE_EEE_2G5 BIT(12) -#define AQ_NIC_RATE_EEE_1G BIT(13) -#define AQ_NIC_RATE_EEE_100M BIT(14) +#define AQ_NIC_RATE_EEE_10G BIT(9) +#define AQ_NIC_RATE_EEE_5G BIT(10) +#define AQ_NIC_RATE_EEE_2G5 BIT(11) +#define AQ_NIC_RATE_EEE_1G BIT(12) +#define AQ_NIC_RATE_EEE_100M BIT(13) #define AQ_NIC_RATE_EEE_MSK (AQ_NIC_RATE_EEE_10G |\ AQ_NIC_RATE_EEE_5G |\ AQ_NIC_RATE_EEE_2G5 |\ diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c index eac631c45c56..4d4cfbc91e19 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c @@ -132,9 +132,6 @@ static enum hw_atl_fw2x_rate link_speed_mask_2fw2x_ratemask(u32 speed) if (speed & 
AQ_NIC_RATE_5G) rate |= FW2X_RATE_5G; - if (speed & AQ_NIC_RATE_5GSR) - rate |= FW2X_RATE_5G; - if (speed & AQ_NIC_RATE_2G5) rate |= FW2X_RATE_2G5; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c index b0e4119b9883..b7a9b0ed6df3 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c @@ -154,7 +154,7 @@ static void a2_link_speed_mask2fw(u32 speed, { link_options->rate_10G = !!(speed & AQ_NIC_RATE_10G); link_options->rate_5G = !!(speed & AQ_NIC_RATE_5G); - link_options->rate_N5G = !!(speed & AQ_NIC_RATE_5GSR); + link_options->rate_N5G = link_options->rate_5G; link_options->rate_2P5G = !!(speed & AQ_NIC_RATE_2G5); link_options->rate_N2P5G = link_options->rate_2P5G; link_options->rate_1G = !!(speed & AQ_NIC_RATE_1G); @@ -192,8 +192,6 @@ static u32 a2_fw_lkp_to_mask(struct lkp_link_caps_s *lkp_link_caps) rate |= AQ_NIC_RATE_10G; if (lkp_link_caps->rate_5G) rate |= AQ_NIC_RATE_5G; - if (lkp_link_caps->rate_N5G) - rate |= AQ_NIC_RATE_5GSR; if (lkp_link_caps->rate_2P5G) rate |= AQ_NIC_RATE_2G5; if (lkp_link_caps->rate_1G) From 2465c802232bc8d2b5bd83b55b08d05c11808704 Mon Sep 17 00:00:00 2001 From: Sameer Saurabh Date: Mon, 29 Nov 2021 05:28:25 -0800 Subject: [PATCH 142/231] atlantic: Fix to display FW bundle version instead of FW mac version. The correct way to reflect firmware version is to use bundle version. Hence populating the same instead of MAC fw version. Fixes: c1be0bf092bd2 ("net: atlantic: common functions needed for basic A2 init/deinit hw_ops") Signed-off-by: Sameer Saurabh Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- .../ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c index b7a9b0ed6df3..e164ac5b55a8 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c @@ -500,9 +500,9 @@ u32 hw_atl2_utils_get_fw_version(struct aq_hw_s *self) hw_atl2_shared_buffer_read_safe(self, version, &version); /* A2 FW version is stored in reverse order */ - return version.mac.major << 24 | - version.mac.minor << 16 | - version.mac.build; + return version.bundle.major << 24 | + version.bundle.minor << 16 | + version.bundle.build; } int hw_atl2_utils_get_action_resolve_table_caps(struct aq_hw_s *self, From 413d5e09caa5a11da9c7d72401ba0588466a04c0 Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Mon, 29 Nov 2021 05:28:26 -0800 Subject: [PATCH 143/231] atlantic: Add missing DIDs and fix 115c. At the late production stages new dev ids were introduced. These are now in production, so it's important for the driver to recognize these. And also fix the board caps for AQC115C adapter. Fixes: b3f0c79cba206 ("net: atlantic: A2 hw_ops skeleton") Signed-off-by: Nikita Danilov Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- .../net/ethernet/aquantia/atlantic/aq_common.h | 2 ++ .../ethernet/aquantia/atlantic/aq_pci_func.c | 7 ++++++- .../aquantia/atlantic/hw_atl2/hw_atl2.c | 17 +++++++++++++++++ .../aquantia/atlantic/hw_atl2/hw_atl2.h | 2 ++ 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_common.h b/drivers/net/ethernet/aquantia/atlantic/aq_common.h index 4ad8f36fcade..ace691d7cd75 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_common.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_common.h @@ -40,10 +40,12 @@ #define AQ_DEVICE_ID_AQC113DEV 0x00C0 #define AQ_DEVICE_ID_AQC113CS 0x94C0 +#define AQ_DEVICE_ID_AQC113CA 0x34C0 #define AQ_DEVICE_ID_AQC114CS 0x93C0 #define AQ_DEVICE_ID_AQC113 0x04C0 #define AQ_DEVICE_ID_AQC113C 0x14C0 #define AQ_DEVICE_ID_AQC115C 0x12C0 +#define AQ_DEVICE_ID_AQC116C 0x11C0 #define HW_ATL_NIC_NAME "Marvell (aQuantia) AQtion 10Gbit Network Adapter" diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c index d4b1976ee69b..797a95142d1f 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c @@ -49,6 +49,8 @@ static const struct pci_device_id aq_pci_tbl[] = { { PCI_VDEVICE(AQUANTIA, AQ_DEVICE_ID_AQC113), }, { PCI_VDEVICE(AQUANTIA, AQ_DEVICE_ID_AQC113C), }, { PCI_VDEVICE(AQUANTIA, AQ_DEVICE_ID_AQC115C), }, + { PCI_VDEVICE(AQUANTIA, AQ_DEVICE_ID_AQC113CA), }, + { PCI_VDEVICE(AQUANTIA, AQ_DEVICE_ID_AQC116C), }, {} }; @@ -85,7 +87,10 @@ static const struct aq_board_revision_s hw_atl_boards[] = { { AQ_DEVICE_ID_AQC113CS, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc113, }, { AQ_DEVICE_ID_AQC114CS, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc113, }, { AQ_DEVICE_ID_AQC113C, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc113, }, - { AQ_DEVICE_ID_AQC115C, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc113, }, + { AQ_DEVICE_ID_AQC115C, AQ_HWREV_ANY, &hw_atl2_ops, 
&hw_atl2_caps_aqc115c, }, + { AQ_DEVICE_ID_AQC113CA, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc113, }, + { AQ_DEVICE_ID_AQC116C, AQ_HWREV_ANY, &hw_atl2_ops, &hw_atl2_caps_aqc116c, }, + }; MODULE_DEVICE_TABLE(pci, aq_pci_tbl); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c index c98708bb044c..0a28428a0cb7 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c @@ -72,6 +72,23 @@ const struct aq_hw_caps_s hw_atl2_caps_aqc113 = { AQ_NIC_RATE_10M_HALF, }; +const struct aq_hw_caps_s hw_atl2_caps_aqc115c = { + DEFAULT_BOARD_BASIC_CAPABILITIES, + .media_type = AQ_HW_MEDIA_TYPE_TP, + .link_speed_msk = AQ_NIC_RATE_2G5 | + AQ_NIC_RATE_1G | + AQ_NIC_RATE_100M | + AQ_NIC_RATE_10M, +}; + +const struct aq_hw_caps_s hw_atl2_caps_aqc116c = { + DEFAULT_BOARD_BASIC_CAPABILITIES, + .media_type = AQ_HW_MEDIA_TYPE_TP, + .link_speed_msk = AQ_NIC_RATE_1G | + AQ_NIC_RATE_100M | + AQ_NIC_RATE_10M, +}; + static u32 hw_atl2_sem_act_rslvr_get(struct aq_hw_s *self) { return hw_atl_reg_glb_cpu_sem_get(self, HW_ATL2_FW_SM_ACT_RSLVR); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.h index de8723f1c28a..346f0dc9912e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.h @@ -9,6 +9,8 @@ #include "aq_common.h" extern const struct aq_hw_caps_s hw_atl2_caps_aqc113; +extern const struct aq_hw_caps_s hw_atl2_caps_aqc115c; +extern const struct aq_hw_caps_s hw_atl2_caps_aqc116c; extern const struct aq_hw_ops hw_atl2_ops; #endif /* HW_ATL2_H */ From 03fa512189eb9b55ded5f3e81ad638315555b340 Mon Sep 17 00:00:00 2001 From: Sameer Saurabh Date: Mon, 29 Nov 2021 05:28:27 -0800 Subject: [PATCH 144/231] Remove Half duplex mode speed capabilities. 
Since Half Duplex mode has been deprecated by the firmware, driver should not advertise Half Duplex speed in ethtool support link speed values. Fixes: 071a02046c262 ("net: atlantic: A2: half duplex support") Signed-off-by: Sameer Saurabh Signed-off-by: Igor Russkikh Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: David S. Miller --- drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c index 0a28428a0cb7..5dfc751572ed 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2.c @@ -65,11 +65,8 @@ const struct aq_hw_caps_s hw_atl2_caps_aqc113 = { AQ_NIC_RATE_5G | AQ_NIC_RATE_2G5 | AQ_NIC_RATE_1G | - AQ_NIC_RATE_1G_HALF | AQ_NIC_RATE_100M | - AQ_NIC_RATE_100M_HALF | - AQ_NIC_RATE_10M | - AQ_NIC_RATE_10M_HALF, + AQ_NIC_RATE_10M, }; const struct aq_hw_caps_s hw_atl2_caps_aqc115c = { From 2087ced0fc3a6d45203925750a2b1bcd5402e639 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Mon, 29 Nov 2021 05:28:28 -0800 Subject: [PATCH 145/231] atlantic: Fix statistics logic for production hardware B0 is the main and widespread device revision of atlantic2 HW. In the current state, driver will incorrectly fetch the statistics for this revision. Fixes: 5cfd54d7dc186 ("net: atlantic: minimal A2 fw_ops") Signed-off-by: Dmitry Bogdanov Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- .../net/ethernet/aquantia/atlantic/aq_hw.h | 2 + .../net/ethernet/aquantia/atlantic/aq_nic.c | 10 +- .../aquantia/atlantic/hw_atl/hw_atl_utils.c | 15 ++- .../aquantia/atlantic/hw_atl2/hw_atl2_utils.h | 38 ++++++- .../atlantic/hw_atl2/hw_atl2_utils_fw.c | 99 +++++++++++++++---- 5 files changed, 138 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h index 062a300a566a..dbd284660135 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h @@ -80,6 +80,8 @@ struct aq_hw_link_status_s { }; struct aq_stats_s { + u64 brc; + u64 btc; u64 uprc; u64 mprc; u64 bprc; diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c index 1acf544afeb4..02c4e3b4a6a5 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c @@ -905,8 +905,14 @@ u64 *aq_nic_get_stats(struct aq_nic_s *self, u64 *data) data[++i] = stats->mbtc; data[++i] = stats->bbrc; data[++i] = stats->bbtc; - data[++i] = stats->ubrc + stats->mbrc + stats->bbrc; - data[++i] = stats->ubtc + stats->mbtc + stats->bbtc; + if (stats->brc) + data[++i] = stats->brc; + else + data[++i] = stats->ubrc + stats->mbrc + stats->bbrc; + if (stats->btc) + data[++i] = stats->btc; + else + data[++i] = stats->ubtc + stats->mbtc + stats->bbtc; data[++i] = stats->dma_pkt_rc; data[++i] = stats->dma_pkt_tc; data[++i] = stats->dma_oct_rc; diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c index 3f1704cbe1cb..7e88d7234b14 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c @@ -867,12 +867,20 @@ static int hw_atl_fw1x_deinit(struct aq_hw_s *self) int hw_atl_utils_update_stats(struct aq_hw_s *self) { struct 
aq_stats_s *cs = &self->curr_stats; + struct aq_stats_s curr_stats = *cs; struct hw_atl_utils_mbox mbox; + bool corrupted_stats = false; hw_atl_utils_mpi_read_stats(self, &mbox); -#define AQ_SDELTA(_N_) (self->curr_stats._N_ += \ - mbox.stats._N_ - self->last_stats._N_) +#define AQ_SDELTA(_N_) \ +do { \ + if (!corrupted_stats && \ + ((s64)(mbox.stats._N_ - self->last_stats._N_)) >= 0) \ + curr_stats._N_ += mbox.stats._N_ - self->last_stats._N_; \ + else \ + corrupted_stats = true; \ +} while (0) if (self->aq_link_status.mbps) { AQ_SDELTA(uprc); @@ -892,6 +900,9 @@ int hw_atl_utils_update_stats(struct aq_hw_s *self) AQ_SDELTA(bbrc); AQ_SDELTA(bbtc); AQ_SDELTA(dpc); + + if (!corrupted_stats) + *cs = curr_stats; } #undef AQ_SDELTA diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.h index b66fa346581c..6bad64c77b87 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils.h @@ -239,7 +239,8 @@ struct version_s { u8 minor; u16 build; } phy; - u32 rsvd; + u32 drv_iface_ver:4; + u32 rsvd:28; }; struct link_status_s { @@ -424,7 +425,7 @@ struct cable_diag_status_s { u16 rsvd2; }; -struct statistics_s { +struct statistics_a0_s { struct { u32 link_up; u32 link_down; @@ -457,6 +458,33 @@ struct statistics_s { u32 reserve_fw_gap; }; +struct __packed statistics_b0_s { + u64 rx_good_octets; + u64 rx_pause_frames; + u64 rx_good_frames; + u64 rx_errors; + u64 rx_unicast_frames; + u64 rx_multicast_frames; + u64 rx_broadcast_frames; + + u64 tx_good_octets; + u64 tx_pause_frames; + u64 tx_good_frames; + u64 tx_errors; + u64 tx_unicast_frames; + u64 tx_multicast_frames; + u64 tx_broadcast_frames; + + u32 main_loop_cycles; +}; + +struct __packed statistics_s { + union __packed { + struct statistics_a0_s a0; + struct statistics_b0_s b0; + }; +}; + struct filter_caps_s { u8 l2_filters_base_index:6; u8 
flexible_filter_mask:2; @@ -545,7 +573,7 @@ struct management_status_s { u32 rsvd5; }; -struct fw_interface_out { +struct __packed fw_interface_out { struct transaction_counter_s transaction_id; struct version_s version; struct link_status_s link_status; @@ -569,7 +597,6 @@ struct fw_interface_out { struct core_dump_s core_dump; u32 rsvd11; struct statistics_s stats; - u32 rsvd12; struct filter_caps_s filter_caps; struct device_caps_s device_caps; u32 rsvd13; @@ -592,6 +619,9 @@ struct fw_interface_out { #define AQ_HOST_MODE_LOW_POWER 3U #define AQ_HOST_MODE_SHUTDOWN 4U +#define AQ_A2_FW_INTERFACE_A0 0 +#define AQ_A2_FW_INTERFACE_B0 1 + int hw_atl2_utils_initfw(struct aq_hw_s *self, const struct aq_fw_ops **fw_ops); int hw_atl2_utils_soft_reset(struct aq_hw_s *self); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c index e164ac5b55a8..58d426dda3ed 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl2/hw_atl2_utils_fw.c @@ -333,18 +333,22 @@ static int aq_a2_fw_get_mac_permanent(struct aq_hw_s *self, u8 *mac) return 0; } -static int aq_a2_fw_update_stats(struct aq_hw_s *self) +static void aq_a2_fill_a0_stats(struct aq_hw_s *self, + struct statistics_s *stats) { struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv; - struct statistics_s stats; - int err; + struct aq_stats_s *cs = &self->curr_stats; + struct aq_stats_s curr_stats = *cs; + bool corrupted_stats = false; - err = hw_atl2_shared_buffer_read_safe(self, stats, &stats); - if (err) - return err; - -#define AQ_SDELTA(_N_, _F_) (self->curr_stats._N_ += \ - stats.msm._F_ - priv->last_stats.msm._F_) +#define AQ_SDELTA(_N, _F) \ +do { \ + if (!corrupted_stats && \ + ((s64)(stats->a0.msm._F - priv->last_stats.a0.msm._F)) >= 0) \ + curr_stats._N += stats->a0.msm._F - priv->last_stats.a0.msm._F;\ + else \ + corrupted_stats = true; \ +} 
while (0) if (self->aq_link_status.mbps) { AQ_SDELTA(uprc, rx_unicast_frames); @@ -363,17 +367,76 @@ static int aq_a2_fw_update_stats(struct aq_hw_s *self) AQ_SDELTA(mbtc, tx_multicast_octets); AQ_SDELTA(bbrc, rx_broadcast_octets); AQ_SDELTA(bbtc, tx_broadcast_octets); + + if (!corrupted_stats) + *cs = curr_stats; } #undef AQ_SDELTA - self->curr_stats.dma_pkt_rc = - hw_atl_stats_rx_dma_good_pkt_counter_get(self); - self->curr_stats.dma_pkt_tc = - hw_atl_stats_tx_dma_good_pkt_counter_get(self); - self->curr_stats.dma_oct_rc = - hw_atl_stats_rx_dma_good_octet_counter_get(self); - self->curr_stats.dma_oct_tc = - hw_atl_stats_tx_dma_good_octet_counter_get(self); - self->curr_stats.dpc = hw_atl_rpb_rx_dma_drop_pkt_cnt_get(self); + +} + +static void aq_a2_fill_b0_stats(struct aq_hw_s *self, + struct statistics_s *stats) +{ + struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv; + struct aq_stats_s *cs = &self->curr_stats; + struct aq_stats_s curr_stats = *cs; + bool corrupted_stats = false; + +#define AQ_SDELTA(_N, _F) \ +do { \ + if (!corrupted_stats && \ + ((s64)(stats->b0._F - priv->last_stats.b0._F)) >= 0) \ + curr_stats._N += stats->b0._F - priv->last_stats.b0._F; \ + else \ + corrupted_stats = true; \ +} while (0) + + if (self->aq_link_status.mbps) { + AQ_SDELTA(uprc, rx_unicast_frames); + AQ_SDELTA(mprc, rx_multicast_frames); + AQ_SDELTA(bprc, rx_broadcast_frames); + AQ_SDELTA(erpr, rx_errors); + AQ_SDELTA(brc, rx_good_octets); + + AQ_SDELTA(uptc, tx_unicast_frames); + AQ_SDELTA(mptc, tx_multicast_frames); + AQ_SDELTA(bptc, tx_broadcast_frames); + AQ_SDELTA(erpt, tx_errors); + AQ_SDELTA(btc, tx_good_octets); + + if (!corrupted_stats) + *cs = curr_stats; + } +#undef AQ_SDELTA +} + +static int aq_a2_fw_update_stats(struct aq_hw_s *self) +{ + struct hw_atl2_priv *priv = (struct hw_atl2_priv *)self->priv; + struct aq_stats_s *cs = &self->curr_stats; + struct statistics_s stats; + struct version_s version; + int err; + + err = 
hw_atl2_shared_buffer_read_safe(self, version, &version); + if (err) + return err; + + err = hw_atl2_shared_buffer_read_safe(self, stats, &stats); + if (err) + return err; + + if (version.drv_iface_ver == AQ_A2_FW_INTERFACE_A0) + aq_a2_fill_a0_stats(self, &stats); + else + aq_a2_fill_b0_stats(self, &stats); + + cs->dma_pkt_rc = hw_atl_stats_rx_dma_good_pkt_counter_get(self); + cs->dma_pkt_tc = hw_atl_stats_tx_dma_good_pkt_counter_get(self); + cs->dma_oct_rc = hw_atl_stats_rx_dma_good_octet_counter_get(self); + cs->dma_oct_tc = hw_atl_stats_tx_dma_good_octet_counter_get(self); + cs->dpc = hw_atl_rpb_rx_dma_drop_pkt_cnt_get(self); memcpy(&priv->last_stats, &stats, sizeof(stats)); From 060a0fb721ec5bbe02ae322e434ec87dc25ed6e9 Mon Sep 17 00:00:00 2001 From: Sameer Saurabh Date: Mon, 29 Nov 2021 05:28:29 -0800 Subject: [PATCH 146/231] atlantic: Remove warn trace message. Remove the warn trace message - it's not a correct check here, because the function can still be called on the device in DOWN state Fixes: 508f2e3dce454 ("net: atlantic: split rx and tx per-queue stats") Signed-off-by: Sameer Saurabh Signed-off-by: Sudarsana Reddy Kalluru Signed-off-by: Igor Russkikh Signed-off-by: David S. 
Miller --- drivers/net/ethernet/aquantia/atlantic/aq_vec.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c index d281322d7dd2..f4774cf051c9 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_vec.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_vec.c @@ -362,9 +362,6 @@ unsigned int aq_vec_get_sw_stats(struct aq_vec_s *self, const unsigned int tc, u { unsigned int count; - WARN_ONCE(!aq_vec_is_valid_tc(self, tc), - "Invalid tc %u (#rx=%u, #tx=%u)\n", - tc, self->rx_rings, self->tx_rings); if (!aq_vec_is_valid_tc(self, tc)) return 0; From cdef485217d30382f3bf6448c54b4401648fe3f1 Mon Sep 17 00:00:00 2001 From: msizanoen1 Date: Tue, 23 Nov 2021 13:48:32 +0100 Subject: [PATCH 147/231] ipv6: fix memory leak in fib6_rule_suppress The kernel leaks memory when a `fib` rule is present in IPv6 nftables firewall rules and a suppress_prefix rule is present in the IPv6 routing rules (used by certain tools such as wg-quick). In such scenarios, every incoming packet will leak an allocation in `ip6_dst_cache` slab cache. After some hours of `bpftrace`-ing and source code reading, I tracked down the issue to ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule"). The problem with that change is that the generic `args->flags` always have `FIB_LOOKUP_NOREF` set[1][2] but the IPv6-specific flag `RT6_LOOKUP_F_DST_NOREF` might not be, leading to `fib6_rule_suppress` not decreasing the refcount when needed. How to reproduce: - Add the following nftables rule to a prerouting chain: meta nfproto ipv6 fib saddr . mark . iif oif missing drop This can be done with: sudo nft create table inet test sudo nft create chain inet test test_chain '{ type filter hook prerouting priority filter + 10; policy accept; }' sudo nft add rule inet test test_chain meta nfproto ipv6 fib saddr . mark . 
iif oif missing drop - Run: sudo ip -6 rule add table main suppress_prefixlength 0 - Watch `sudo slabtop -o | grep ip6_dst_cache` to see memory usage increase with every incoming ipv6 packet. This patch exposes the protocol-specific flags to the protocol-specific `suppress` function, and checks the protocol-specific `flags` argument for RT6_LOOKUP_F_DST_NOREF instead of the generic FIB_LOOKUP_NOREF when decreasing the refcount, like this. [1]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L71 [2]: https://github.com/torvalds/linux/blob/ca7a03c4175366a92cee0ccc4fec0038c3266e26/net/ipv6/fib6_rules.c#L99 Link: https://bugzilla.kernel.org/show_bug.cgi?id=215105 Fixes: ca7a03c41753 ("ipv6: do not free rt if FIB_LOOKUP_NOREF is set on suppress rule") Cc: stable@vger.kernel.org Signed-off-by: Jason A. Donenfeld Signed-off-by: David S. Miller --- include/net/fib_rules.h | 4 +++- net/core/fib_rules.c | 2 +- net/ipv4/fib_rules.c | 1 + net/ipv6/fib6_rules.c | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 4b10676c69d1..bd07484ab9dd 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -69,7 +69,7 @@ struct fib_rules_ops { int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); - bool (*suppress)(struct fib_rule *, + bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); @@ -218,7 +218,9 @@ INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg)); #endif diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 79df7cd9dbc1..1bb567a3b329 100644 ---
a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -323,7 +323,7 @@ jumped: if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress, fib6_rule_suppress, fib4_rule_suppress, - rule, arg)) + rule, flags, arg)) continue; if (err != -EAGAIN) { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index ce54a30c2ef1..364ad3446b2f 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -141,6 +141,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib_result *result = (struct fib_result *) arg->result; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 40f3e4f9f33a..dcedfe29d9d9 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -267,6 +267,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule, } INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, + int flags, struct fib_lookup_arg *arg) { struct fib6_result *res = arg->result; @@ -294,8 +295,7 @@ INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule, return false; suppress_route: - if (!(arg->flags & FIB_LOOKUP_NOREF)) - ip6_rt_put(rt); + ip6_rt_put_flags(rt, flags); return true; } From ca77fba821351190777b236ce749d7c4d353102e Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Sun, 21 Nov 2021 04:16:07 +0000 Subject: [PATCH 148/231] rxrpc: Fix rxrpc_peer leak in rxrpc_look_up_bundle() Need to call rxrpc_put_peer() for bundle candidate before kfree() as it holds a ref to rxrpc_peer. 
[DH: v2: Changed to abstract out the bundle freeing code into a function] Fixes: 245500d853e9 ("rxrpc: Rewrite the client connection manager") Signed-off-by: Eiichi Tsukata Signed-off-by: David Howells Reviewed-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/r/20211121041608.133740-1-eiichi.tsukata@nutanix.com/ # v1 --- net/rxrpc/conn_client.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index dbea0bfee48e..8120138dac01 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -135,16 +135,20 @@ struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle) return bundle; } +static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) +{ + rxrpc_put_peer(bundle->params.peer); + kfree(bundle); +} + void rxrpc_put_bundle(struct rxrpc_bundle *bundle) { unsigned int d = bundle->debug_id; unsigned int u = atomic_dec_return(&bundle->usage); _debug("PUT B=%x %u", d, u); - if (u == 0) { - rxrpc_put_peer(bundle->params.peer); - kfree(bundle); - } + if (u == 0) + rxrpc_free_bundle(bundle); } /* @@ -328,7 +332,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c return candidate; found_bundle_free: - kfree(candidate); + rxrpc_free_bundle(candidate); found_bundle: rxrpc_get_bundle(bundle); spin_unlock(&local->client_bundles_lock); From beacff50edbd6c9659a6f15fc7f6126909fade29 Mon Sep 17 00:00:00 2001 From: Eiichi Tsukata Date: Sun, 21 Nov 2021 04:16:08 +0000 Subject: [PATCH 149/231] rxrpc: Fix rxrpc_local leak in rxrpc_lookup_peer() Need to call rxrpc_put_local() for peer candidate before kfree() as it holds a ref to rxrpc_local. 
[DH: v2: Changed to abstract the peer freeing code out into a function] Fixes: 9ebeddef58c4 ("rxrpc: rxrpc_peer needs to hold a ref on the rxrpc_local record") Signed-off-by: Eiichi Tsukata Signed-off-by: David Howells Reviewed-by: Marc Dionne cc: linux-afs@lists.infradead.org Link: https://lore.kernel.org/all/20211121041608.133740-2-eiichi.tsukata@nutanix.com/ # v1 --- net/rxrpc/peer_object.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 68396d052052..0298fe2ad6d3 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -299,6 +299,12 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, return peer; } +static void rxrpc_free_peer(struct rxrpc_peer *peer) +{ + rxrpc_put_local(peer->local); + kfree_rcu(peer, rcu); +} + /* * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context @@ -365,7 +371,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) - kfree(candidate); + rxrpc_free_peer(candidate); else peer = candidate; } @@ -420,8 +426,7 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) list_del_init(&peer->keepalive_link); spin_unlock_bh(&rxnet->peer_hash_lock); - rxrpc_put_local(peer->local); - kfree_rcu(peer, rcu); + rxrpc_free_peer(peer); } /* @@ -457,8 +462,7 @@ void rxrpc_put_peer_locked(struct rxrpc_peer *peer) if (n == 0) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - rxrpc_put_local(peer->local); - kfree_rcu(peer, rcu); + rxrpc_free_peer(peer); } } From 191587cd1a5f36852a0fc32cff2d5bc7680551db Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 29 Nov 2021 14:41:48 +0100 Subject: [PATCH 150/231] mt76: fix key pointer overwrite in mt7921s_write_txwi/mt7663_usb_sdio_write_txwi Fix pointer overwrite in mt7921s_tx_prepare_skb and 
mt7663_usb_sdio_tx_prepare_skb routines, since commit 2a9e9857473b ("mt76: fix possible pktid leak") moved mt76_tx_status_skb_add() out of mt7921s_write_txwi()/mt7663_usb_sdio_write_txwi(), which now overwrites the hw key pointer in the ieee80211_tx_info structure. Fix the issue by saving the key pointer before running mt76_tx_status_skb_add(). Fixes: 2a9e9857473b ("mt76: fix possible pktid leak") Tested-by: Deren Wu Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/eba40c84b6d114f618e2ae486cc6d0f2e9272cf9.1638193069.git.lorenzo@kernel.org --- drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c | 11 +++++------ drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c index bfe6c1579dc1..5a6d7829c6e0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/usb_sdio.c @@ -43,13 +43,11 @@ EXPORT_SYMBOL_GPL(mt7663_usb_sdio_reg_map); static void mt7663_usb_sdio_write_txwi(struct mt7615_dev *dev, struct mt76_wcid *wcid, enum mt76_txq_id qid, struct ieee80211_sta *sta, - int pid, struct sk_buff *skb) + struct ieee80211_key_conf *key, int pid, + struct sk_buff *skb) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_key_conf *key = info->control.hw_key; - __le32 *txwi; + __le32 *txwi = (__le32 *)(skb->data - MT_USB_TXD_SIZE); - txwi = (__le32 *)(skb->data - MT_USB_TXD_SIZE); memset(txwi, 0, MT_USB_TXD_SIZE); mt7615_mac_write_txwi(dev, txwi, skb, wcid, sta, pid, key, false); skb_push(skb, MT_USB_TXD_SIZE); @@ -188,6 +186,7 @@ int mt7663_usb_sdio_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, struct mt7615_dev *dev = container_of(mdev, struct mt7615_dev, mt76); struct sk_buff *skb = tx_info->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct
ieee80211_key_conf *key = info->control.hw_key; struct mt7615_sta *msta; int pad, err, pktid; @@ -205,7 +204,7 @@ int mt7663_usb_sdio_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, } pktid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); - mt7663_usb_sdio_write_txwi(dev, wcid, qid, sta, pktid, skb); + mt7663_usb_sdio_write_txwi(dev, wcid, qid, sta, key, pktid, skb); if (mt76_is_usb(mdev)) { u32 len = skb->len; diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c b/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c index 85b3d88f8ecc..bdec508b6b9f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/sdio_mac.c @@ -142,13 +142,11 @@ out: static void mt7921s_write_txwi(struct mt7921_dev *dev, struct mt76_wcid *wcid, enum mt76_txq_id qid, struct ieee80211_sta *sta, - int pid, struct sk_buff *skb) + struct ieee80211_key_conf *key, int pid, + struct sk_buff *skb) { - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_key_conf *key = info->control.hw_key; - __le32 *txwi; + __le32 *txwi = (__le32 *)(skb->data - MT_SDIO_TXD_SIZE); - txwi = (__le32 *)(skb->data - MT_SDIO_TXD_SIZE); memset(txwi, 0, MT_SDIO_TXD_SIZE); mt7921_mac_write_txwi(dev, txwi, skb, wcid, key, pid, false); skb_push(skb, MT_SDIO_TXD_SIZE); @@ -161,6 +159,7 @@ int mt7921s_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, { struct mt7921_dev *dev = container_of(mdev, struct mt7921_dev, mt76); struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_info->skb); + struct ieee80211_key_conf *key = info->control.hw_key; struct sk_buff *skb = tx_info->skb; int err, pad, pktid; @@ -180,7 +179,7 @@ int mt7921s_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr, } pktid = mt76_tx_status_skb_add(&dev->mt76, wcid, skb); - mt7921s_write_txwi(dev, wcid, qid, sta, pktid, skb); + mt7921s_write_txwi(dev, wcid, qid, sta, key, pktid, skb); mt7921_skb_add_sdio_hdr(skb, MT7921_SDIO_DATA); pad = round_up(skb->len, 4) - 
skb->len; From ddca5b0eba4ef69338cbc210d3fb3332499128f9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Nov 2021 15:26:43 +0000 Subject: [PATCH 151/231] netfs: Adjust docs after foliation Adjust the netfslib docs in light of the foliation changes. Also un-kdoc-mark netfs_skip_folio_read() since it's internal and isn't part of the API. Signed-off-by: David Howells Reviewed-by: Jeff Layton cc: Matthew Wilcox cc: linux-cachefs@redhat.com cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/163706992597.3179783.18360472879717076435.stgit@warthog.procyon.org.uk/ Signed-off-by: Linus Torvalds --- Documentation/filesystems/netfs_library.rst | 95 ++++++++++++--------- fs/netfs/read_helper.c | 4 +- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/Documentation/filesystems/netfs_library.rst b/Documentation/filesystems/netfs_library.rst index bb68d39f03b7..375baca7edcd 100644 --- a/Documentation/filesystems/netfs_library.rst +++ b/Documentation/filesystems/netfs_library.rst @@ -1,7 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 ================================= -NETWORK FILESYSTEM HELPER LIBRARY +Network Filesystem Helper Library ================================= .. Contents: @@ -37,22 +37,22 @@ into a common call framework. The following services are provided: - * Handles transparent huge pages (THPs). + * Handle folios that span multiple pages. - * Insulates the netfs from VM interface changes. + * Insulate the netfs from VM interface changes. - * Allows the netfs to arbitrarily split reads up into pieces, even ones that - don't match page sizes or page alignments and that may cross pages. + * Allow the netfs to arbitrarily split reads up into pieces, even ones that + don't match folio sizes or folio alignments and that may cross folios. - * Allows the netfs to expand a readahead request in both directions to meet - its needs. + * Allow the netfs to expand a readahead request in both directions to meet its + needs. 
- * Allows the netfs to partially fulfil a read, which will then be resubmitted. + * Allow the netfs to partially fulfil a read, which will then be resubmitted. - * Handles local caching, allowing cached data and server-read data to be + * Handle local caching, allowing cached data and server-read data to be interleaved for a single request. - * Handles clearing of bufferage that aren't on the server. + * Handle clearing of bufferage that aren't on the server. * Handle retrying of reads that failed, switching reads from the cache to the server as necessary. @@ -70,22 +70,22 @@ Read Helper Functions Three read helpers are provided:: - * void netfs_readahead(struct readahead_control *ractl, - const struct netfs_read_request_ops *ops, - void *netfs_priv);`` - * int netfs_readpage(struct file *file, - struct page *page, - const struct netfs_read_request_ops *ops, - void *netfs_priv); - * int netfs_write_begin(struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned int len, - unsigned int flags, - struct page **_page, - void **_fsdata, - const struct netfs_read_request_ops *ops, - void *netfs_priv); + void netfs_readahead(struct readahead_control *ractl, + const struct netfs_read_request_ops *ops, + void *netfs_priv); + int netfs_readpage(struct file *file, + struct folio *folio, + const struct netfs_read_request_ops *ops, + void *netfs_priv); + int netfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, + unsigned int len, + unsigned int flags, + struct folio **_folio, + void **_fsdata, + const struct netfs_read_request_ops *ops, + void *netfs_priv); Each corresponds to a VM operation, with the addition of a couple of parameters for the use of the read helpers: @@ -103,8 +103,8 @@ Both of these values will be stored into the read request structure. 
For ->readahead() and ->readpage(), the network filesystem should just jump into the corresponding read helper; whereas for ->write_begin(), it may be a little more complicated as the network filesystem might want to flush -conflicting writes or track dirty data and needs to put the acquired page if an -error occurs after calling the helper. +conflicting writes or track dirty data and needs to put the acquired folio if +an error occurs after calling the helper. The helpers manage the read request, calling back into the network filesystem through the suppplied table of operations. Waits will be performed as @@ -253,7 +253,7 @@ through which it can issue requests and negotiate:: void (*issue_op)(struct netfs_read_subrequest *subreq); bool (*is_still_valid)(struct netfs_read_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct page *page, void **_fsdata); + struct folio *folio, void **_fsdata); void (*done)(struct netfs_read_request *rreq); void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; @@ -313,13 +313,14 @@ The operations are as follows: There is no return value; the netfs_subreq_terminated() function should be called to indicate whether or not the operation succeeded and how much data - it transferred. The filesystem also should not deal with setting pages + it transferred. The filesystem also should not deal with setting folios uptodate, unlocking them or dropping their refs - the helpers need to deal with this as they have to coordinate with copying to the local cache. - Note that the helpers have the pages locked, but not pinned. It is possible - to use the ITER_XARRAY iov iterator to refer to the range of the inode that - is being operated upon without the need to allocate large bvec tables. + Note that the helpers have the folios locked, but not pinned. 
It is + possible to use the ITER_XARRAY iov iterator to refer to the range of the + inode that is being operated upon without the need to allocate large bvec + tables. * ``is_still_valid()`` @@ -330,15 +331,15 @@ The operations are as follows: * ``check_write_begin()`` [Optional] This is called from the netfs_write_begin() helper once it has - allocated/grabbed the page to be modified to allow the filesystem to flush + allocated/grabbed the folio to be modified to allow the filesystem to flush conflicting state before allowing it to be modified. - It should return 0 if everything is now fine, -EAGAIN if the page should be + It should return 0 if everything is now fine, -EAGAIN if the folio should be regrabbed and any other error code to abort the operation. * ``done`` - [Optional] This is called after the pages in the request have all been + [Optional] This is called after the folios in the request have all been unlocked (and marked uptodate if applicable). * ``cleanup`` @@ -390,7 +391,7 @@ The read helpers work by the following general procedure: * If NETFS_SREQ_CLEAR_TAIL was set, a short read will be cleared to the end of the slice instead of reissuing. - * Once the data is read, the pages that have been fully read/cleared: + * Once the data is read, the folios that have been fully read/cleared: * Will be marked uptodate. @@ -398,11 +399,11 @@ The read helpers work by the following general procedure: * Unlocked - * Any pages that need writing to the cache will then have DIO writes issued. + * Any folios that need writing to the cache will then have DIO writes issued. * Synchronous operations will wait for reading to be complete. - * Writes to the cache will proceed asynchronously and the pages will have the + * Writes to the cache will proceed asynchronously and the folios will have the PG_fscache mark removed when that completes. * The request structures will be cleaned up when everything has completed. 
@@ -452,6 +453,9 @@ operation table looks like the following:: netfs_io_terminated_t term_func, void *term_func_priv); + int (*prepare_write)(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size); + int (*write)(struct netfs_cache_resources *cres, loff_t start_pos, struct iov_iter *iter, @@ -509,6 +513,14 @@ The methods defined in the table are: indicating whether the termination is definitely happening in the caller's context. + * ``prepare_write()`` + + [Required] Called to adjust a write to the cache and check that there is + sufficient space in the cache. The start and length values indicate the + size of the write that netfslib is proposing, and this can be adjusted by + the cache to respect DIO boundaries. The file size is passed for + information. + * ``write()`` [Required] Called to write to the cache. The start file offset is given @@ -525,4 +537,9 @@ not the read request structure as they could be used in other situations where there isn't a read request structure as well, such as writing dirty data to the cache. + +API Function Reference +====================== + .. kernel-doc:: include/linux/netfs.h +.. kernel-doc:: fs/netfs/read_helper.c diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 9320a42dfaf9..7046f9bdd8dc 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -1008,8 +1008,8 @@ out: } EXPORT_SYMBOL(netfs_readpage); -/** - * netfs_skip_folio_read - prep a folio for writing without reading first +/* + * Prepare a folio for writing without reading first * @folio: The folio being prepared * @pos: starting position for the write * @len: length of write From ae9287811ba75571cd69505d50ab0e612ace8572 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:20 -0500 Subject: [PATCH 152/231] wireguard: allowedips: add missing __rcu annotation to satisfy sparse A __rcu annotation got lost during refactoring, which caused sparse to become enraged. 
Fixes: bf7b042dc62a ("wireguard: allowedips: free empty intermediate nodes when removing single node") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/allowedips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireguard/allowedips.c b/drivers/net/wireguard/allowedips.c index b7197e80f226..9a4c8ff32d9d 100644 --- a/drivers/net/wireguard/allowedips.c +++ b/drivers/net/wireguard/allowedips.c @@ -163,7 +163,7 @@ static bool node_placement(struct allowedips_node __rcu *trie, const u8 *key, return exact; } -static inline void connect_node(struct allowedips_node **parent, u8 bit, struct allowedips_node *node) +static inline void connect_node(struct allowedips_node __rcu **parent, u8 bit, struct allowedips_node *node) { node->parent_bit_packed = (unsigned long)parent | bit; rcu_assign_pointer(*parent, node); From 03ff1b1def73f817e196bf96ab36ac259490bd7c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:21 -0500 Subject: [PATCH 153/231] wireguard: selftests: increase default dmesg log size The selftests currently parse the kernel log at the end to track potential memory leaks. With these tests now reading off the end of the buffer, due to recent optimizations, some creation messages were lost, making the tests think that there was a free without an alloc. Fix this by increasing the kernel log size. Fixes: 24b70eeeb4f4 ("wireguard: use synchronize_net rather than synchronize_rcu") Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/kernel.config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 74db83a0aedd..a9b5a520a1d2 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -66,6 +66,7 @@ CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15 +CONFIG_LOG_BUF_SHIFT=18 CONFIG_PRINTK_TIME=y CONFIG_BLK_DEV_INITRD=y CONFIG_LEGACY_VSYSCALL_NONE=y From 782c72af567fc2ef09bd7615d0307f24de72c7e0 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:22 -0500 Subject: [PATCH 154/231] wireguard: selftests: actually test for routing loops We previously removed the restriction on looping to self, and then added a test to make sure the kernel didn't blow up during a routing loop. The kernel didn't blow up, thankfully, but on certain architectures where skb fragmentation is easier, such as ppc64, the skbs weren't actually being discarded after a few rounds through. But the test wasn't catching this. So actually test explicitly for massive increases in tx to see if we have a routing loop. Note that the actual loop problem will need to be addressed in a different commit. Fixes: b673e24aad36 ("wireguard: socket: remove errant restriction on looping to self") Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/netns.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index ebc4ee0fe179..2e5c1630885e 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -276,7 +276,11 @@ n0 ping -W 1 -c 1 192.168.241.2 n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7 ip2 link del wg0 ip2 link del wg1 -! n0 ping -W 1 -c 10 -f 192.168.241.2 || false # Should not crash kernel +read _ _ tx_bytes_before < <(n0 wg show wg1 transfer) +! n0 ping -W 1 -c 10 -f 192.168.241.2 || false +sleep 1 +read _ _ tx_bytes_after < <(n0 wg show wg1 transfer) +(( tx_bytes_after - tx_bytes_before < 70000 )) ip0 link del wg1 ip1 link del wg0 From b251b711a92189d558b07fde5a7ccd5a7915ebdd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 29 Nov 2021 10:39:23 -0500 Subject: [PATCH 155/231] wireguard: main: rename 'mod_init' & 'mod_exit' functions to be module-specific Rename module_init & module_exit functions that are named "mod_init" and "mod_exit" so that they are unique in both the System.map file and in initcall_debug output instead of showing up as almost anonymous "mod_init". This is helpful for debugging and in determining how long certain module_init calls take to execute. Signed-off-by: Randy Dunlap Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/wireguard/main.c b/drivers/net/wireguard/main.c index 75dbe77b0b4b..ee4da9ab8013 100644 --- a/drivers/net/wireguard/main.c +++ b/drivers/net/wireguard/main.c @@ -17,7 +17,7 @@ #include #include -static int __init mod_init(void) +static int __init wg_mod_init(void) { int ret; @@ -60,7 +60,7 @@ err_allowedips: return ret; } -static void __exit mod_exit(void) +static void __exit wg_mod_exit(void) { wg_genetlink_uninit(); wg_device_uninit(); @@ -68,8 +68,8 @@ static void __exit mod_exit(void) wg_allowedips_slab_uninit(); } -module_init(mod_init); -module_exit(mod_exit); +module_init(wg_mod_init); +module_exit(wg_mod_exit); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("WireGuard secure network tunnel"); MODULE_AUTHOR("Jason A. Donenfeld "); From 7e938beb8321d34f040557b8915b228af125f73c Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 29 Nov 2021 10:39:24 -0500 Subject: [PATCH 156/231] wireguard: selftests: rename DEBUG_PI_LIST to DEBUG_PLIST DEBUG_PI_LIST was renamed to DEBUG_PLIST since 8e18faeac3 ("lib/plist: rename DEBUG_PI_LIST to DEBUG_PLIST"). Signed-off-by: Li Zhijian Fixes: 8e18faeac3e4 ("lib/plist: rename DEBUG_PI_LIST to DEBUG_PLIST") Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- tools/testing/selftests/wireguard/qemu/debug.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index fe07d97df9fa..2b321b8a96cf 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -47,7 +47,7 @@ CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_TRACE_IRQFLAGS=y CONFIG_DEBUG_BUGVERBOSE=y CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_PI_LIST=y +CONFIG_DEBUG_PLIST=y CONFIG_PROVE_RCU=y CONFIG_SPARSE_RCU_POINTER=y CONFIG_RCU_CPU_STALL_TIMEOUT=21 From 20ae1d6aa159eb91a9bf09ff92ccaa94dbea92c2 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:25 -0500 Subject: [PATCH 157/231] wireguard: device: reset peer src endpoint when netns exits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each peer's endpoint contains a dst_cache entry that takes a reference to another netdev. When the containing namespace exits, we take down the socket and prevent future sockets from being created (by setting creating_net to NULL), which removes that potential reference on the netns. However, it doesn't release references to the netns that a netdev cached in dst_cache might be taking, so the netns still might fail to exit. Since the socket is gimped anyway, we can simply clear all the dst_caches (by way of clearing the endpoint src), which will release all references. However, the current dst_cache_reset function only releases those references lazily. But it turns out that all of our usages of wg_socket_clear_peer_endpoint_src are called from contexts that are not exactly high-speed or bottle-necked. For example, when there's connection difficulty, or when userspace is reconfiguring the interface. And in particular for this patch, when the netns is exiting. 
So for those cases, it makes more sense to call dst_release immediately. For that, we add a small helper function to dst_cache. This patch also adds a test to netns.sh from Hangbin Liu to ensure this doesn't regress. Tested-by: Hangbin Liu Reported-by: Xiumei Mu Cc: Toke Høiland-Jørgensen Cc: Paolo Abeni Fixes: 900575aa33a3 ("wireguard: device: avoid circular netns references") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 3 +++ drivers/net/wireguard/socket.c | 2 +- include/net/dst_cache.h | 11 ++++++++++ net/core/dst_cache.c | 19 +++++++++++++++++ tools/testing/selftests/wireguard/netns.sh | 24 +++++++++++++++++++++- 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 551ddaaaf540..77e64ea6be67 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -398,6 +398,7 @@ static struct rtnl_link_ops link_ops __read_mostly = { static void wg_netns_pre_exit(struct net *net) { struct wg_device *wg; + struct wg_peer *peer; rtnl_lock(); list_for_each_entry(wg, &device_list, device_list) { @@ -407,6 +408,8 @@ static void wg_netns_pre_exit(struct net *net) mutex_lock(&wg->device_update_lock); rcu_assign_pointer(wg->creating_net, NULL); wg_socket_reinit(wg, NULL, NULL); + list_for_each_entry(peer, &wg->peer_list, peer_list) + wg_socket_clear_peer_endpoint_src(peer); mutex_unlock(&wg->device_update_lock); } } diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 8c496b747108..6f07b949cb81 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -308,7 +308,7 @@ void wg_socket_clear_peer_endpoint_src(struct wg_peer *peer) { write_lock_bh(&peer->endpoint_lock); memset(&peer->endpoint.src6, 0, sizeof(peer->endpoint.src6)); - dst_cache_reset(&peer->endpoint_cache); + dst_cache_reset_now(&peer->endpoint_cache); write_unlock_bh(&peer->endpoint_lock); } diff --git 
a/include/net/dst_cache.h b/include/net/dst_cache.h index 67634675e919..df6622a5fe98 100644 --- a/include/net/dst_cache.h +++ b/include/net/dst_cache.h @@ -79,6 +79,17 @@ static inline void dst_cache_reset(struct dst_cache *dst_cache) dst_cache->reset_ts = jiffies; } +/** + * dst_cache_reset_now - invalidate the cache contents immediately + * @dst_cache: the cache + * + * The caller must be sure there are no concurrent users, as this frees + * all dst_cache users immediately, rather than waiting for the next + * per-cpu usage like dst_cache_reset does. Most callers should use the + * higher speed lazily-freed dst_cache_reset function instead. + */ +void dst_cache_reset_now(struct dst_cache *dst_cache); + /** * dst_cache_init - initialize the cache, allocating the required storage * @dst_cache: the cache diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index be74ab4551c2..0ccfd5fa5cb9 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -162,3 +162,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache) free_percpu(dst_cache->cache); } EXPORT_SYMBOL_GPL(dst_cache_destroy); + +void dst_cache_reset_now(struct dst_cache *dst_cache) +{ + int i; + + if (!dst_cache->cache) + return; + + dst_cache->reset_ts = jiffies; + for_each_possible_cpu(i) { + struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i); + struct dst_entry *dst = idst->dst; + + idst->cookie = 0; + idst->dst = NULL; + dst_release(dst); + } +} +EXPORT_SYMBOL_GPL(dst_cache_reset_now); diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh index 2e5c1630885e..8a9461aa0878 100755 --- a/tools/testing/selftests/wireguard/netns.sh +++ b/tools/testing/selftests/wireguard/netns.sh @@ -613,6 +613,28 @@ ip0 link set wg0 up kill $ncat_pid ip0 link del wg0 +# Ensure that dst_cache references don't outlive netns lifetime +ip1 link add dev wg0 type wireguard +ip2 link add dev wg0 type wireguard +configure_peers +ip1 link add veth1 type veth peer 
name veth2 +ip1 link set veth2 netns $netns2 +ip1 addr add fd00:aa::1/64 dev veth1 +ip2 addr add fd00:aa::2/64 dev veth2 +ip1 link set veth1 up +ip2 link set veth2 up +waitiface $netns1 veth1 +waitiface $netns2 veth2 +ip1 -6 route add default dev veth1 via fd00:aa::2 +ip2 -6 route add default dev veth2 via fd00:aa::1 +n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2 +n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1 +n1 ping6 -c 1 fd00::2 +pp ip netns delete $netns1 +pp ip netns delete $netns2 +pp ip netns add $netns1 +pp ip netns add $netns2 + # Ensure there aren't circular reference loops ip1 link add wg1 type wireguard ip2 link add wg2 type wireguard @@ -631,7 +653,7 @@ while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do done < /dev/kmsg alldeleted=1 for object in "${!objects[@]}"; do - if [[ ${objects["$object"]} != *createddestroyed ]]; then + if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then echo "Error: $object: merely ${objects["$object"]}" >&3 alldeleted=0 fi From 886fcee939adb5e2af92741b90643a59f2b54f97 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:26 -0500 Subject: [PATCH 158/231] wireguard: receive: use ring buffer for incoming handshakes Apparently the spinlock on incoming_handshake's skb_queue is highly contended, and a torrent of handshake or cookie packets can bring the data plane to its knees, simply by virtue of enqueueing the handshake packets to be processed asynchronously. So, we try switching this to a ring buffer to hopefully have less lock contention. This alleviates the problem somewhat, though it still isn't perfect, so future patches will have to improve this further. However, it at least doesn't completely diminish the data plane. Reported-by: Streun Fabio Reported-by: Joel Wanner Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/device.c | 36 ++++++++++++++++---------------- drivers/net/wireguard/device.h | 9 +++----- drivers/net/wireguard/queueing.c | 6 +++--- drivers/net/wireguard/queueing.h | 2 +- drivers/net/wireguard/receive.c | 27 +++++++++++------------- 5 files changed, 37 insertions(+), 43 deletions(-) diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 77e64ea6be67..a46067c38bf5 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -98,6 +98,7 @@ static int wg_stop(struct net_device *dev) { struct wg_device *wg = netdev_priv(dev); struct wg_peer *peer; + struct sk_buff *skb; mutex_lock(&wg->device_update_lock); list_for_each_entry(peer, &wg->peer_list, peer_list) { @@ -108,7 +109,9 @@ static int wg_stop(struct net_device *dev) wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); } mutex_unlock(&wg->device_update_lock); - skb_queue_purge(&wg->incoming_handshakes); + while ((skb = ptr_ring_consume(&wg->handshake_queue.ring)) != NULL) + kfree_skb(skb); + atomic_set(&wg->handshake_queue_len, 0); wg_socket_reinit(wg, NULL, NULL); return 0; } @@ -235,14 +238,13 @@ static void wg_destruct(struct net_device *dev) destroy_workqueue(wg->handshake_receive_wq); destroy_workqueue(wg->handshake_send_wq); destroy_workqueue(wg->packet_crypt_wq); - wg_packet_queue_free(&wg->decrypt_queue); - wg_packet_queue_free(&wg->encrypt_queue); + wg_packet_queue_free(&wg->handshake_queue, true); + wg_packet_queue_free(&wg->decrypt_queue, false); + wg_packet_queue_free(&wg->encrypt_queue, false); rcu_barrier(); /* Wait for all the peers to be actually freed. 
*/ wg_ratelimiter_uninit(); memzero_explicit(&wg->static_identity, sizeof(wg->static_identity)); - skb_queue_purge(&wg->incoming_handshakes); free_percpu(dev->tstats); - free_percpu(wg->incoming_handshakes_worker); kvfree(wg->index_hashtable); kvfree(wg->peer_hashtable); mutex_unlock(&wg->device_update_lock); @@ -298,7 +300,6 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, init_rwsem(&wg->static_identity.lock); mutex_init(&wg->socket_update_lock); mutex_init(&wg->device_update_lock); - skb_queue_head_init(&wg->incoming_handshakes); wg_allowedips_init(&wg->peer_allowedips); wg_cookie_checker_init(&wg->cookie_checker, wg); INIT_LIST_HEAD(&wg->peer_list); @@ -316,16 +317,10 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, if (!dev->tstats) goto err_free_index_hashtable; - wg->incoming_handshakes_worker = - wg_packet_percpu_multicore_worker_alloc( - wg_packet_handshake_receive_worker, wg); - if (!wg->incoming_handshakes_worker) - goto err_free_tstats; - wg->handshake_receive_wq = alloc_workqueue("wg-kex-%s", WQ_CPU_INTENSIVE | WQ_FREEZABLE, 0, dev->name); if (!wg->handshake_receive_wq) - goto err_free_incoming_handshakes; + goto err_free_tstats; wg->handshake_send_wq = alloc_workqueue("wg-kex-%s", WQ_UNBOUND | WQ_FREEZABLE, 0, dev->name); @@ -347,10 +342,15 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, if (ret < 0) goto err_free_encrypt_queue; - ret = wg_ratelimiter_init(); + ret = wg_packet_queue_init(&wg->handshake_queue, wg_packet_handshake_receive_worker, + MAX_QUEUED_INCOMING_HANDSHAKES); if (ret < 0) goto err_free_decrypt_queue; + ret = wg_ratelimiter_init(); + if (ret < 0) + goto err_free_handshake_queue; + ret = register_netdevice(dev); if (ret < 0) goto err_uninit_ratelimiter; @@ -367,18 +367,18 @@ static int wg_newlink(struct net *src_net, struct net_device *dev, err_uninit_ratelimiter: wg_ratelimiter_uninit(); +err_free_handshake_queue: + wg_packet_queue_free(&wg->handshake_queue, false); 
err_free_decrypt_queue: - wg_packet_queue_free(&wg->decrypt_queue); + wg_packet_queue_free(&wg->decrypt_queue, false); err_free_encrypt_queue: - wg_packet_queue_free(&wg->encrypt_queue); + wg_packet_queue_free(&wg->encrypt_queue, false); err_destroy_packet_crypt: destroy_workqueue(wg->packet_crypt_wq); err_destroy_handshake_send: destroy_workqueue(wg->handshake_send_wq); err_destroy_handshake_receive: destroy_workqueue(wg->handshake_receive_wq); -err_free_incoming_handshakes: - free_percpu(wg->incoming_handshakes_worker); err_free_tstats: free_percpu(dev->tstats); err_free_index_hashtable: diff --git a/drivers/net/wireguard/device.h b/drivers/net/wireguard/device.h index 854bc3d97150..43c7cebbf50b 100644 --- a/drivers/net/wireguard/device.h +++ b/drivers/net/wireguard/device.h @@ -39,21 +39,18 @@ struct prev_queue { struct wg_device { struct net_device *dev; - struct crypt_queue encrypt_queue, decrypt_queue; + struct crypt_queue encrypt_queue, decrypt_queue, handshake_queue; struct sock __rcu *sock4, *sock6; struct net __rcu *creating_net; struct noise_static_identity static_identity; - struct workqueue_struct *handshake_receive_wq, *handshake_send_wq; - struct workqueue_struct *packet_crypt_wq; - struct sk_buff_head incoming_handshakes; - int incoming_handshake_cpu; - struct multicore_worker __percpu *incoming_handshakes_worker; + struct workqueue_struct *packet_crypt_wq,*handshake_receive_wq, *handshake_send_wq; struct cookie_checker cookie_checker; struct pubkey_hashtable *peer_hashtable; struct index_hashtable *index_hashtable; struct allowedips peer_allowedips; struct mutex device_update_lock, socket_update_lock; struct list_head device_list, peer_list; + atomic_t handshake_queue_len; unsigned int num_peers, device_update_gen; u32 fwmark; u16 incoming_port; diff --git a/drivers/net/wireguard/queueing.c b/drivers/net/wireguard/queueing.c index 48e7b982a307..1de413b19e34 100644 --- a/drivers/net/wireguard/queueing.c +++ b/drivers/net/wireguard/queueing.c @@ 
-38,11 +38,11 @@ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, return 0; } -void wg_packet_queue_free(struct crypt_queue *queue) +void wg_packet_queue_free(struct crypt_queue *queue, bool purge) { free_percpu(queue->worker); - WARN_ON(!__ptr_ring_empty(&queue->ring)); - ptr_ring_cleanup(&queue->ring, NULL); + WARN_ON(!purge && !__ptr_ring_empty(&queue->ring)); + ptr_ring_cleanup(&queue->ring, purge ? (void(*)(void*))kfree_skb : NULL); } #define NEXT(skb) ((skb)->prev) diff --git a/drivers/net/wireguard/queueing.h b/drivers/net/wireguard/queueing.h index 4ef2944a68bc..e2388107f7fd 100644 --- a/drivers/net/wireguard/queueing.h +++ b/drivers/net/wireguard/queueing.h @@ -23,7 +23,7 @@ struct sk_buff; /* queueing.c APIs: */ int wg_packet_queue_init(struct crypt_queue *queue, work_func_t function, unsigned int len); -void wg_packet_queue_free(struct crypt_queue *queue); +void wg_packet_queue_free(struct crypt_queue *queue, bool purge); struct multicore_worker __percpu * wg_packet_percpu_multicore_worker_alloc(work_func_t function, void *ptr); diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index 7dc84bcca261..f4e537e3e8ec 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -116,8 +116,8 @@ static void wg_receive_handshake_packet(struct wg_device *wg, return; } - under_load = skb_queue_len(&wg->incoming_handshakes) >= - MAX_QUEUED_INCOMING_HANDSHAKES / 8; + under_load = atomic_read(&wg->handshake_queue_len) >= + MAX_QUEUED_INCOMING_HANDSHAKES / 8; if (under_load) { last_under_load = ktime_get_coarse_boottime_ns(); } else if (last_under_load) { @@ -212,13 +212,14 @@ static void wg_receive_handshake_packet(struct wg_device *wg, void wg_packet_handshake_receive_worker(struct work_struct *work) { - struct wg_device *wg = container_of(work, struct multicore_worker, - work)->ptr; + struct crypt_queue *queue = container_of(work, struct multicore_worker, work)->ptr; + struct wg_device *wg 
= container_of(queue, struct wg_device, handshake_queue); struct sk_buff *skb; - while ((skb = skb_dequeue(&wg->incoming_handshakes)) != NULL) { + while ((skb = ptr_ring_consume_bh(&queue->ring)) != NULL) { wg_receive_handshake_packet(wg, skb); dev_kfree_skb(skb); + atomic_dec(&wg->handshake_queue_len); cond_resched(); } } @@ -554,21 +555,17 @@ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb) case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): case cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE): { int cpu; - - if (skb_queue_len(&wg->incoming_handshakes) > - MAX_QUEUED_INCOMING_HANDSHAKES || - unlikely(!rng_is_initialized())) { + if (unlikely(!rng_is_initialized() || + ptr_ring_produce_bh(&wg->handshake_queue.ring, skb))) { net_dbg_skb_ratelimited("%s: Dropping handshake packet from %pISpfsc\n", wg->dev->name, skb); goto err; } - skb_queue_tail(&wg->incoming_handshakes, skb); - /* Queues up a call to packet_process_queued_handshake_ - * packets(skb): - */ - cpu = wg_cpumask_next_online(&wg->incoming_handshake_cpu); + atomic_inc(&wg->handshake_queue_len); + cpu = wg_cpumask_next_online(&wg->handshake_queue.last_cpu); + /* Queues up a call to packet_process_queued_handshake_packets(skb): */ queue_work_on(cpu, wg->handshake_receive_wq, - &per_cpu_ptr(wg->incoming_handshakes_worker, cpu)->work); + &per_cpu_ptr(wg->handshake_queue.worker, cpu)->work); break; } case cpu_to_le32(MESSAGE_DATA): From fb32f4f606c17b869805d7cede8b03d78339b50a Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Mon, 29 Nov 2021 10:39:27 -0500 Subject: [PATCH 159/231] wireguard: receive: drop handshakes if queue lock is contended If we're being delivered packets from multiple CPUs so quickly that the ring lock is contended for CPU tries, then it's safe to assume that the queue is near capacity anyway, so just drop the packet rather than spinning. This helps deal with multicore DoS that can interfere with data path performance. 
It _still_ does not completely fix the issue, but it again chips away at it. Reported-by: Streun Fabio Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/receive.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireguard/receive.c b/drivers/net/wireguard/receive.c index f4e537e3e8ec..7b8df406c773 100644 --- a/drivers/net/wireguard/receive.c +++ b/drivers/net/wireguard/receive.c @@ -554,9 +554,19 @@ void wg_packet_receive(struct wg_device *wg, struct sk_buff *skb) case cpu_to_le32(MESSAGE_HANDSHAKE_INITIATION): case cpu_to_le32(MESSAGE_HANDSHAKE_RESPONSE): case cpu_to_le32(MESSAGE_HANDSHAKE_COOKIE): { - int cpu; - if (unlikely(!rng_is_initialized() || - ptr_ring_produce_bh(&wg->handshake_queue.ring, skb))) { + int cpu, ret = -EBUSY; + + if (unlikely(!rng_is_initialized())) + goto drop; + if (atomic_read(&wg->handshake_queue_len) > MAX_QUEUED_INCOMING_HANDSHAKES / 2) { + if (spin_trylock_bh(&wg->handshake_queue.ring.producer_lock)) { + ret = __ptr_ring_produce(&wg->handshake_queue.ring, skb); + spin_unlock_bh(&wg->handshake_queue.ring.producer_lock); + } + } else + ret = ptr_ring_produce_bh(&wg->handshake_queue.ring, skb); + if (ret) { + drop: net_dbg_skb_ratelimited("%s: Dropping handshake packet from %pISpfsc\n", wg->dev->name, skb); goto err; From 4e3fd721710553832460c179c2ee5ce67ef7f1e0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 29 Nov 2021 10:39:28 -0500 Subject: [PATCH 160/231] wireguard: ratelimiter: use kvcalloc() instead of kvzalloc() Use 2-factor argument form kvcalloc() instead of kvzalloc(). Link: https://github.com/KSPP/linux/issues/162 Fixes: e7096c131e51 ("net: WireGuard secure network tunnel") Signed-off-by: Gustavo A. R. 
Silva [Jason: Gustavo's link above is for KSPP, but this isn't actually a security fix, as table_size is bounded to 8192 anyway, and gcc realizes this, so the codegen comes out to be about the same.] Signed-off-by: Jason A. Donenfeld Signed-off-by: Jakub Kicinski --- drivers/net/wireguard/ratelimiter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireguard/ratelimiter.c b/drivers/net/wireguard/ratelimiter.c index 3fedd1d21f5e..dd55e5c26f46 100644 --- a/drivers/net/wireguard/ratelimiter.c +++ b/drivers/net/wireguard/ratelimiter.c @@ -176,12 +176,12 @@ int wg_ratelimiter_init(void) (1U << 14) / sizeof(struct hlist_head))); max_entries = table_size * 8; - table_v4 = kvzalloc(table_size * sizeof(*table_v4), GFP_KERNEL); + table_v4 = kvcalloc(table_size, sizeof(*table_v4), GFP_KERNEL); if (unlikely(!table_v4)) goto err_kmemcache; #if IS_ENABLED(CONFIG_IPV6) - table_v6 = kvzalloc(table_size * sizeof(*table_v6), GFP_KERNEL); + table_v6 = kvcalloc(table_size, sizeof(*table_v6), GFP_KERNEL); if (unlikely(!table_v6)) { kvfree(table_v4); goto err_kmemcache; From f7e5b9bfa6c8820407b64eabc1f29c9a87e8993d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 29 Nov 2021 10:39:29 -0500 Subject: [PATCH 161/231] siphash: use _unaligned version by default On ARM v6 and later, we define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS because the ordinary load/store instructions (ldr, ldrh, ldrb) can tolerate any misalignment of the memory address. However, load/store double and load/store multiple instructions (ldrd, ldm) may still only be used on memory addresses that are 32-bit aligned, and so we have to use the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS macro with care, or we may end up with a severe performance hit due to alignment traps that require fixups by the kernel. Testing shows that this currently happens with clang-13 but not gcc-11. 
In theory, any compiler version can produce this bug or other problems, as we are dealing with undefined behavior in C99 even on architectures that support this in hardware, see also https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100363. Fortunately, the get_unaligned() accessors do the right thing: when building for ARMv6 or later, the compiler will emit unaligned accesses using the ordinary load/store instructions (but avoid the ones that require 32-bit alignment). When building for older ARM, those accessors will emit the appropriate sequence of ldrb/mov/orr instructions. And on architectures that can truly tolerate any kind of misalignment, the get_unaligned() accessors resolve to the leXX_to_cpup accessors that operate on aligned addresses. Since the compiler will in fact emit ldrd or ldm instructions when building this code for ARM v6 or later, the solution is to use the unaligned accessors unconditionally on architectures where this is known to be fast. The _aligned version of the hash function is however still needed to get the best performance on architectures that cannot do any unaligned access in hardware. This new version avoids the undefined behavior and should produce the fastest hash on all architectures we support. Link: https://lore.kernel.org/linux-arm-kernel/20181008211554.5355-4-ard.biesheuvel@linaro.org/ Link: https://lore.kernel.org/linux-crypto/CAK8P3a2KfmmGDbVHULWevB0hv71P2oi2ZCHEAqT=8dQfa0=cqQ@mail.gmail.com/ Reported-by: Ard Biesheuvel Fixes: 2c956a60778c ("siphash: add cryptographically secure PRF") Signed-off-by: Arnd Bergmann Reviewed-by: Jason A. Donenfeld Acked-by: Ard Biesheuvel Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Jakub Kicinski --- include/linux/siphash.h | 14 ++++---------- lib/siphash.c | 12 ++++++------ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/linux/siphash.h b/include/linux/siphash.h index bf21591a9e5e..0cda61855d90 100644 --- a/include/linux/siphash.h +++ b/include/linux/siphash.h @@ -27,9 +27,7 @@ static inline bool siphash_key_is_zero(const siphash_key_t *key) } u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); -#endif u64 siphash_1u64(const u64 a, const siphash_key_t *key); u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); @@ -82,10 +80,9 @@ static inline u64 ___siphash_aligned(const __le64 *data, size_t len, static inline u64 siphash(const void *data, size_t len, const siphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + !IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) return __siphash_unaligned(data, len, key); -#endif return ___siphash_aligned(data, len, key); } @@ -96,10 +93,8 @@ typedef struct { u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key); -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key); -#endif u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); @@ -135,10 +130,9 @@ static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, static inline u32 hsiphash(const void *data, size_t len, const hsiphash_key_t *key) { -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + 
!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) return __hsiphash_unaligned(data, len, key); -#endif return ___hsiphash_aligned(data, len, key); } diff --git a/lib/siphash.c b/lib/siphash.c index a90112ee72a1..72b9068ab57b 100644 --- a/lib/siphash.c +++ b/lib/siphash.c @@ -49,6 +49,7 @@ SIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -80,8 +81,8 @@ u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -113,7 +114,6 @@ u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) POSTAMBLE } EXPORT_SYMBOL(__siphash_unaligned); -#endif /** * siphash_1u64 - compute 64-bit siphash PRF value of a u64 @@ -250,6 +250,7 @@ EXPORT_SYMBOL(siphash_3u32); HSIPROUND; \ return (v0 ^ v1) ^ (v2 ^ v3); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u64)); @@ -280,8 +281,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -313,7 +314,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 @@ -418,6 +418,7 @@ EXPORT_SYMBOL(hsiphash_4u32); HSIPROUND; \ return v1 ^ v3; +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_aligned(const void *data, size_t 
len, const hsiphash_key_t *key) { const u8 *end = data + len - (len % sizeof(u32)); @@ -438,8 +439,8 @@ u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_aligned); +#endif -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS u32 __hsiphash_unaligned(const void *data, size_t len, const hsiphash_key_t *key) { @@ -461,7 +462,6 @@ u32 __hsiphash_unaligned(const void *data, size_t len, HPOSTAMBLE } EXPORT_SYMBOL(__hsiphash_unaligned); -#endif /** * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 From 1a59c9c55585e1ec5b352d31b3f8402f196eae94 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 29 Nov 2021 15:16:52 +0000 Subject: [PATCH 162/231] net: mscc: ocelot: fix missing unlock on error in ocelot_hwstamp_set() Add the missing mutex_unlock before return from function ocelot_hwstamp_set() in the ocelot_setup_ptp_traps() error handling case. Fixes: 96ca08c05838 ("net: mscc: ocelot: set up traps for PTP packets") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20211129151652.1165433-1-weiyongjun1@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 409cde1e59c6..1e4ad953cffb 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -1563,8 +1563,10 @@ int ocelot_hwstamp_set(struct ocelot *ocelot, int port, struct ifreq *ifr) } err = ocelot_setup_ptp_traps(ocelot, port, l2, l4); - if (err) + if (err) { + mutex_unlock(&ocelot->ptp_lock); return err; + } if (l2 && l4) cfg.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; From 7533377215b6ee432c06c5855f6be5d66e694e46 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 20 Nov 2021 01:50:08 +0000 Subject: [PATCH 163/231] KVM: x86/mmu: Use yield-safe TDP MMU root iter in MMU notifier 
unmapping Use the yield-safe variant of the TDP MMU iterator when handling an unmapping event from the MMU notifier, as most occurrences of the event allow yielding. Fixes: e1eed5847b09 ("KVM: x86/mmu: Allow yielding during MMU notifier unmap/zap, if possible") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-Id: <20211120015008.3780032-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 1f8c9f783b78..4cd6bf7e73f0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1031,7 +1031,7 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range, { struct kvm_mmu_page *root; - for_each_tdp_mmu_root(kvm, root, range->slot->as_id) + for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false) flush = zap_gfn_range(kvm, root, range->start, range->end, range->may_block, flush, false); From 4b85c921cd393764d22c0cdab6d7d5d120aa0980 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 20 Nov 2021 04:50:21 +0000 Subject: [PATCH 164/231] KVM: x86/mmu: Remove spurious TLB flushes in TDP MMU zap collapsible path Drop the "flush" param and return values to/from the TDP MMU's helper for zapping collapsible SPTEs. Because the helper runs with mmu_lock held for read, not write, it uses tdp_mmu_zap_spte_atomic(), and the atomic zap handles the necessary remote TLB flush. Similarly, because mmu_lock is dropped and re-acquired between zapping legacy MMUs and zapping TDP MMUs, kvm_mmu_zap_collapsible_sptes() must handle remote TLB flushes from the legacy MMU before calling into the TDP MMU.
Fixes: e2209710ccc5d ("KVM: x86/mmu: Skip rmap operations if rmaps not allocated") Signed-off-by: Sean Christopherson Message-Id: <20211120045046.3940942-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 9 ++------- arch/x86/kvm/mmu/tdp_mmu.c | 22 +++++++--------------- arch/x86/kvm/mmu/tdp_mmu.h | 5 ++--- 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5942e9c6dd6e..1b3a7cc9d595 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5848,8 +5848,6 @@ restart: void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *slot) { - bool flush; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* @@ -5857,17 +5855,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, * logging at a 4k granularity and never creates collapsible * 2m SPTEs during dirty logging. */ - flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true); - if (flush) + if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true)) kvm_arch_flush_remote_tlbs_memslot(kvm, slot); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, false); - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, slot); + kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot); read_unlock(&kvm->mmu_lock); } } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 4cd6bf7e73f0..1db8496259ad 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1362,10 +1362,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, * Clear leaf entries which could be replaced by large mappings, for * GFNs within the slot. 
*/ -static bool zap_collapsible_spte_range(struct kvm *kvm, +static void zap_collapsible_spte_range(struct kvm *kvm, struct kvm_mmu_page *root, - const struct kvm_memory_slot *slot, - bool flush) + const struct kvm_memory_slot *slot) { gfn_t start = slot->base_gfn; gfn_t end = start + slot->npages; @@ -1376,10 +1375,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm, tdp_root_for_each_pte(iter, root, start, end) { retry: - if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) { - flush = false; + if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true)) continue; - } if (!is_shadow_present_pte(iter.old_spte) || !is_last_spte(iter.old_spte, iter.level)) @@ -1391,6 +1388,7 @@ retry: pfn, PG_LEVEL_NUM)) continue; + /* Note, a successful atomic zap also does a remote TLB flush. */ if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) { /* * The iter must explicitly re-read the SPTE because @@ -1399,30 +1397,24 @@ retry: iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep)); goto retry; } - flush = true; } rcu_read_unlock(); - - return flush; } /* * Clear non-leaf entries (and free associated page tables) which could * be replaced by large mappings, for GFNs within the slot. 
*/ -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush) +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; lockdep_assert_held_read(&kvm->mmu_lock); for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true) - flush = zap_collapsible_spte_range(kvm, root, slot, flush); - - return flush; + zap_collapsible_spte_range(kvm, root, slot); } /* diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 476b133544dd..3899004a5d91 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, bool wrprot); -bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *slot, - bool flush); +void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, + const struct kvm_memory_slot *slot); bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, From 28f091bc2f8c23b7eac2402956b692621be7f9f4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 13:01:37 -0500 Subject: [PATCH 165/231] KVM: MMU: shadow nested paging does not have PKU Initialize the mask for PKU permissions as if CR4.PKE=0, avoiding incorrect interpretations of the nested hypervisor's page tables. 
Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 1b3a7cc9d595..0e017a3b7c27 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4855,7 +4855,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, struct kvm_mmu *context = &vcpu->arch.guest_mmu; struct kvm_mmu_role_regs regs = { .cr0 = cr0, - .cr4 = cr4, + .cr4 = cr4 & ~X86_CR4_PKE, .efer = efer, }; union kvm_mmu_role new_role; @@ -4919,7 +4919,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->direct_map = false; update_permission_bitmask(context, true); - update_pkru_bitmask(context); + context->pkru_mask = 0; reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } From f47491d7f30b8ab084d0b1596697a7ea4561a894 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 20 Nov 2021 01:57:06 +0000 Subject: [PATCH 166/231] KVM: x86/mmu: Handle "default" period when selectively waking kthread Account for the '0' being a default, "let KVM choose" period, when determining whether or not the recovery worker needs to be awakened in response to userspace reducing the period. Failure to do so results in the worker not being awakened properly, e.g. when changing the period from '0' to any small-ish value. 
Fixes: 4dfe4f40d845 ("kvm: x86: mmu: Make NX huge page recovery period configurable") Cc: stable@vger.kernel.org Cc: Junaid Shahid Signed-off-by: Sean Christopherson Message-Id: <20211120015706.3830341-1-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 48 +++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 0e017a3b7c27..6354297e92ae 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6171,23 +6171,46 @@ void kvm_mmu_module_exit(void) mmu_audit_disable(); } +/* + * Calculate the effective recovery period, accounting for '0' meaning "let KVM + * select a halving time of 1 hour". Returns true if recovery is enabled. + */ +static bool calc_nx_huge_pages_recovery_period(uint *period) +{ + /* + * Use READ_ONCE to get the params, this may be called outside of the + * param setters, e.g. by the kthread to compute its next timeout. + */ + bool enabled = READ_ONCE(nx_huge_pages); + uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + + if (!enabled || !ratio) + return false; + + *period = READ_ONCE(nx_huge_pages_recovery_period_ms); + if (!*period) { + /* Make sure the period is not less than one second. 
*/ + ratio = min(ratio, 3600u); + *period = 60 * 60 * 1000 / ratio; + } + return true; +} + static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp) { bool was_recovery_enabled, is_recovery_enabled; uint old_period, new_period; int err; - was_recovery_enabled = nx_huge_pages_recovery_ratio; - old_period = nx_huge_pages_recovery_period_ms; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); if (err) return err; - is_recovery_enabled = nx_huge_pages_recovery_ratio; - new_period = nx_huge_pages_recovery_period_ms; + is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period); - if (READ_ONCE(nx_huge_pages) && is_recovery_enabled && + if (is_recovery_enabled && (!was_recovery_enabled || old_period > new_period)) { struct kvm *kvm; @@ -6251,18 +6274,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) static long get_nx_lpage_recovery_timeout(u64 start_time) { - uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio); - uint period = READ_ONCE(nx_huge_pages_recovery_period_ms); + bool enabled; + uint period; - if (!period && ratio) { - /* Make sure the period is not less than one second. */ - ratio = min(ratio, 3600u); - period = 60 * 60 * 1000 / ratio; - } + enabled = calc_nx_huge_pages_recovery_period(&period); - return READ_ONCE(nx_huge_pages) && ratio - ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; + return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; } static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) From 81835ee113e92683160030fe3328f3c3187a92c2 Mon Sep 17 00:00:00 2001 From: "Maciej S. 
Szmigiero" Date: Fri, 26 Nov 2021 16:28:31 +0100 Subject: [PATCH 167/231] KVM: selftests: page_table_test: fix calculation of guest_test_phys_mem A kvm_page_table_test run with its default settings fails on VMX due to memory region add failure: > ==== Test Assertion Failure ==== > lib/kvm_util.c:952: ret == 0 > pid=10538 tid=10538 errno=17 - File exists > 1 0x00000000004057d1: vm_userspace_mem_region_add at kvm_util.c:947 > 2 0x0000000000401ee9: pre_init_before_test at kvm_page_table_test.c:302 > 3 (inlined by) run_test at kvm_page_table_test.c:374 > 4 0x0000000000409754: for_each_guest_mode at guest_modes.c:53 > 5 0x0000000000401860: main at kvm_page_table_test.c:500 > 6 0x00007f82ae2d8554: ?? ??:0 > 7 0x0000000000401894: _start at ??:? > KVM_SET_USER_MEMORY_REGION IOCTL failed, > rc: -1 errno: 17 > slot: 1 flags: 0x0 > guest_phys_addr: 0xc0000000 size: 0x40000000 This is because the memory range that this test is trying to add (0x0c0000000 - 0x100000000) conflicts with LAPIC mapping at 0x0fee00000. Looking at the code it seems that guest_test_*phys*_mem variable gets mistakenly overwritten with guest_test_*virt*_mem while trying to adjust the former for alignment. With the correct variable adjusted this test runs successfully. Signed-off-by: Maciej S. 
Szmigiero Message-Id: <52e487458c3172923549bbcf9dfccfbe6faea60b.1637940473.git.maciej.szmigiero@oracle.com> Signed-off-by: Paolo Bonzini --- tools/testing/selftests/kvm/kvm_page_table_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 3836322add00..ba1fdc3dcf4a 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -280,7 +280,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) #ifdef __s390x__ alignment = max(0x100000, alignment); #endif - guest_test_phys_mem = align_down(guest_test_virt_mem, alignment); + guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); /* Set up the shared data structure test_args */ test_args.vm = vm; From 7e1901f6c86c896acff6609e0176f93f756d8b2a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:43:09 -0500 Subject: [PATCH 168/231] KVM: VMX: prepare sync_pir_to_irr for running with APICv disabled If APICv is disabled for this vCPU, assigned devices may still attempt to post interrupts. In that case, we need to cancel the vmentry and deliver the interrupt with KVM_REQ_EVENT. Extend the existing code that handles injection of L1 interrupts into L2 to cover this case as well. vmx_hwapic_irr_update is only called when APICv is active so it would be confusing to add a check for vcpu->arch.apicv_active in there. Instead, just use vmx_set_rvi directly in vmx_sync_pir_to_irr. 
Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Reviewed-by: David Matlack Reviewed-by: Sean Christopherson Message-Id: <20211123004311.2954158-3-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 18971cfadd4f..1fadec8cbf96 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6267,9 +6267,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; - bool max_irr_updated; + bool got_posted_interrupt; - if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + if (KVM_BUG_ON(!enable_apicv, vcpu->kvm)) return -EIO; if (pi_test_on(&vmx->pi_desc)) { @@ -6279,22 +6279,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - max_irr_updated = + got_posted_interrupt = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); - - /* - * If we are running L2 and L1 has a new pending interrupt - * which can be injected, this may cause a vmexit or it may - * be injected into L2. Either way, this interrupt will be - * processed via KVM_REQ_EVENT, not RVI, because we do not use - * virtual interrupt delivery to inject L1 interrupts into L2. - */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_make_request(KVM_REQ_EVENT, vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); + got_posted_interrupt = false; } - vmx_hwapic_irr_update(vcpu, max_irr); + + /* + * Newly recognized interrupts are injected via either virtual interrupt + * delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is + * disabled in two cases: + * + * 1) If L2 is running and the vCPU has a new pending interrupt. If L1 + * wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a + * VM-Exit to L1. 
If L1 doesn't want to exit, the interrupt is injected + * into L2, but KVM doesn't use virtual interrupt delivery to inject + * interrupts into L2, and so KVM_REQ_EVENT is again needed. + * + * 2) If APICv is disabled for this vCPU, assigned devices may still + * attempt to post interrupts. The posted interrupt vector will cause + * a VM-Exit and the subsequent entry will call sync_pir_to_irr. + */ + if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu)) + vmx_set_rvi(max_irr); + else if (got_posted_interrupt) + kvm_make_request(KVM_REQ_EVENT, vcpu); + return max_irr; } From 37c4dbf337c5c2cdb24365ffae6ed70ac1e74d7a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:43:10 -0500 Subject: [PATCH 169/231] KVM: x86: check PIR even for vCPUs with disabled APICv The IRTE for an assigned device can trigger a POSTED_INTR_VECTOR even if APICv is disabled on the vCPU that receives it. In that case, the interrupt will just cause a vmexit and leave the ON bit set together with the PIR bit corresponding to the interrupt. Right now, the interrupt would not be delivered until APICv is re-enabled. However, fixing this is just a matter of always doing the PIR->IRR synchronization, even if the vCPU has temporarily disabled APICv. This is not a problem for performance, or if anything it is an improvement. First, in the common case where vcpu->arch.apicv_active is true, one fewer check has to be performed. Second, static_call_cond will elide the function call if APICv is not present or disabled. Finally, in the case for AMD hardware we can remove the sync_pir_to_irr callback: it is only needed for apic_has_interrupt_for_ppr, and that function already has a fallback for !APICv. 
Cc: stable@vger.kernel.org Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: David Matlack Message-Id: <20211123004311.2954158-4-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm/svm.c | 1 - arch/x86/kvm/x86.c | 18 +++++++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 759952dd1222..f206fc35deff 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (apic->vcpu->arch.apicv_active) + if (kvm_x86_ops.sync_pir_to_irr) highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5630c241d5f6..d0f68d11ec70 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4651,7 +4651,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, - .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 817898eab7c3..0ee1a039b490 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4472,8 +4472,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -9571,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, 
vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -9842,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * This handles the case where a posted interrupt was - * notified with kvm_vcpu_kick. + * notified with kvm_vcpu_kick. Assigned devices can + * use the POSTED_INTR_VECTOR even if APICv is disabled, + * so do it even if APICv is disabled on this vCPU. */ - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; @@ -9889,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) break; - if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - static_call(kvm_x86_sync_pir_to_irr)(vcpu); + if (kvm_lapic_enabled(vcpu)) + static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); if (unlikely(kvm_vcpu_exit_request(vcpu))) { exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; From 53b7ca1a359389276c76fbc9e1009d8626a17e40 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:43:11 -0500 Subject: [PATCH 170/231] KVM: x86: Use a stable condition around all VT-d PI paths Currently, checks for whether VT-d PI can be used refer to the current status of the feature in the current vCPU; or they more or less pick vCPU 0 in case a specific vCPU is not available. However, these checks do not attempt to synchronize with changes to the IRTE. In particular, there is no path that updates the IRTE when APICv is re-activated on vCPU 0; and there is no path to wakeup a CPU that has APICv disabled, if the wakeup occurs because of an IRTE that points to a posted interrupt. 
To fix this, always go through the VT-d PI path as long as there are assigned devices and APICv is available on both the host and the VM side. Since the relevant condition was copied over three times, take the hint and factor it into a separate function. Suggested-by: Sean Christopherson Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Reviewed-by: Maxim Levitsky Reviewed-by: David Matlack Message-Id: <20211123004311.2954158-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/posted_intr.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 5f81ef092bd4..1c94783b5a54 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -5,6 +5,7 @@ #include #include "lapic.h" +#include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" @@ -77,13 +78,18 @@ after_clear_sn: pi_set_on(pi_desc); } +static bool vmx_can_use_vtd_pi(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm) && enable_apicv && + kvm_arch_has_assigned_device(kvm) && + irq_remapping_cap(IRQ_POSTING_CAP); +} + void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return; /* Set SN when the vCPU is preempted */ @@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu) struct pi_desc old, new; struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); - if (!kvm_arch_has_assigned_device(vcpu->kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(vcpu)) + if (!vmx_can_use_vtd_pi(vcpu->kvm)) return 0; WARN_ON(irqs_disabled()); @@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct vcpu_data vcpu_info; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - 
!irq_remapping_cap(IRQ_POSTING_CAP) || - !kvm_vcpu_apicv_active(kvm->vcpus[0])) + if (!vmx_can_use_vtd_pi(kvm)) return 0; idx = srcu_read_lock(&kvm->irq_srcu); From 4674164f0ac5fd553c38b2b8c49fe13297fed38b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:28 -0500 Subject: [PATCH 171/231] KVM: SEV: do not use list_replace_init on an empty list list_replace_init cannot be used if the source is an empty list, because "new->next->prev = new" will overwrite "old->next":

                          new                       old
                          prev = new, next = new    prev = old, next = old
new->next = old->next     prev = new, next = old    prev = old, next = old
new->next->prev = new     prev = new, next = old    prev = old, next = new
new->prev = old->prev     prev = old, next = old    prev = old, next = old
new->next->prev = new     prev = old, next = old    prev = new, next = new

The desired outcome instead would be to leave both old and new the same as they were (two empty circular lists). Use list_cut_before, which already has the necessary check and is documented to discard the previous contents of the list that will hold the result.
Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-5-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 21ac0a5de4e0..75955beb3770 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1613,8 +1613,7 @@ static void sev_migrate_from(struct kvm_sev_info *dst, src->handle = 0; src->pages_locked = 0; - INIT_LIST_HEAD(&dst->regions_list); - list_replace_init(&src->regions_list, &dst->regions_list); + list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) From 501b580c02339a83917cf3b44c445f2419b15dcb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:29 -0500 Subject: [PATCH 172/231] KVM: SEV: cleanup locking for KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM Encapsulate the handling of the migration_in_progress flag for both VMs in two functions sev_lock_two_vms and sev_unlock_two_vms. It does not matter if KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM locks the destination struct kvm a bit later, and this change 1) keeps the cleanup chain of labels smaller 2) makes it possible for KVM_CAP_VM_COPY_ENC_CONTEXT_FROM to reuse the logic. 
Cc: Peter Gonda Cc: Sean Christopherson Message-Id: <20211123005036.2954379-6-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 53 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75955beb3770..8902b018fc18 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1543,28 +1543,40 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id) return false; } -static int sev_lock_for_migration(struct kvm *kvm) +static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + + if (dst_kvm == src_kvm) + return -EINVAL; /* - * Bail if this VM is already involved in a migration to avoid deadlock - * between two VMs trying to migrate to/from each other. + * Bail if these VMs are already involved in a migration to avoid + * deadlock between two VMs trying to migrate to/from each other. 
*/ - if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - mutex_lock(&kvm->lock); + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) { + atomic_set_release(&dst_sev->migration_in_progress, 0); + return -EBUSY; + } + mutex_lock(&dst_kvm->lock); + mutex_lock(&src_kvm->lock); return 0; } -static void sev_unlock_after_migration(struct kvm *kvm) +static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; + struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; - mutex_unlock(&kvm->lock); - atomic_set_release(&sev->migration_in_progress, 0); + mutex_unlock(&dst_kvm->lock); + mutex_unlock(&src_kvm->lock); + atomic_set_release(&dst_sev->migration_in_progress, 0); + atomic_set_release(&src_sev->migration_in_progress, 0); } @@ -1665,15 +1677,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) bool charged = false; int ret; - ret = sev_lock_for_migration(kvm); - if (ret) - return ret; - - if (sev_guest(kvm)) { - ret = -EINVAL; - goto out_unlock; - } - source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; @@ -1681,13 +1684,13 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } source_kvm = source_kvm_file->private_data; - ret = sev_lock_for_migration(source_kvm); + ret = sev_lock_two_vms(kvm, source_kvm); if (ret) goto out_fput; - if (!sev_guest(source_kvm)) { + if (sev_guest(kvm) || !sev_guest(source_kvm)) { ret = -EINVAL; - goto out_source; + goto out_unlock; } src_sev = &to_kvm_svm(source_kvm)->sev_info; @@ -1727,13 +1730,11 @@ out_dst_cgroup: sev_misc_cg_uncharge(cg_cleanup_sev); put_misc_cg(cg_cleanup_sev->misc_cg); cg_cleanup_sev->misc_cg = NULL; -out_source: - sev_unlock_after_migration(source_kvm); +out_unlock: + sev_unlock_two_vms(kvm, 
source_kvm); out_fput: if (source_kvm_file) fput(source_kvm_file); -out_unlock: - sev_unlock_after_migration(kvm); return ret; } From 2b347a387811cb4aa7bcdb96e9203c5019a6fb41 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:30 -0500 Subject: [PATCH 173/231] KVM: SEV: initialize regions_list of a mirror VM This was broken before the introduction of KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, but technically harmless because the region list was unused for a mirror VM. However, it is untidy and it now causes a NULL pointer access when attempting to move the encryption context of a mirror VM. Fixes: 54526d1fd593 ("KVM: x86: Support KVM VMs sharing SEV context") Message-Id: <20211123005036.2954379-7-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8902b018fc18..8daabc3dc079 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2007,6 +2007,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) mirror_sev->fd = source_sev.fd; mirror_sev->es_active = source_sev.es_active; mirror_sev->handle = source_sev.handle; + INIT_LIST_HEAD(&mirror_sev->regions_list); /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different From 642525e3bd474dc50b7d0e8ee9c966b97e4be3ac Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:31 -0500 Subject: [PATCH 174/231] KVM: SEV: move mirror status to destination of KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM Allow intra-host migration of a mirror VM; the destination VM will be a mirror of the same ASID as the source. 
Fixes: b56639318bb2 ("KVM: SEV: Add support for SEV intra host migration") Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-8-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 8daabc3dc079..74b6459b5fb2 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1619,11 +1619,13 @@ static void sev_migrate_from(struct kvm_sev_info *dst, dst->asid = src->asid; dst->handle = src->handle; dst->pages_locked = src->pages_locked; + dst->enc_context_owner = src->enc_context_owner; src->asid = 0; src->active = false; src->handle = 0; src->pages_locked = 0; + src->enc_context_owner = NULL; list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list); } From dc79c9f4eb6b4f4584ba0f6f334b907283ed4b6c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:32 -0500 Subject: [PATCH 175/231] selftests: sev_migrate_tests: add tests for KVM_CAP_VM_COPY_ENC_CONTEXT_FROM I am putting the tests in sev_migrate_tests because the failure conditions are very similar and some of the setup code can be reused, too. The tests cover both successful creation of a mirror VM, and error conditions. 
Cc: Peter Gonda Cc: Sean Christopherson Message-Id: <20211123005036.2954379-9-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- .../selftests/kvm/x86_64/sev_migrate_tests.c | 112 ++++++++++++++++-- 1 file changed, 105 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index 0cd7e2eaa895..d265cea5de85 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -54,12 +54,15 @@ static struct kvm_vm *sev_vm_create(bool es) return vm; } -static struct kvm_vm *__vm_create(void) +static struct kvm_vm *aux_vm_create(bool with_vcpus) { struct kvm_vm *vm; int i; vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + if (!with_vcpus) + return vm; + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) vm_vcpu_add(vm, i); @@ -93,7 +96,7 @@ static void test_sev_migrate_from(bool es) src_vm = sev_vm_create(es); for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) - dst_vms[i] = __vm_create(); + dst_vms[i] = aux_vm_create(true); /* Initial migration from the src to the first dst. 
*/ sev_migrate_from(dst_vms[0]->fd, src_vm->fd); @@ -162,7 +165,7 @@ static void test_sev_migrate_parameters(void) sev_vm = sev_vm_create(/* es= */ false); sev_es_vm = sev_vm_create(/* es= */ true); vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); - vm_no_sev = __vm_create(); + vm_no_sev = aux_vm_create(true); sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); vm_vcpu_add(sev_es_vm_no_vmsa, 1); @@ -203,11 +206,106 @@ static void test_sev_migrate_parameters(void) kvm_vm_free(vm_no_sev); } +static int __sev_mirror_create(int dst_fd, int src_fd) +{ + struct kvm_enable_cap cap = { + .cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM, + .args = { src_fd } + }; + + return ioctl(dst_fd, KVM_ENABLE_CAP, &cap); +} + + +static void sev_mirror_create(int dst_fd, int src_fd) +{ + int ret; + + ret = __sev_mirror_create(dst_fd, src_fd); + TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno); +} + +static void test_sev_mirror(bool es) +{ + struct kvm_vm *src_vm, *dst_vm; + struct kvm_sev_launch_start start = { + .policy = es ? SEV_POLICY_ES : 0 + }; + int i; + + src_vm = sev_vm_create(es); + dst_vm = aux_vm_create(false); + + sev_mirror_create(dst_vm->fd, src_vm->fd); + + /* Check that we can complete creation of the mirror VM. 
*/ + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + vm_vcpu_add(dst_vm, i); + sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_START, &start); + if (es) + sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + + kvm_vm_free(src_vm); + kvm_vm_free(dst_vm); +} + +static void test_sev_mirror_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); + vm_with_vcpu = aux_vm_create(true); + vm_no_vcpu = aux_vm_create(false); + + ret = __sev_mirror_create(sev_vm->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to self. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_mirror_create(sev_es_vm->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Copy context requires SEV enabled. ret %d, errno: %d\n", ret, + errno); + + ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV copy context requires no vCPUS on the destination. 
ret: %d, errno: %d\n", + ret, errno); + + kvm_vm_free(sev_vm); + kvm_vm_free(sev_es_vm); + kvm_vm_free(vm_with_vcpu); + kvm_vm_free(vm_no_vcpu); +} + int main(int argc, char *argv[]) { - test_sev_migrate_from(/* es= */ false); - test_sev_migrate_from(/* es= */ true); - test_sev_migrate_locking(); - test_sev_migrate_parameters(); + if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { + test_sev_migrate_from(/* es= */ false); + test_sev_migrate_from(/* es= */ true); + test_sev_migrate_locking(); + test_sev_migrate_parameters(); + } + if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { + test_sev_mirror(/* es= */ false); + test_sev_mirror(/* es= */ true); + test_sev_mirror_parameters(); + } return 0; } From bf42b02b19e27d6849852a41dd734af4c05e73c6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:33 -0500 Subject: [PATCH 176/231] KVM: SEV: Do COPY_ENC_CONTEXT_FROM with both VMs locked Now that we have a facility to lock two VMs with deadlock protection, use it for the creation of mirror VMs as well. One of COPY_ENC_CONTEXT_FROM(dst, src) and COPY_ENC_CONTEXT_FROM(src, dst) would always fail, so the combination is nonsensical and it is okay to return -EBUSY if it is attempted. This sidesteps the question of what happens if a VM is MOVE_ENC_CONTEXT_FROM'd at the same time as it is COPY_ENC_CONTEXT_FROM'd: the locking prevents that from happening. 
Cc: Peter Gonda Cc: Sean Christopherson Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-10-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 66 +++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 74b6459b5fb2..025d9731b66c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1955,77 +1955,59 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) { struct file *source_kvm_file; struct kvm *source_kvm; - struct kvm_sev_info source_sev, *mirror_sev; + struct kvm_sev_info *source_sev, *mirror_sev; int ret; source_kvm_file = fget(source_fd); if (!file_is_kvm(source_kvm_file)) { ret = -EBADF; - goto e_source_put; + goto e_source_fput; } source_kvm = source_kvm_file->private_data; - mutex_lock(&source_kvm->lock); + ret = sev_lock_two_vms(kvm, source_kvm); + if (ret) + goto e_source_fput; - if (!sev_guest(source_kvm)) { + /* + * Mirrors of mirrors should work, but let's not get silly. Also + * disallow out-of-band SEV/SEV-ES init if the target is already an + * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being + * created after SEV/SEV-ES initialization, e.g. to init intercepts. 
+ */ + if (sev_guest(kvm) || !sev_guest(source_kvm) || + is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) { ret = -EINVAL; - goto e_source_unlock; + goto e_unlock; } - /* Mirrors of mirrors should work, but let's not get silly */ - if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) { - ret = -EINVAL; - goto e_source_unlock; - } - - memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info, - sizeof(source_sev)); - /* * The mirror kvm holds an enc_context_owner ref so its asid can't * disappear until we're done with it */ + source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); - fput(source_kvm_file); - mutex_unlock(&source_kvm->lock); - mutex_lock(&kvm->lock); - - /* - * Disallow out-of-band SEV/SEV-ES init if the target is already an - * SEV guest, or if vCPUs have been created. KVM relies on vCPUs being - * created after SEV/SEV-ES initialization, e.g. to init intercepts. - */ - if (sev_guest(kvm) || kvm->created_vcpus) { - ret = -EINVAL; - goto e_mirror_unlock; - } - /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; mirror_sev->enc_context_owner = source_kvm; mirror_sev->active = true; - mirror_sev->asid = source_sev.asid; - mirror_sev->fd = source_sev.fd; - mirror_sev->es_active = source_sev.es_active; - mirror_sev->handle = source_sev.handle; + mirror_sev->asid = source_sev->asid; + mirror_sev->fd = source_sev->fd; + mirror_sev->es_active = source_sev->es_active; + mirror_sev->handle = source_sev->handle; INIT_LIST_HEAD(&mirror_sev->regions_list); + ret = 0; + /* * Do not copy ap_jump_table. Since the mirror does not share the same * KVM contexts as the original, and they may have different * memory-views. 
*/ - mutex_unlock(&kvm->lock); - return 0; - -e_mirror_unlock: - mutex_unlock(&kvm->lock); - kvm_put_kvm(source_kvm); - return ret; -e_source_unlock: - mutex_unlock(&source_kvm->lock); -e_source_put: +e_unlock: + sev_unlock_two_vms(kvm, source_kvm); +e_source_fput: if (source_kvm_file) fput(source_kvm_file); return ret; From 17d44a96f000fe1040d4ba1c34e458c63be6b7ce Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:34 -0500 Subject: [PATCH 177/231] KVM: SEV: Prohibit migration of a VM that has mirrors VMs that mirror an encryption context rely on the owner to keep the ASID allocated. Performing a KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM would cause a dangling ASID: 1. copy context from A to B (gets ref to A) 2. move context from A to L (moves ASID from A to L) 3. close L (releases ASID from L, B still references it) The right way to do the handoff instead is to create a fresh mirror VM on the destination first: 1. copy context from A to B (gets ref to A) [later] 2. close B (releases ref to A) 3. move context from A to L (moves ASID from A to L) 4. copy context from L to M So, catch the situation by adding a count of how many VMs are mirroring this one's encryption context. Fixes: 0b020f5af092 ("KVM: SEV: Add support for SEV-ES intra host migration") Message-Id: <20211123005036.2954379-11-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 22 ++++++++++- arch/x86/kvm/svm/svm.h | 1 + .../selftests/kvm/x86_64/sev_migrate_tests.c | 37 +++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 025d9731b66c..89a716290fac 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1696,6 +1696,16 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) } src_sev = &to_kvm_svm(source_kvm)->sev_info; + + /* + * VMs mirroring src's encryption context rely on it to keep the + * ASID allocated, but below we are clearing src_sev->asid. 
+ */ + if (src_sev->num_mirrored_vms) { + ret = -EBUSY; + goto out_unlock; + } + dst_sev->misc_cg = get_current_misc_cg(); cg_cleanup_sev = dst_sev; if (dst_sev->misc_cg != src_sev->misc_cg) { @@ -1987,6 +1997,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd) */ source_sev = &to_kvm_svm(source_kvm)->sev_info; kvm_get_kvm(source_kvm); + source_sev->num_mirrored_vms++; /* Set enc_context_owner and copy its encryption context over */ mirror_sev = &to_kvm_svm(kvm)->sev_info; @@ -2019,12 +2030,21 @@ void sev_vm_destroy(struct kvm *kvm) struct list_head *head = &sev->regions_list; struct list_head *pos, *q; + WARN_ON(sev->num_mirrored_vms); + if (!sev_guest(kvm)) return; /* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */ if (is_mirroring_enc_context(kvm)) { - kvm_put_kvm(sev->enc_context_owner); + struct kvm *owner_kvm = sev->enc_context_owner; + struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info; + + mutex_lock(&owner_kvm->lock); + if (!WARN_ON(!owner_sev->num_mirrored_vms)) + owner_sev->num_mirrored_vms--; + mutex_unlock(&owner_kvm->lock); + kvm_put_kvm(owner_kvm); return; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5faad3dc10e2..1c7306c370fa 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -79,6 +79,7 @@ struct kvm_sev_info { struct list_head regions_list; /* List of registered regions */ u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ + unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ atomic_t migration_in_progress; }; diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c index d265cea5de85..29b18d565cf4 100644 --- a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ 
-294,6 +294,41 @@ static void test_sev_mirror_parameters(void) kvm_vm_free(vm_no_vcpu); } +static void test_sev_move_copy(void) +{ + struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + dst_vm = aux_vm_create(true); + mirror_vm = aux_vm_create(false); + dst_mirror_vm = aux_vm_create(false); + + sev_mirror_create(mirror_vm->fd, sev_vm->fd); + ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); + TEST_ASSERT(ret == -1 && errno == EBUSY, + "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, + errno); + + /* The mirror itself can be migrated. */ + sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd); + ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd); + TEST_ASSERT(ret == -1 && errno == EBUSY, + "Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret, + errno); + + /* + * mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus, + * the owner can be copied as soon as dst_mirror_vm is gone. + */ + kvm_vm_free(dst_mirror_vm); + sev_migrate_from(dst_vm->fd, sev_vm->fd); + + kvm_vm_free(mirror_vm); + kvm_vm_free(dst_vm); + kvm_vm_free(sev_vm); +} + int main(int argc, char *argv[]) { if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) { @@ -301,6 +336,8 @@ int main(int argc, char *argv[]) test_sev_migrate_from(/* es= */ true); test_sev_migrate_locking(); test_sev_migrate_parameters(); + if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) + test_sev_move_copy(); } if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) { test_sev_mirror(/* es= */ false); From 10a37929efeb4c51a0069afdd537c4fa3831f6e5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:35 -0500 Subject: [PATCH 178/231] KVM: SEV: do not take kvm->lock when destroying Taking the lock is useless since there are no other references, and there are already accesses (e.g. to sev->enc_context_owner) that do not take it. So get rid of it. 
Reviewed-by: Sean Christopherson Message-Id: <20211123005036.2954379-12-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89a716290fac..bbbf980c7e40 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2048,8 +2048,6 @@ void sev_vm_destroy(struct kvm *kvm) return; } - mutex_lock(&kvm->lock); - /* * Ensure that all guest tagged cache entries are flushed before * releasing the pages back to the system for use. CLFLUSH will @@ -2069,8 +2067,6 @@ void sev_vm_destroy(struct kvm *kvm) } } - mutex_unlock(&kvm->lock); - sev_unbind_asid(kvm, sev->handle); sev_asid_free(sev); } From c9d61dcb0bc26a761dc84a87bd8a0d3b3c432f10 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 22 Nov 2021 19:50:36 -0500 Subject: [PATCH 179/231] KVM: SEV: accept signals in sev_lock_two_vms Generally, kvm->lock is not taken for a long time, but sev_lock_two_vms is different: it takes vCPU locks inside, so userspace can hold it back just by calling a vCPU ioctl. Play it safe and use mutex_lock_killable. 
Message-Id: <20211123005036.2954379-13-pbonzini@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index bbbf980c7e40..59727a966f90 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1547,6 +1547,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) { struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info; struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info; + int r = -EBUSY; if (dst_kvm == src_kvm) return -EINVAL; @@ -1558,14 +1559,23 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1)) return -EBUSY; - if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) { - atomic_set_release(&dst_sev->migration_in_progress, 0); - return -EBUSY; - } + if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1)) + goto release_dst; - mutex_lock(&dst_kvm->lock); - mutex_lock(&src_kvm->lock); + r = -EINTR; + if (mutex_lock_killable(&dst_kvm->lock)) + goto release_src; + if (mutex_lock_killable(&src_kvm->lock)) + goto unlock_dst; return 0; + +unlock_dst: + mutex_unlock(&dst_kvm->lock); +release_src: + atomic_set_release(&src_sev->migration_in_progress, 0); +release_dst: + atomic_set_release(&dst_sev->migration_in_progress, 0); + return r; } static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) From b83f5ac7d922e69a109261f5f940eebbd4e514c4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 29 Nov 2021 22:53:27 +0100 Subject: [PATCH 180/231] net: marvell: mvpp2: Fix the computation of shared CPUs 'bitmap_fill()' fills a bitmap one 'long' at a time. It is likely that an exact number of bits is expected. Use 'bitmap_set()' instead in order not to set unexpected bits. 
Fixes: e531f76757eb ("net: mvpp2: handle cases where more CPUs are available than s/w threads") Signed-off-by: Christophe JAILLET Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index ce486e16489c..6480696c979b 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -7458,7 +7458,7 @@ static int mvpp2_probe(struct platform_device *pdev) shared = num_present_cpus() - priv->nthreads; if (shared > 0) - bitmap_fill(&priv->lock_map, + bitmap_set(&priv->lock_map, 0, min_t(int, shared, MVPP2_MAX_THREADS)); for (i = 0; i < MVPP2_MAX_THREADS; i++) { From d1ec975f9fa6d2211c1f403010361034a87e317f Mon Sep 17 00:00:00 2001 From: Maciej Fijalkowski Date: Mon, 29 Nov 2021 15:17:46 -0800 Subject: [PATCH 181/231] ice: xsk: clear status_error0 for each allocated desc Fix a bug in which the receiving of packets can stop in the zero-copy driver. Ice HW ignores 3 lower bits from QRX_TAIL register, which means that tail is bumped only on intervals of 8. Currently with XSK RX batching in place, ice_alloc_rx_bufs_zc() clears the status_error0 only of the last descriptor that has been allocated/taken from the XSK buffer pool. status_error0 includes DD bit that is looked upon by the ice_clean_rx_irq_zc() to tell if a descriptor can be processed. The bug can be triggered when driver updates the ntu but not the QRX_TAIL, so HW wouldn't have a chance to write to the ready descriptors. Later on driver moves the ntc to the mentioned set of descriptors and interprets them as a ready to be processed, since corresponding DD bits were not cleared nor any writeback has happened that would clear it. This can then lead to ntc == ntu case which means that ring is empty and no further packet processing. 
Fix the XSK traffic hang that can be observed when the l2fwd scenario from xdpsock is used by making sure that status_error0 is cleared for each descriptor that is fed to HW, so that we are sure the driver will not process non-valid DD bits. This will also prevent the driver from processing descriptors that were allocated in place of previously processed ones but for which writeback has not happened yet. Fixes: db804cfc21e9 ("ice: Use the xsk batched rx allocation interface") Signed-off-by: Maciej Fijalkowski Reviewed-by: Alexander Lobakin Signed-off-by: Tony Nguyen Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ice/ice_xsk.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c index ff55cb415b11..bb9a80847298 100644 --- a/drivers/net/ethernet/intel/ice/ice_xsk.c +++ b/drivers/net/ethernet/intel/ice/ice_xsk.c @@ -383,6 +383,7 @@ bool ice_alloc_rx_bufs_zc(struct ice_rx_ring *rx_ring, u16 count) while (i--) { dma = xsk_buff_xdp_get_dma(*xdp); rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->wb.status_error0 = 0; rx_desc++; xdp++; From f4a8adbfe4841491b60c14fe610571e1422359f9 Mon Sep 17 00:00:00 2001 From: Dongliang Mu Date: Tue, 30 Nov 2021 12:05:54 +0800 Subject: [PATCH 182/231] dpaa2-eth: destroy workqueue at the end of remove function The commit c55211892f46 ("dpaa2-eth: support PTP Sync packet one-step timestamping") forgets to destroy the workqueue at the end of the remove function. Fix this by calling destroy_workqueue() after fsl_mc_portal_free() and before free_netdev(). Fixes: c55211892f46 ("dpaa2-eth: support PTP Sync packet one-step timestamping") Signed-off-by: Dongliang Mu Signed-off-by: David S.
Miller --- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 6451c8383639..8e643567abce 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4550,6 +4550,8 @@ static int dpaa2_eth_remove(struct fsl_mc_device *ls_dev) fsl_mc_portal_free(priv->mc_io); + destroy_workqueue(priv->dpaa2_ptp_wq); + dev_dbg(net_dev->dev.parent, "Removed interface %s\n", net_dev->name); free_netdev(net_dev); From 34d8778a943761121f391b7921f79a7adbe1feaf Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Tue, 30 Nov 2021 08:33:58 +0100 Subject: [PATCH 183/231] MAINTAINERS: s390/net: add Alexandra and Wenjia as maintainer Add Alexandra and Wenjia as maintainers for drivers/s390/net and iucv. Also, remove myself as maintainer for these areas. Signed-off-by: Karsten Graul Acked-by: Alexandra Winter Acked-by: Wenjia Zhang Signed-off-by: David S. 
Miller --- MAINTAINERS | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 360e9aa0205d..43d8fac7fb7c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16623,7 +16623,8 @@ F: drivers/iommu/s390-iommu.c S390 IUCV NETWORK LAYER M: Julian Wiedmann -M: Karsten Graul +M: Alexandra Winter +M: Wenjia Zhang L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported @@ -16634,7 +16635,8 @@ F: net/iucv/ S390 NETWORK DRIVERS M: Julian Wiedmann -M: Karsten Graul +M: Alexandra Winter +M: Wenjia Zhang L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported From e90e51d5f01d2baae5dcce280866bbb96816e978 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Nov 2021 07:36:41 -0500 Subject: [PATCH 184/231] KVM: VMX: clear vmx_x86_ops.sync_pir_to_irr if APICv is disabled There is nothing to synchronize if APICv is disabled, since neither other vCPUs nor assigned devices can set PIR.ON. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1fadec8cbf96..f90448809690 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7777,10 +7777,10 @@ static __init int hardware_setup(void) ple_window_shrink = 0; } - if (!cpu_has_vmx_apicv()) { + if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (!enable_apicv) vmx_x86_ops.sync_pir_to_irr = NULL; - } if (cpu_has_vmx_tsc_scaling()) { kvm_has_tsc_control = true; From 7cfc5c653b07782e7059527df8dc1e3143a7591e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 30 Nov 2021 03:46:07 -0500 Subject: [PATCH 185/231] KVM: fix avic_set_running for preemptable kernels avic_set_running() passes the current CPU to avic_vcpu_load(), albeit via vcpu->cpu rather than smp_processor_id(). If the thread is migrated while avic_set_running runs, the call to avic_vcpu_load() can use a stale value for the processor id. 
Avoid this by blocking preemption over the entire execution of avic_set_running(). Reported-by: Sean Christopherson Fixes: 8221c1370056 ("svm: Manage vcpu load/unload when enable AVIC") Cc: stable@vger.kernel.org Reviewed-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index affc0ea98d30..9d6066eb7c10 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -989,16 +989,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { struct vcpu_svm *svm = to_svm(vcpu); + int cpu = get_cpu(); + WARN_ON(cpu != vcpu->cpu); svm->avic_is_running = is_run; - if (!kvm_vcpu_apicv_active(vcpu)) - return; - - if (is_run) - avic_vcpu_load(vcpu, vcpu->cpu); - else - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) { + if (is_run) + avic_vcpu_load(vcpu, cpu); + else + avic_vcpu_put(vcpu); + } + put_cpu(); } void svm_vcpu_blocking(struct kvm_vcpu *vcpu) From d85ffff5302b1509efc482e8877c253b0a668b33 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 30 Nov 2021 14:47:31 +0200 Subject: [PATCH 186/231] ALSA: hda: Add Intel DG2 PCI ID and HDMI codec vid Add HD Audio PCI ID and HDMI codec vendor ID for Intel DG2. 
Reviewed-by: Uma Shankar Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20211130124732.696896-1-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 12 +++++++++++- sound/pci/hda/patch_hdmi.c | 1 + 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index fe51163f2d82..1b46b599a5cf 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -335,7 +335,10 @@ enum { ((pci)->device == 0x0c0c) || \ ((pci)->device == 0x0d0c) || \ ((pci)->device == 0x160c) || \ - ((pci)->device == 0x490d)) + ((pci)->device == 0x490d) || \ + ((pci)->device == 0x4f90) || \ + ((pci)->device == 0x4f91) || \ + ((pci)->device == 0x4f92)) #define IS_BXT(pci) ((pci)->vendor == 0x8086 && (pci)->device == 0x5a98) @@ -2473,6 +2476,13 @@ static const struct pci_device_id azx_ids[] = { /* DG1 */ { PCI_DEVICE(0x8086, 0x490d), .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, + /* DG2 */ + { PCI_DEVICE(0x8086, 0x4f90), + .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, + { PCI_DEVICE(0x8086, 0x4f91), + .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, + { PCI_DEVICE(0x8086, 0x4f92), + .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, /* Alderlake-S */ { PCI_DEVICE(0x8086, 0x7ad0), .driver_data = AZX_DRIVER_SKL | AZX_DCAPS_INTEL_SKYLAKE}, diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 65d2c5539919..98633d2684de 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4382,6 +4382,7 @@ HDA_CODEC_ENTRY(0x80862814, "DG1 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862815, "Alderlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862816, "Rocketlake HDMI", patch_i915_tgl_hdmi), +HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI", patch_i915_icl_hdmi), 
HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), From 289047db1143c42c81820352f195a393ff639a52 Mon Sep 17 00:00:00 2001 From: Kai Vehmanen Date: Tue, 30 Nov 2021 14:47:32 +0200 Subject: [PATCH 187/231] ALSA: hda/hdmi: fix HDA codec entry table order for ADL-P Keep the HDA_CODEC_ENTRY entries sorted by the codec VID. ADL-P is the only misplaced Intel HDMI codec. Signed-off-by: Kai Vehmanen Link: https://lore.kernel.org/r/20211130124732.696896-2-kai.vehmanen@linux.intel.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 98633d2684de..415701bd10ac 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -4380,11 +4380,11 @@ HDA_CODEC_ENTRY(0x8086280f, "Icelake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x80862812, "Tigerlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862814, "DG1 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862815, "Alderlake HDMI", patch_i915_tgl_hdmi), -HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862816, "Rocketlake HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862819, "DG2 HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x8086281a, "Jasperlake HDMI", patch_i915_icl_hdmi), HDA_CODEC_ENTRY(0x8086281b, "Elkhartlake HDMI", patch_i915_icl_hdmi), +HDA_CODEC_ENTRY(0x8086281c, "Alderlake-P HDMI", patch_i915_tgl_hdmi), HDA_CODEC_ENTRY(0x80862880, "CedarTrail HDMI", patch_generic_hdmi), HDA_CODEC_ENTRY(0x80862882, "Valleyview2 HDMI", patch_i915_byt_hdmi), HDA_CODEC_ENTRY(0x80862883, "Braswell HDMI", patch_i915_byt_hdmi), From d6e6a27d960f9f07aef0b979c49c6736ede28f75 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 30 Nov 2021 14:13:16 +0000 Subject: [PATCH 188/231] tools: Fix math.h breakage Commit 98e1385ef24b 
("include/linux/radix-tree.h: replace kernel.h with the necessary inclusions") broke the radix tree test suite in two different ways; first by including math.h which didn't exist in the tools directory, and second by removing an implicit include of spinlock.h before lockdep.h. Fix both issues. Cc: Andy Shevchenko Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Andy Shevchenko Signed-off-by: Linus Torvalds --- tools/include/linux/kernel.h | 22 +-------------------- tools/include/linux/math.h | 25 ++++++++++++++++++++++++ tools/testing/radix-tree/linux/lockdep.h | 3 +++ 3 files changed, 29 insertions(+), 21 deletions(-) create mode 100644 tools/include/linux/math.h diff --git a/tools/include/linux/kernel.h b/tools/include/linux/kernel.h index a7e54a08fb54..3e8df500cfbd 100644 --- a/tools/include/linux/kernel.h +++ b/tools/include/linux/kernel.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -14,8 +15,6 @@ #define UINT_MAX (~0U) #endif -#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) - #define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) #define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) @@ -52,15 +51,6 @@ _min1 < _min2 ? _min1 : _min2; }) #endif -#ifndef roundup -#define roundup(x, y) ( \ -{ \ - const typeof(y) __y = y; \ - (((x) + (__y - 1)) / __y) * __y; \ -} \ -) -#endif - #ifndef BUG_ON #ifdef NDEBUG #define BUG_ON(cond) do { if (cond) {} } while (0) @@ -104,16 +94,6 @@ int scnprintf_pad(char * buf, size_t size, const char * fmt, ...); #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) -/* - * This looks more complex than it should be. But we need to - * get the type for the ~ right in round_down (it needs to be - * as wide as the result!), and we want to evaluate the macro - * arguments just once each. 
- */ -#define __round_mask(x, y) ((__typeof__(x))((y)-1)) -#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) -#define round_down(x, y) ((x) & ~__round_mask(x, y)) - #define current_gfp_context(k) 0 #define synchronize_rcu() diff --git a/tools/include/linux/math.h b/tools/include/linux/math.h new file mode 100644 index 000000000000..4e7af99ec9eb --- /dev/null +++ b/tools/include/linux/math.h @@ -0,0 +1,25 @@ +#ifndef _TOOLS_MATH_H +#define _TOOLS_MATH_H + +/* + * This looks more complex than it should be. But we need to + * get the type for the ~ right in round_down (it needs to be + * as wide as the result!), and we want to evaluate the macro + * arguments just once each. + */ +#define __round_mask(x, y) ((__typeof__(x))((y)-1)) +#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) +#define round_down(x, y) ((x) & ~__round_mask(x, y)) + +#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) + +#ifndef roundup +#define roundup(x, y) ( \ +{ \ + const typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#endif + +#endif diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h index 565fccdfe6e9..016cff473cfc 100644 --- a/tools/testing/radix-tree/linux/lockdep.h +++ b/tools/testing/radix-tree/linux/lockdep.h @@ -1,5 +1,8 @@ #ifndef _LINUX_LOCKDEP_H #define _LINUX_LOCKDEP_H + +#include + struct lock_class_key { unsigned int a; }; From 58e1100fdc5990b0cc0d4beaf2562a92e621ac7d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Tue, 30 Nov 2021 13:43:15 -0500 Subject: [PATCH 189/231] MAINTAINERS: co-maintain random.c random.c is a bit understaffed, and folks want more prompt reviews. I've got the crypto background and the interest to do these reviews, and have authored parts of the file already. Cc: Theodore Ts'o Cc: Greg Kroah-Hartman Signed-off-by: Jason A. 
Donenfeld Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 360e9aa0205d..913856599623 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15979,6 +15979,7 @@ F: arch/mips/generic/board-ranchu.c RANDOM NUMBER DRIVER M: "Theodore Ts'o" +M: Jason A. Donenfeld S: Maintained F: drivers/char/random.c From f123cffdd8fe8ea6c7fded4b88516a42798797d0 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 29 Nov 2021 09:53:27 -0800 Subject: [PATCH 190/231] net: netlink: af_netlink: Prevent empty skb by adding a check on len. Adding a check on len parameter to avoid empty skb. This prevents a division error in netem_enqueue function which is caused when skb->len=0 and skb->data_len=0 in the randomized corruption step as shown below. skb->data[prandom_u32() % skb_headlen(skb)] ^= 1<<(prandom_u32() % 8); Crash Report: [ 343.170349] netdevsim netdevsim0 netdevsim3: set [1, 0] type 2 family 0 port 6081 - 0 [ 343.216110] netem: version 1.3 [ 343.235841] divide error: 0000 [#1] PREEMPT SMP KASAN NOPTI [ 343.236680] CPU: 3 PID: 4288 Comm: reproducer Not tainted 5.16.0-rc1+ [ 343.237569] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 343.238707] RIP: 0010:netem_enqueue+0x1590/0x33c0 [sch_netem] [ 343.239499] Code: 89 85 58 ff ff ff e8 5f 5d e9 d3 48 8b b5 48 ff ff ff 8b 8d 50 ff ff ff 8b 85 58 ff ff ff 48 8b bd 70 ff ff ff 31 d2 2b 4f 74 f1 48 b8 00 00 00 00 00 fc ff df 49 01 d5 4c 89 e9 48 c1 e9 03 [ 343.241883] RSP: 0018:ffff88800bcd7368 EFLAGS: 00010246 [ 343.242589] RAX: 00000000ba7c0a9c RBX: 0000000000000001 RCX: 0000000000000000 [ 343.243542] RDX: 0000000000000000 RSI: ffff88800f8edb10 RDI: ffff88800f8eda40 [ 343.244474] RBP: ffff88800bcd7458 R08: 0000000000000000 R09: ffffffff94fb8445 [ 343.245403] R10: ffffffff94fb8336 R11: ffffffff94fb8445 R12: 0000000000000000 [ 343.246355] R13: ffff88800a5a7000 R14: ffff88800a5b5800 R15: 0000000000000020 [ 343.247291] FS: 
00007fdde2bd7700(0000) GS:ffff888109780000(0000) knlGS:0000000000000000 [ 343.248350] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 343.249120] CR2: 00000000200000c0 CR3: 000000000ef4c000 CR4: 00000000000006e0 [ 343.250076] Call Trace: [ 343.250423] [ 343.250713] ? memcpy+0x4d/0x60 [ 343.251162] ? netem_init+0xa0/0xa0 [sch_netem] [ 343.251795] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.252443] netem_enqueue+0xe28/0x33c0 [sch_netem] [ 343.253102] ? stack_trace_save+0x87/0xb0 [ 343.253655] ? filter_irq_stacks+0xb0/0xb0 [ 343.254220] ? netem_init+0xa0/0xa0 [sch_netem] [ 343.254837] ? __kasan_check_write+0x14/0x20 [ 343.255418] ? _raw_spin_lock+0x88/0xd6 [ 343.255953] dev_qdisc_enqueue+0x50/0x180 [ 343.256508] __dev_queue_xmit+0x1a7e/0x3090 [ 343.257083] ? netdev_core_pick_tx+0x300/0x300 [ 343.257690] ? check_kcov_mode+0x10/0x40 [ 343.258219] ? _raw_spin_unlock_irqrestore+0x29/0x40 [ 343.258899] ? __kasan_init_slab_obj+0x24/0x30 [ 343.259529] ? setup_object.isra.71+0x23/0x90 [ 343.260121] ? new_slab+0x26e/0x4b0 [ 343.260609] ? kasan_poison+0x3a/0x50 [ 343.261118] ? kasan_unpoison+0x28/0x50 [ 343.261637] ? __kasan_slab_alloc+0x71/0x90 [ 343.262214] ? memcpy+0x4d/0x60 [ 343.262674] ? write_comp_data+0x2f/0x90 [ 343.263209] ? __kasan_check_write+0x14/0x20 [ 343.263802] ? __skb_clone+0x5d6/0x840 [ 343.264329] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.264958] dev_queue_xmit+0x1c/0x20 [ 343.265470] netlink_deliver_tap+0x652/0x9c0 [ 343.266067] netlink_unicast+0x5a0/0x7f0 [ 343.266608] ? netlink_attachskb+0x860/0x860 [ 343.267183] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.267820] ? write_comp_data+0x2f/0x90 [ 343.268367] netlink_sendmsg+0x922/0xe80 [ 343.268899] ? netlink_unicast+0x7f0/0x7f0 [ 343.269472] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.270099] ? write_comp_data+0x2f/0x90 [ 343.270644] ? netlink_unicast+0x7f0/0x7f0 [ 343.271210] sock_sendmsg+0x155/0x190 [ 343.271721] ____sys_sendmsg+0x75f/0x8f0 [ 343.272262] ? kernel_sendmsg+0x60/0x60 [ 343.272788] ? 
write_comp_data+0x2f/0x90 [ 343.273332] ? write_comp_data+0x2f/0x90 [ 343.273869] ___sys_sendmsg+0x10f/0x190 [ 343.274405] ? sendmsg_copy_msghdr+0x80/0x80 [ 343.274984] ? slab_post_alloc_hook+0x70/0x230 [ 343.275597] ? futex_wait_setup+0x240/0x240 [ 343.276175] ? security_file_alloc+0x3e/0x170 [ 343.276779] ? write_comp_data+0x2f/0x90 [ 343.277313] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.277969] ? write_comp_data+0x2f/0x90 [ 343.278515] ? __fget_files+0x1ad/0x260 [ 343.279048] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.279685] ? write_comp_data+0x2f/0x90 [ 343.280234] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.280874] ? sockfd_lookup_light+0xd1/0x190 [ 343.281481] __sys_sendmsg+0x118/0x200 [ 343.281998] ? __sys_sendmsg_sock+0x40/0x40 [ 343.282578] ? alloc_fd+0x229/0x5e0 [ 343.283070] ? write_comp_data+0x2f/0x90 [ 343.283610] ? write_comp_data+0x2f/0x90 [ 343.284135] ? __sanitizer_cov_trace_pc+0x21/0x60 [ 343.284776] ? ktime_get_coarse_real_ts64+0xb8/0xf0 [ 343.285450] __x64_sys_sendmsg+0x7d/0xc0 [ 343.285981] ? 
syscall_enter_from_user_mode+0x4d/0x70 [ 343.286664] do_syscall_64+0x3a/0x80 [ 343.287158] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 343.287850] RIP: 0033:0x7fdde24cf289 [ 343.288344] Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d b7 db 2c 00 f7 d8 64 89 01 48 [ 343.290729] RSP: 002b:00007fdde2bd6d98 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 343.291730] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fdde24cf289 [ 343.292673] RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000004 [ 343.293618] RBP: 00007fdde2bd6e20 R08: 0000000100000001 R09: 0000000000000000 [ 343.294557] R10: 0000000100000001 R11: 0000000000000246 R12: 0000000000000000 [ 343.295493] R13: 0000000000021000 R14: 0000000000000000 R15: 00007fdde2bd7700 [ 343.296432] [ 343.296735] Modules linked in: sch_netem ip6_vti ip_vti ip_gre ipip sit ip_tunnel geneve macsec macvtap tap ipvlan macvlan 8021q garp mrp hsr wireguard libchacha20poly1305 chacha_x86_64 poly1305_x86_64 ip6_udp_tunnel udp_tunnel libblake2s blake2s_x86_64 libblake2s_generic curve25519_x86_64 libcurve25519_generic libchacha xfrm_interface xfrm6_tunnel tunnel4 veth netdevsim psample batman_adv nlmon dummy team bonding tls vcan ip6_gre ip6_tunnel tunnel6 gre tun ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_security iptable_raw ebtable_filter ebtables rfkill ip6table_filter ip6_tables iptable_filter ppdev bochs drm_vram_helper drm_ttm_helper ttm drm_kms_helper cec parport_pc drm joydev floppy parport sg syscopyarea sysfillrect sysimgblt i2c_piix4 qemu_fw_cfg fb_sys_fops pcspkr [ 343.297459] ip_tables xfs virtio_net net_failover failover sd_mod sr_mod cdrom t10_pi ata_generic pata_acpi 
ata_piix libata virtio_pci virtio_pci_legacy_dev serio_raw virtio_pci_modern_dev dm_mirror dm_region_hash dm_log dm_mod [ 343.311074] Dumping ftrace buffer: [ 343.311532] (ftrace buffer empty) [ 343.312040] ---[ end trace a2e3db5a6ae05099 ]--- [ 343.312691] RIP: 0010:netem_enqueue+0x1590/0x33c0 [sch_netem] [ 343.313481] Code: 89 85 58 ff ff ff e8 5f 5d e9 d3 48 8b b5 48 ff ff ff 8b 8d 50 ff ff ff 8b 85 58 ff ff ff 48 8b bd 70 ff ff ff 31 d2 2b 4f 74 f1 48 b8 00 00 00 00 00 fc ff df 49 01 d5 4c 89 e9 48 c1 e9 03 [ 343.315893] RSP: 0018:ffff88800bcd7368 EFLAGS: 00010246 [ 343.316622] RAX: 00000000ba7c0a9c RBX: 0000000000000001 RCX: 0000000000000000 [ 343.317585] RDX: 0000000000000000 RSI: ffff88800f8edb10 RDI: ffff88800f8eda40 [ 343.318549] RBP: ffff88800bcd7458 R08: 0000000000000000 R09: ffffffff94fb8445 [ 343.319503] R10: ffffffff94fb8336 R11: ffffffff94fb8445 R12: 0000000000000000 [ 343.320455] R13: ffff88800a5a7000 R14: ffff88800a5b5800 R15: 0000000000000020 [ 343.321414] FS: 00007fdde2bd7700(0000) GS:ffff888109780000(0000) knlGS:0000000000000000 [ 343.322489] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 343.323283] CR2: 00000000200000c0 CR3: 000000000ef4c000 CR4: 00000000000006e0 [ 343.324264] Kernel panic - not syncing: Fatal exception in interrupt [ 343.333717] Dumping ftrace buffer: [ 343.334175] (ftrace buffer empty) [ 343.334653] Kernel Offset: 0x13600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 343.336027] Rebooting in 86400 seconds.. 
Reported-by: syzkaller Signed-off-by: Harshit Mogalapalli Link: https://lore.kernel.org/r/20211129175328.55339-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 4c575324a985..9eba2e648385 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1852,6 +1852,11 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + if (len == 0) { + pr_warn_once("Zero length message leads to an empty skb\n"); + return -ENODATA; + } + err = scm_send(sock, msg, &scm, true); if (err < 0) return err; From b0f38e15979fa8851e88e8aa371367f264e7b6e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 29 Nov 2021 22:39:47 -0800 Subject: [PATCH 191/231] natsemi: xtensa: fix section mismatch warnings Fix section mismatch warnings in xtsonic. The first one appears to be bogus and after fixing the second one, the first one is gone. WARNING: modpost: vmlinux.o(.text+0x529adc): Section mismatch in reference from the function sonic_get_stats() to the function .init.text:set_reset_devices() The function sonic_get_stats() references the function __init set_reset_devices(). This is often because sonic_get_stats lacks a __init annotation or the annotation of set_reset_devices is wrong. WARNING: modpost: vmlinux.o(.text+0x529b3b): Section mismatch in reference from the function xtsonic_probe() to the function .init.text:sonic_probe1() The function xtsonic_probe() references the function __init sonic_probe1(). This is often because xtsonic_probe lacks a __init annotation or the annotation of sonic_probe1 is wrong. 
Fixes: 74f2a5f0ef64 ("xtensa: Add support for the Sonic Ethernet device for the XT2000 board.") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Christophe JAILLET Cc: Finn Thain Cc: Chris Zankel Cc: linux-xtensa@linux-xtensa.org Cc: Thomas Bogendoerfer Acked-by: Max Filippov Link: https://lore.kernel.org/r/20211130063947.7529-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/natsemi/xtsonic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/natsemi/xtsonic.c b/drivers/net/ethernet/natsemi/xtsonic.c index ca4686094701..0a02d8bd0a3e 100644 --- a/drivers/net/ethernet/natsemi/xtsonic.c +++ b/drivers/net/ethernet/natsemi/xtsonic.c @@ -120,7 +120,7 @@ static const struct net_device_ops xtsonic_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, }; -static int __init sonic_probe1(struct net_device *dev) +static int sonic_probe1(struct net_device *dev) { unsigned int silicon_revision; struct sonic_local *lp = netdev_priv(dev); From c65d638ab39034cbaa36773b980d28106cfc81fa Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Wed, 17 Nov 2021 13:33:57 +0200 Subject: [PATCH 192/231] net/mlx5e: IPsec: Fix Software parser inner l3 type setting in case of encapsulation The current code wrongly uses the skb->protocol field, which reflects the outer l3 protocol, to set the inner l3 type in the Software Parser (SWP) fields of the ethernet segment (eseg). In flows where an inner l3 header exists, such as VXLAN over ESP, this uses the outer protocol type instead of the inner one, thus breaking cases where the inner and outer headers have different protocols. Fix by setting the inner l3 type in SWP according to the inner l3 IP header version.
Fixes: 2ac9cfe78223 ("net/mlx5e: IPSec, Add Innova IPSec offload TX data path") Signed-off-by: Raed Salem Reviewed-by: Maor Dickman Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c index fb5397324aa4..2db9573a3fe6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_rxtx.c @@ -191,7 +191,7 @@ static void mlx5e_ipsec_set_swp(struct sk_buff *skb, eseg->swp_inner_l3_offset = skb_inner_network_offset(skb) / 2; eseg->swp_inner_l4_offset = (skb->csum_start + skb->head - skb->data) / 2; - if (skb->protocol == htons(ETH_P_IPV6)) + if (inner_ip_hdr(skb)->version == 6) eseg->swp_flags |= MLX5_ETH_WQE_SWP_INNER_L3_IPV6; break; default: From 51ebf5db67f5c6aed79c05f1aa5137bdf5ca6614 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 8 Jul 2021 12:48:24 +0300 Subject: [PATCH 193/231] net/mlx5e: Fix missing IPsec statistics on uplink representor The cited patch added IPsec support to the uplink representor. However, uplink representors have their own private statistics, and the IPsec stats are not part of them, which effectively hides the IPsec stats when the uplink representor stats are queried. Resolve by adding the IPsec stats to the uplink representor private statistics.
Fixes: 5589b8f1a2c7 ("net/mlx5e: Add IPsec support to uplink representor") Signed-off-by: Raed Salem Reviewed-by: Alaa Hleihel Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index e58a9ec42553..48895d79796a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1080,6 +1080,10 @@ static mlx5e_stats_grp_t mlx5e_ul_rep_stats_grps[] = { &MLX5E_STATS_GRP(pme), &MLX5E_STATS_GRP(channels), &MLX5E_STATS_GRP(per_port_buff_congest), +#ifdef CONFIG_MLX5_EN_IPSEC + &MLX5E_STATS_GRP(ipsec_sw), + &MLX5E_STATS_GRP(ipsec_hw), +#endif }; static unsigned int mlx5e_ul_rep_stats_grps_num(struct mlx5e_priv *priv) From 4cce2ccf08fbc27ae34ce0e72db15166e7b5f6a7 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 13 Sep 2021 13:54:30 +0300 Subject: [PATCH 194/231] net/mlx5e: Sync TIR params updates against concurrent create/modify Transport Interface Receive (TIR) objects perform the packet processing and reassembly and is also responsible for demultiplexing the packets into the different RQs. There are certain TIR context attributes that propagate to the pointed RQs and applied to them (like packet_merge offloads (LRO/SHAMPO) and tunneled_offload_en). When TIRs do not agree on attributes values, a "last one wins" policy is applied. Hence, if not synced properly, a race between TIR params update and a concurrent TIR create/modify operation might yield to a mismatch between the shadow parameters in SW and the actual applied state of the RQs in HW. tunneled_offload_en is a fixed attribute per profile, while packet merge offload state might be toggled and get out-of-sync. When this happens, packet_merge offload might be working although not requested, or the opposite. 
All updates to packet_merge state and all create/modify operations of regular redirection/steering TIRs are done under the same priv->state_lock, so they do not run in parallel, and no race is possible. However, there are other kind of TIRs (acceleration offloads TIRs, like TLS TIRs) which are created on demand for each new connection without holding the coarse priv->state_lock, hence might race. Fix this by synchronizing all packet_merge state reads and writes against all TIR create/modify operations. Include the modify operations of the regular redirection steering TIRs under the new lock, for better code layering and division of responsibilities. Fixes: 1182f3659357 ("net/mlx5e: kTLS, Add kTLS RX HW offload support") Signed-off-by: Tariq Toukan Reviewed-by: Moshe Shemesh Reviewed-by: Maxim Mikityanskiy Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/rx_res.c | 41 ++++++++++++++++++- .../ethernet/mellanox/mlx5/core/en/rx_res.h | 6 +-- .../mellanox/mlx5/core/en_accel/ktls_rx.c | 24 +---------- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c index 142953847996..0015a81eb9a1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.c @@ -13,6 +13,9 @@ struct mlx5e_rx_res { unsigned int max_nch; u32 drop_rqn; + struct mlx5e_packet_merge_param pkt_merge_param; + struct rw_semaphore pkt_merge_param_sem; + struct mlx5e_rss *rss[MLX5E_MAX_NUM_RSS]; bool rss_active; u32 rss_rqns[MLX5E_INDIR_RQT_SIZE]; @@ -392,6 +395,7 @@ static int mlx5e_rx_res_ptp_init(struct mlx5e_rx_res *res) if (err) goto out; + /* Separated from the channels RQs, does not share pkt_merge state with them */ mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, mlx5e_rqt_get_rqtn(&res->ptp.rqt), inner_ft_support); @@ -447,6 +451,9 @@ int mlx5e_rx_res_init(struct mlx5e_rx_res *res, 
struct mlx5_core_dev *mdev, res->max_nch = max_nch; res->drop_rqn = drop_rqn; + res->pkt_merge_param = *init_pkt_merge_param; + init_rwsem(&res->pkt_merge_param_sem); + err = mlx5e_rx_res_rss_init_def(res, init_pkt_merge_param, init_nch); if (err) goto err_out; @@ -513,7 +520,7 @@ u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res) return mlx5e_tir_get_tirn(&res->ptp.tir); } -u32 mlx5e_rx_res_get_rqtn_direct(struct mlx5e_rx_res *res, unsigned int ix) +static u32 mlx5e_rx_res_get_rqtn_direct(struct mlx5e_rx_res *res, unsigned int ix) { return mlx5e_rqt_get_rqtn(&res->channels[ix].direct_rqt); } @@ -656,6 +663,9 @@ int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, if (!builder) return -ENOMEM; + down_write(&res->pkt_merge_param_sem); + res->pkt_merge_param = *pkt_merge_param; + mlx5e_tir_builder_build_packet_merge(builder, pkt_merge_param); final_err = 0; @@ -681,6 +691,7 @@ int mlx5e_rx_res_packet_merge_set_param(struct mlx5e_rx_res *res, } } + up_write(&res->pkt_merge_param_sem); mlx5e_tir_builder_free(builder); return final_err; } @@ -689,3 +700,31 @@ struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res * { return mlx5e_rss_get_hash(res->rss[0]); } + +int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, + struct mlx5e_tir *tir) +{ + bool inner_ft_support = res->features & MLX5E_RX_RES_FEATURE_INNER_FT; + struct mlx5e_tir_builder *builder; + u32 rqtn; + int err; + + builder = mlx5e_tir_builder_alloc(false); + if (!builder) + return -ENOMEM; + + rqtn = mlx5e_rx_res_get_rqtn_direct(res, rxq); + + mlx5e_tir_builder_build_rqt(builder, res->mdev->mlx5e_res.hw_objs.td.tdn, rqtn, + inner_ft_support); + mlx5e_tir_builder_build_direct(builder); + mlx5e_tir_builder_build_tls(builder); + down_read(&res->pkt_merge_param_sem); + mlx5e_tir_builder_build_packet_merge(builder, &res->pkt_merge_param); + err = mlx5e_tir_init(tir, builder, res->mdev, false); + up_read(&res->pkt_merge_param_sem); + + 
mlx5e_tir_builder_free(builder); + + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h index d09f7d174a51..b39b20a720e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rx_res.h @@ -37,9 +37,6 @@ u32 mlx5e_rx_res_get_tirn_rss(struct mlx5e_rx_res *res, enum mlx5_traffic_types u32 mlx5e_rx_res_get_tirn_rss_inner(struct mlx5e_rx_res *res, enum mlx5_traffic_types tt); u32 mlx5e_rx_res_get_tirn_ptp(struct mlx5e_rx_res *res); -/* RQTN getters for modules that create their own TIRs */ -u32 mlx5e_rx_res_get_rqtn_direct(struct mlx5e_rx_res *res, unsigned int ix); - /* Activate/deactivate API */ void mlx5e_rx_res_channels_activate(struct mlx5e_rx_res *res, struct mlx5e_channels *chs); void mlx5e_rx_res_channels_deactivate(struct mlx5e_rx_res *res); @@ -69,4 +66,7 @@ struct mlx5e_rss *mlx5e_rx_res_rss_get(struct mlx5e_rx_res *res, u32 rss_idx); /* Workaround for hairpin */ struct mlx5e_rss_params_hash mlx5e_rx_res_get_current_hash(struct mlx5e_rx_res *res); +/* Accel TIRs */ +int mlx5e_rx_res_tls_tir_create(struct mlx5e_rx_res *res, unsigned int rxq, + struct mlx5e_tir *tir); #endif /* __MLX5_EN_RX_RES_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index a2a9f68579dd..15711814d2d2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -100,25 +100,6 @@ mlx5e_ktls_rx_resync_create_resp_list(void) return resp_list; } -static int mlx5e_ktls_create_tir(struct mlx5_core_dev *mdev, struct mlx5e_tir *tir, u32 rqtn) -{ - struct mlx5e_tir_builder *builder; - int err; - - builder = mlx5e_tir_builder_alloc(false); - if (!builder) - return -ENOMEM; - - mlx5e_tir_builder_build_rqt(builder, mdev->mlx5e_res.hw_objs.td.tdn, rqtn, false); - 
mlx5e_tir_builder_build_direct(builder); - mlx5e_tir_builder_build_tls(builder); - err = mlx5e_tir_init(tir, builder, mdev, false); - - mlx5e_tir_builder_free(builder); - - return err; -} - static void accel_rule_handle_work(struct work_struct *work) { struct mlx5e_ktls_offload_context_rx *priv_rx; @@ -609,7 +590,6 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, struct mlx5_core_dev *mdev; struct mlx5e_priv *priv; int rxq, err; - u32 rqtn; tls_ctx = tls_get_ctx(sk); priv = netdev_priv(netdev); @@ -635,9 +615,7 @@ int mlx5e_ktls_add_rx(struct net_device *netdev, struct sock *sk, priv_rx->sw_stats = &priv->tls->sw_stats; mlx5e_set_ktls_rx_priv_ctx(tls_ctx, priv_rx); - rqtn = mlx5e_rx_res_get_rqtn_direct(priv->rx_res, rxq); - - err = mlx5e_ktls_create_tir(mdev, &priv_rx->tir, rqtn); + err = mlx5e_rx_res_tls_tir_create(priv->rx_res, rxq, &priv_rx->tir); if (err) goto err_create_tir; From e45c0b34493c24eeeebf89f63a5293aac7728ed7 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Fri, 5 Nov 2021 15:03:20 +0200 Subject: [PATCH 195/231] net/mlx5: Move MODIFY_RQT command to ignore list in internal error state When the device is in internal error state, command interface isn't accessible and the driver decides which commands to fail and which to ignore. 
Move the MODIFY_RQT command to the ignore list in order to avoid the following redundant warning messages in internal error state: mlx5_core 0000:82:00.1: mlx5e_rss_disable:419:(pid 23754): Failed to redirect RQT 0x0 to drop RQ 0xc00848: err = -5 mlx5_core 0000:82:00.1: mlx5e_rx_res_channels_deactivate:598:(pid 23754): Failed to redirect direct RQT 0x1 to drop RQ 0xc00848 (channel 0): err = -5 mlx5_core 0000:82:00.1: mlx5e_rx_res_channels_deactivate:607:(pid 23754): Failed to redirect XSK RQT 0x19 to drop RQ 0xc00848 (channel 0): err = -5 Fixes: 43ec0f41fa73 ("net/mlx5e: Hide all implementation details of mlx5e_rx_res") Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 8eaa24d865c5..a46284ca5172 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -341,6 +341,7 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_DEALLOC_SF: case MLX5_CMD_OP_DESTROY_UCTX: case MLX5_CMD_OP_DESTROY_UMEM: + case MLX5_CMD_OP_MODIFY_RQT: return MLX5_CMD_STAT_OK; case MLX5_CMD_OP_QUERY_HCA_CAP: @@ -446,7 +447,6 @@ static int mlx5_internal_err_ret_value(struct mlx5_core_dev *dev, u16 op, case MLX5_CMD_OP_MODIFY_TIS: case MLX5_CMD_OP_QUERY_TIS: case MLX5_CMD_OP_CREATE_RQT: - case MLX5_CMD_OP_MODIFY_RQT: case MLX5_CMD_OP_QUERY_RQT: case MLX5_CMD_OP_CREATE_FLOW_TABLE: From ffdf45315226926e5ae5faf0ff76caca68f6d39c Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 18 Nov 2021 12:29:15 +0200 Subject: [PATCH 196/231] net/mlx5: Lag, Fix recreation of VF LAG Driver needs to nullify the port select attributes of the LAG when port selection is destroyed, otherwise it breaks recreation of the LAG. 
It fixes the below kernel oops: [ 587.906377] BUG: kernel NULL pointer dereference, address: 0000000000000008 [ 587.908843] #PF: supervisor read access in kernel mode [ 587.910730] #PF: error_code(0x0000) - not-present page [ 587.912580] PGD 0 P4D 0 [ 587.913632] Oops: 0000 [#1] SMP PTI [ 587.914644] CPU: 5 PID: 165 Comm: kworker/u20:5 Tainted: G OE 5.9.0_mlnx #1 [ 587.916152] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 587.918332] Workqueue: mlx5_lag mlx5_do_bond_work [mlx5_core] [ 587.919479] RIP: 0010:mlx5_del_flow_rules+0x10/0x270 [mlx5_core] [ 587.920568] mlx5_core 0000:08:00.1 enp8s0f1: Link up [ 587.920680] Code: c0 09 80 a0 e8 cf 42 a4 e0 48 c7 c3 f4 ff ff ff e8 8a 88 dd e0 e9 ab fe ff ff 0f 1f 44 00 00 41 56 41 55 49 89 fd 41 54 55 53 <48> 8b 47 08 48 8b 68 28 48 85 ed 74 2e 48 8d 7d 38 e8 6a 64 34 e1 [ 587.925116] bond0: (slave enp8s0f1): Enslaving as an active interface with an up link [ 587.930415] RSP: 0018:ffffc9000048fd88 EFLAGS: 00010282 [ 587.930417] RAX: ffff88846c14fac0 RBX: ffff88846cddcb80 RCX: 0000000080400007 [ 587.930417] RDX: 0000000080400008 RSI: ffff88846cddcb80 RDI: 0000000000000000 [ 587.930419] RBP: ffff88845fd80140 R08: 0000000000000001 R09: ffffffffa074ba00 [ 587.938132] R10: ffff88846c14fec0 R11: 0000000000000001 R12: ffff88846c122f10 [ 587.939473] R13: 0000000000000000 R14: 0000000000000001 R15: ffff88846d7a0000 [ 587.940800] FS: 0000000000000000(0000) GS:ffff88846fa80000(0000) knlGS:0000000000000000 [ 587.942416] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 587.943536] CR2: 0000000000000008 CR3: 000000000240a002 CR4: 0000000000770ee0 [ 587.944904] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 587.946308] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 587.947639] PKRU: 55555554 [ 587.948236] Call Trace: [ 587.948834] mlx5_lag_destroy_definer.isra.3+0x16/0x90 [mlx5_core] [ 587.950033] 
mlx5_lag_destroy_definers+0x5b/0x80 [mlx5_core] [ 587.951128] mlx5_deactivate_lag+0x6e/0x80 [mlx5_core] [ 587.952146] mlx5_do_bond+0x150/0x450 [mlx5_core] [ 587.953086] mlx5_do_bond_work+0x3e/0x50 [mlx5_core] [ 587.954086] process_one_work+0x1eb/0x3e0 [ 587.954899] worker_thread+0x2d/0x3c0 [ 587.955656] ? process_one_work+0x3e0/0x3e0 [ 587.956493] kthread+0x115/0x130 [ 587.957174] ? kthread_park+0x90/0x90 [ 587.957929] ret_from_fork+0x1f/0x30 [ 587.973055] ---[ end trace 71ccd6eca89f5513 ]--- Fixes: b7267869e923 ("net/mlx5: Lag, add support to create/destroy/modify port selection") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c index ad63dd45c8fb..a6592f9c3c05 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/port_sel.c @@ -608,4 +608,5 @@ void mlx5_lag_port_sel_destroy(struct mlx5_lag *ldev) if (port_sel->tunnel) mlx5_destroy_ttc_table(port_sel->inner.ttc); mlx5_lag_destroy_definers(ldev); + memset(port_sel, 0, sizeof(*port_sel)); } From 1e59b32e45e47c8ea5455182286ba010bfa87813 Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Tue, 21 Sep 2021 15:47:33 +0300 Subject: [PATCH 197/231] net/mlx5: E-switch, Respect BW share of the new group To enable the transmit scheduler on a vport, FW requires a non-zero configuration for the vport's TSAR. If a vport is added to a group which has a configured BW share value and the TX rate values of the vport are zero, then the scheduler wouldn't be enabled on this vport. Fix that by calling BW normalization if BW share of the new group is configured.
Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Dmytro Linkin Reviewed-by: Roi Dayan Reviewed-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index c6cc67cb4f6a..4501e3d737f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -423,7 +423,7 @@ static int esw_qos_vport_update_group(struct mlx5_eswitch *esw, return err; /* Recalculate bw share weights of old and new groups */ - if (vport->qos.bw_share) { + if (vport->qos.bw_share || new_group->bw_share) { esw_qos_normalize_vports_min_rate(esw, curr_group, extack); esw_qos_normalize_vports_min_rate(esw, new_group, extack); } From 43a0696f11567278b9412f947e43dd7906c831a8 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 21 Oct 2021 12:46:17 +0000 Subject: [PATCH 198/231] net/mlx5: E-Switch, fix single FDB creation on BlueField Always use MLX5_FLOW_TABLE_OTHER_VPORT flag when creating egress ACL table for single FDB. Not doing so on BlueField will make firmware fail the command. On BlueField the E-Switch manager is the ECPF (vport 0xFFFE) which is filled in the flow table creation command but as the other_vport field wasn't set the firmware complains about a bad parameter. This is different from a regular HCA where the E-Switch manager vport is the PF (vport 0x0). Passing MLX5_FLOW_TABLE_OTHER_VPORT will make the firmware happy both on BlueField and on regular HCAs without special condition for each. 
This fixes the below firmware syndrome: mlx5_cmd_check:819:(pid 571): CREATE_FLOW_TABLE(0x930) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x754a4) Fixes: db202995f503 ("net/mlx5: E-Switch, add logic to enable shared FDB") Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index a46455694f7a..275af1d2b4d3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2512,6 +2512,7 @@ static int esw_set_master_egress_rule(struct mlx5_core_dev *master, struct mlx5_eswitch *esw = master->priv.eswitch; struct mlx5_flow_table_attr ft_attr = { .max_fte = 1, .prio = 0, .level = 0, + .flags = MLX5_FLOW_TABLE_OTHER_VPORT, }; struct mlx5_flow_namespace *egress_ns; struct mlx5_flow_table *acl; From 5c4e8ae7aa4875041102406801ee434e6c581aef Mon Sep 17 00:00:00 2001 From: Dmytro Linkin Date: Wed, 17 Nov 2021 11:47:21 +0200 Subject: [PATCH 199/231] net/mlx5: E-Switch, Check group pointer before reading bw_share value If log_esw_max_sched_depth is not supported, the group pointer of the vport is NULL. Hence, check the pointer before reading the bw_share value.
Fixes: 0fe132eac38c ("net/mlx5: E-switch, Allow to add vports to rate groups") Signed-off-by: Dmytro Linkin Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c index 4501e3d737f8..d377ddc70fc7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/qos.c @@ -130,7 +130,7 @@ static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw, /* If vports min rate divider is 0 but their group has bw_share configured, then * need to set bw_share for vports to minimal value. */ - if (!group_level && !max_guarantee && group->bw_share) + if (!group_level && !max_guarantee && group && group->bw_share) return 1; return 0; } From e219440da0c3a63b3cec23d08473436ae7d95fa6 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Tue, 23 Nov 2021 14:37:11 +0200 Subject: [PATCH 200/231] net/mlx5: E-Switch, Use indirect table only if all destinations support it When adding rule with multiple destinations, indirect table is used for all of the destinations if at least one of the destinations support it, this can cause creation of invalid indirect tables for the destinations that doesn't support it. Fixed it by using indirect table only if all destinations support it. 
Fixes: a508728a4c8b ("net/mlx5e: VF tunnel RX traffic offloading") Signed-off-by: Maor Dickman Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../mellanox/mlx5/core/eswitch_offloads.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 275af1d2b4d3..32bc08a39925 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -329,14 +329,25 @@ static bool esw_is_indir_table(struct mlx5_eswitch *esw, struct mlx5_flow_attr *attr) { struct mlx5_esw_flow_attr *esw_attr = attr->esw_attr; + bool result = false; int i; - for (i = esw_attr->split_count; i < esw_attr->out_count; i++) + /* Indirect table is supported only for flows with in_port uplink + * and the destination is vport on the same eswitch as the uplink, + * return false in case at least one of destinations doesn't meet + * this criteria. + */ + for (i = esw_attr->split_count; i < esw_attr->out_count; i++) { if (esw_attr->dests[i].rep && mlx5_esw_indir_table_needed(esw, attr, esw_attr->dests[i].rep->vport, - esw_attr->dests[i].mdev)) - return true; - return false; + esw_attr->dests[i].mdev)) { + result = true; + } else { + result = false; + break; + } + } + return result; } static int From 76091b0fb60970f610b7ba2d886cd7fb95c5eb2e Mon Sep 17 00:00:00 2001 From: Amir Tzin Date: Wed, 20 Oct 2021 12:45:05 +0300 Subject: [PATCH 201/231] net/mlx5: Fix use after free in mlx5_health_wait_pci_up The device health recovery flow calls mlx5_health_wait_pci_up() which queries the device for FW_RESET timeout after freeing the device timeouts structure on mlx5_function_teardown(). Fix this bug by moving timeouts structure init/cleanup to the device's init/uninit phases. 
Since it is necessary to reset default software timeouts on function reload, extract setting of default values from mlx5_tout_init() and call mlx5_tout_set_def_val() directly from mlx5_function_setup(). Fixes: 5945e1adeab5 ("net/mlx5: Read timeout values from init segment") Reported-by: Niklas Schnelle Signed-off-by: Amir Tzin Signed-off-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/lib/tout.c | 5 ++--- .../ethernet/mellanox/mlx5/core/lib/tout.h | 1 + .../net/ethernet/mellanox/mlx5/core/main.c | 22 ++++++++++--------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c index 0dd96a6b140d..c1df0d3595d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.c @@ -31,11 +31,11 @@ static void tout_set(struct mlx5_core_dev *dev, u64 val, enum mlx5_timeouts_type dev->timeouts->to[type] = val; } -static void tout_set_def_val(struct mlx5_core_dev *dev) +void mlx5_tout_set_def_val(struct mlx5_core_dev *dev) { int i; - for (i = MLX5_TO_FW_PRE_INIT_TIMEOUT_MS; i < MAX_TIMEOUT_TYPES; i++) + for (i = 0; i < MAX_TIMEOUT_TYPES; i++) tout_set(dev, tout_def_sw_val[i], i); } @@ -45,7 +45,6 @@ int mlx5_tout_init(struct mlx5_core_dev *dev) if (!dev->timeouts) return -ENOMEM; - tout_set_def_val(dev); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h index 31faa5c17aa9..1c42ead782fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/tout.h @@ -34,6 +34,7 @@ int mlx5_tout_init(struct mlx5_core_dev *dev); void mlx5_tout_cleanup(struct mlx5_core_dev *dev); void mlx5_tout_query_iseg(struct mlx5_core_dev *dev); int mlx5_tout_query_dtor(struct mlx5_core_dev *dev); +void mlx5_tout_set_def_val(struct mlx5_core_dev *dev); u64
_mlx5_tout_ms(struct mlx5_core_dev *dev, enum mlx5_timeouts_types type); #define mlx5_tout_ms(dev, type) _mlx5_tout_ms(dev, MLX5_TO_##type##_MS) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index a92a92a52346..e127c0530b3a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -992,11 +992,7 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) if (mlx5_core_is_pf(dev)) pcie_print_link_status(dev->pdev); - err = mlx5_tout_init(dev); - if (err) { - mlx5_core_err(dev, "Failed initializing timeouts, aborting\n"); - return err; - } + mlx5_tout_set_def_val(dev); /* wait for firmware to accept initialization segments configurations */ @@ -1005,13 +1001,13 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) if (err) { mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n", mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT)); - goto err_tout_cleanup; + return err; } err = mlx5_cmd_init(dev); if (err) { mlx5_core_err(dev, "Failed initializing command interface, aborting\n"); - goto err_tout_cleanup; + return err; } mlx5_tout_query_iseg(dev); @@ -1094,8 +1090,6 @@ err_disable_hca: err_cmd_cleanup: mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); -err_tout_cleanup: - mlx5_tout_cleanup(dev); return err; } @@ -1114,7 +1108,6 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot) mlx5_core_disable_hca(dev, 0); mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN); mlx5_cmd_cleanup(dev); - mlx5_tout_cleanup(dev); return 0; } @@ -1476,6 +1469,12 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mlx5_debugfs_root); INIT_LIST_HEAD(&priv->traps); + err = mlx5_tout_init(dev); + if (err) { + mlx5_core_err(dev, "Failed initializing timeouts, aborting\n"); + goto err_timeout_init; + } + err = mlx5_health_init(dev); if (err) goto err_health_init; @@ -1501,6 +1500,8 
@@ err_adev_init: err_pagealloc_init: mlx5_health_cleanup(dev); err_health_init: + mlx5_tout_cleanup(dev); +err_timeout_init: debugfs_remove(dev->priv.dbg_root); mutex_destroy(&priv->pgdir_mutex); mutex_destroy(&priv->alloc_mutex); @@ -1518,6 +1519,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) mlx5_adev_cleanup(dev); mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); + mlx5_tout_cleanup(dev); debugfs_remove_recursive(dev->priv.dbg_root); mutex_destroy(&priv->pgdir_mutex); mutex_destroy(&priv->alloc_mutex); From 924cc4633f048b4fb4af3d1f9a51d10867625339 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sat, 6 Nov 2021 20:19:09 +0200 Subject: [PATCH 202/231] net/mlx5: Fix too early queueing of log timestamp work The log timestamp work should not be queued before the command interface is initialized, move it to a later stage in the init flow. Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 64f1abc4dc36..380f50d5462d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -835,6 +835,9 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); add_timer(&health->timer); + + if (mlx5_core_is_pf(dev)) + queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); } void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) @@ -902,8 +905,6 @@ int mlx5_health_init(struct mlx5_core_dev *dev) INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work); INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); INIT_DELAYED_WORK(&health->update_fw_log_ts_work, 
mlx5_health_log_ts_update); - if (mlx5_core_is_pf(dev)) - queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); return 0; From 502e82b91361955c66c8453b5b7a905b0b5bd5a1 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Sun, 7 Nov 2021 17:21:45 +0200 Subject: [PATCH 203/231] net/mlx5: Fix access to a non-supported register Validate MRTC register is supported before triggering a delayed work which accesses it. Fixes: 5a1023deeed0 ("net/mlx5: Add periodic update of host time to firmware") Signed-off-by: Aya Levin Reviewed-by: Gal Pressman Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 8 +++----- include/linux/mlx5/mlx5_ifc.h | 5 ++++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 380f50d5462d..3ca998874c50 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -836,7 +836,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms); add_timer(&health->timer); - if (mlx5_core_is_pf(dev)) + if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc)) queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e127c0530b3a..7df9c7f8d9c8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1071,18 +1071,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot) mlx5_set_driver_version(dev); - mlx5_start_health_poll(dev); - err = mlx5_query_hca_caps(dev); if (err) { mlx5_core_err(dev, "query hca failed\n"); - goto stop_health; + goto reclaim_boot_pages; } + mlx5_start_health_poll(dev); + return 0; 
-stop_health: - mlx5_stop_health_poll(dev, boot); reclaim_boot_pages: mlx5_reclaim_startup_pages(dev); err_disable_hca: diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3636df90899a..fbaab440a484 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -9698,7 +9698,10 @@ struct mlx5_ifc_mcam_access_reg_bits { u8 regs_84_to_68[0x11]; u8 tracer_registers[0x4]; - u8 regs_63_to_32[0x20]; + u8 regs_63_to_46[0x12]; + u8 mrtc[0x1]; + u8 regs_44_to_32[0xd]; + u8 regs_31_to_0[0x20]; }; From 8c8cf0382257b28378eeff535150c087a653ca19 Mon Sep 17 00:00:00 2001 From: Ben Ben-Ishay Date: Sun, 31 Oct 2021 18:31:02 +0200 Subject: [PATCH 204/231] net/mlx5e: SHAMPO, Fix constant expression result mlx5e_build_shampo_hd_umr uses counters i and index incorrectly as unsigned, thus the err state err_unmap could stuck in endless loop. Change i to int to solve the first issue. Reduce index check to solve the second issue, the caller function validates that index could not rotate. 
Fixes: 64509b052525 ("net/mlx5e: Add data path for SHAMPO feature") Signed-off-by: Ben Ben-Ishay Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 96967b0a2441..793511d5ee4c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -543,13 +543,13 @@ static int mlx5e_build_shampo_hd_umr(struct mlx5e_rq *rq, u16 klm_entries, u16 index) { struct mlx5e_shampo_hd *shampo = rq->mpwqe.shampo; - u16 entries, pi, i, header_offset, err, wqe_bbs, new_entries; + u16 entries, pi, header_offset, err, wqe_bbs, new_entries; u32 lkey = rq->mdev->mlx5e_res.hw_objs.mkey; struct page *page = shampo->last_page; u64 addr = shampo->last_addr; struct mlx5e_dma_info *dma_info; struct mlx5e_umr_wqe *umr_wqe; - int headroom; + int headroom, i; headroom = rq->buff.headroom; new_entries = klm_entries - (shampo->pi & (MLX5_UMR_KLM_ALIGNMENT - 1)); @@ -601,9 +601,7 @@ update_klm: err_unmap: while (--i >= 0) { - if (--index < 0) - index = shampo->hd_per_wq - 1; - dma_info = &shampo->info[index]; + dma_info = &shampo->info[--index]; if (!(i & (MLX5E_SHAMPO_WQ_HEADER_PER_PAGE - 1))) { dma_info->addr = ALIGN_DOWN(dma_info->addr, PAGE_SIZE); mlx5e_page_release(rq, dma_info, true); From 21635d9203e1cf2b73b67e9a86059a62f62a3563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:46 +0100 Subject: [PATCH 205/231] net: dsa: mv88e6xxx: Fix application of erratum 4.8 for 88E6393X MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to SERDES scripts for 88E6393X, erratum 4.8 has to be applied every time before SerDes is powered on. Split the code for erratum 4.8 into separate function and call it in mv88e6393x_serdes_power(). 
Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 53 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 6ea003678798..0658ee3b014c 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -1271,9 +1271,9 @@ void mv88e6390_serdes_get_regs(struct mv88e6xxx_chip *chip, int port, void *_p) } } -static int mv88e6393x_serdes_port_errata(struct mv88e6xxx_chip *chip, int lane) +static int mv88e6393x_serdes_erratum_4_6(struct mv88e6xxx_chip *chip, int lane) { - u16 reg, pcs; + u16 reg; int err; /* mv88e6393x family errata 4.6: @@ -1300,11 +1300,32 @@ static int mv88e6393x_serdes_port_errata(struct mv88e6xxx_chip *chip, int lane) if (err) return err; - err = mv88e6390_serdes_power_sgmii(chip, lane, false); - if (err) - return err; + return mv88e6390_serdes_power_sgmii(chip, lane, false); } + return 0; +} + +int mv88e6393x_serdes_setup_errata(struct mv88e6xxx_chip *chip) +{ + int err; + + err = mv88e6393x_serdes_erratum_4_6(chip, MV88E6393X_PORT0_LANE); + if (err) + return err; + + err = mv88e6393x_serdes_erratum_4_6(chip, MV88E6393X_PORT9_LANE); + if (err) + return err; + + return mv88e6393x_serdes_erratum_4_6(chip, MV88E6393X_PORT10_LANE); +} + +static int mv88e6393x_serdes_erratum_4_8(struct mv88e6xxx_chip *chip, int lane) +{ + u16 reg, pcs; + int err; + /* mv88e6393x family errata 4.8: * When a SERDES port is operating in 1000BASE-X or SGMII mode link may * not come up after hardware reset or software reset of SERDES core. 
@@ -1334,29 +1355,21 @@ static int mv88e6393x_serdes_port_errata(struct mv88e6xxx_chip *chip, int lane) MV88E6393X_ERRATA_4_8_REG, reg); } -int mv88e6393x_serdes_setup_errata(struct mv88e6xxx_chip *chip) -{ - int err; - - err = mv88e6393x_serdes_port_errata(chip, MV88E6393X_PORT0_LANE); - if (err) - return err; - - err = mv88e6393x_serdes_port_errata(chip, MV88E6393X_PORT9_LANE); - if (err) - return err; - - return mv88e6393x_serdes_port_errata(chip, MV88E6393X_PORT10_LANE); -} - int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, bool on) { u8 cmode = chip->ports[port].cmode; + int err; if (port != 0 && port != 9 && port != 10) return -EOPNOTSUPP; + if (on) { + err = mv88e6393x_serdes_erratum_4_8(chip, lane); + if (err) + return err; + } + switch (cmode) { case MV88E6XXX_PORT_STS_CMODE_SGMII: case MV88E6XXX_PORT_STS_CMODE_1000BASEX: From 8c3318b4874e2dee867f5ae8f6d38f78e044bf71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:47 +0100 Subject: [PATCH 206/231] net: dsa: mv88e6xxx: Drop unnecessary check in mv88e6393x_serdes_erratum_4_6() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check for lane is unnecessary, since the function is called only with allowed lane argument. Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 0658ee3b014c..3a6244596a67 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -1284,26 +1284,20 @@ static int mv88e6393x_serdes_erratum_4_6(struct mv88e6xxx_chip *chip, int lane) * It seems that after this workaround the SERDES is automatically * powered up (the bit is cleared), so power it down. 
*/ - if (lane == MV88E6393X_PORT0_LANE || lane == MV88E6393X_PORT9_LANE || - lane == MV88E6393X_PORT10_LANE) { - err = mv88e6390_serdes_read(chip, lane, - MDIO_MMD_PHYXS, - MV88E6393X_SERDES_POC, &reg); - if (err) - return err; + err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_POC, &reg); + if (err) + return err; - reg &= ~MV88E6393X_SERDES_POC_PDOWN; - reg |= MV88E6393X_SERDES_POC_RESET; + reg &= ~MV88E6393X_SERDES_POC_PDOWN; + reg |= MV88E6393X_SERDES_POC_RESET; - err = mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS, - MV88E6393X_SERDES_POC, reg); - if (err) - return err; + err = mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_POC, reg); + if (err) + return err; - return mv88e6390_serdes_power_sgmii(chip, lane, false); - } - - return 0; + return mv88e6390_serdes_power_sgmii(chip, lane, false); } int mv88e6393x_serdes_setup_errata(struct mv88e6xxx_chip *chip) From 7527d66260ac0c603c6baca5146748061fcddbd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:48 +0100 Subject: [PATCH 207/231] net: dsa: mv88e6xxx: Save power by disabling SerDes transmitter and receiver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Save power on 88E6393X by disabling SerDes receiver and transmitter after SerDes is disabled. Signed-off-by: Marek Behún Cc: stable@vger.kernel.org # de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: David S.
Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 46 +++++++++++++++++++++++++++--- drivers/net/dsa/mv88e6xxx/serdes.h | 3 ++ 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 3a6244596a67..ceb63d7f1f97 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -1271,6 +1271,28 @@ void mv88e6390_serdes_get_regs(struct mv88e6xxx_chip *chip, int port, void *_p) } } +static int mv88e6393x_serdes_power_lane(struct mv88e6xxx_chip *chip, int lane, + bool on) +{ + u16 reg; + int err; + + err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_CTRL1, &reg); + if (err) + return err; + + if (on) + reg &= ~(MV88E6393X_SERDES_CTRL1_TX_PDOWN | + MV88E6393X_SERDES_CTRL1_RX_PDOWN); + else + reg |= MV88E6393X_SERDES_CTRL1_TX_PDOWN | + MV88E6393X_SERDES_CTRL1_RX_PDOWN; + + return mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_CTRL1, reg); +} + static int mv88e6393x_serdes_erratum_4_6(struct mv88e6xxx_chip *chip, int lane) { u16 reg; @@ -1297,7 +1319,11 @@ static int mv88e6393x_serdes_erratum_4_6(struct mv88e6xxx_chip *chip, int lane) if (err) return err; - return mv88e6390_serdes_power_sgmii(chip, lane, false); + err = mv88e6390_serdes_power_sgmii(chip, lane, false); + if (err) + return err; + + return mv88e6393x_serdes_power_lane(chip, lane, false); } int mv88e6393x_serdes_setup_errata(struct mv88e6xxx_chip *chip) @@ -1362,17 +1388,29 @@ int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, err = mv88e6393x_serdes_erratum_4_8(chip, lane); if (err) return err; + + err = mv88e6393x_serdes_power_lane(chip, lane, true); + if (err) + return err; } switch (cmode) { case MV88E6XXX_PORT_STS_CMODE_SGMII: case MV88E6XXX_PORT_STS_CMODE_1000BASEX: case MV88E6XXX_PORT_STS_CMODE_2500BASEX: - return mv88e6390_serdes_power_sgmii(chip, lane, on); + err = mv88e6390_serdes_power_sgmii(chip, lane, on); + break; case
MV88E6393X_PORT_STS_CMODE_5GBASER: case MV88E6393X_PORT_STS_CMODE_10GBASER: - return mv88e6390_serdes_power_10g(chip, lane, on); + err = mv88e6390_serdes_power_10g(chip, lane, on); + break; } - return 0; + if (err) + return err; + + if (!on) + err = mv88e6393x_serdes_power_lane(chip, lane, false); + + return err; } diff --git a/drivers/net/dsa/mv88e6xxx/serdes.h b/drivers/net/dsa/mv88e6xxx/serdes.h index cbb3ba30caea..e9292c8beee4 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.h +++ b/drivers/net/dsa/mv88e6xxx/serdes.h @@ -93,6 +93,9 @@ #define MV88E6393X_SERDES_POC_PCS_MASK 0x0007 #define MV88E6393X_SERDES_POC_RESET BIT(15) #define MV88E6393X_SERDES_POC_PDOWN BIT(5) +#define MV88E6393X_SERDES_CTRL1 0xf003 +#define MV88E6393X_SERDES_CTRL1_TX_PDOWN BIT(9) +#define MV88E6393X_SERDES_CTRL1_RX_PDOWN BIT(8) #define MV88E6393X_ERRATA_4_8_REG 0xF074 #define MV88E6393X_ERRATA_4_8_BIT BIT(14) From 93fd8207bed80ce19aaf59932cbe1c03d418a37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:49 +0100 Subject: [PATCH 208/231] net: dsa: mv88e6xxx: Add fix for erratum 5.2 of 88E6393X family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add fix for erratum 5.2 of the 88E6393X (Amethyst) family: for 10gbase-r mode, some undocumented registers need to be written some special values. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Marek Behún Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 48 ++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index ceb63d7f1f97..9e4f18a4adc2 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -1375,6 +1375,50 @@ static int mv88e6393x_serdes_erratum_4_8(struct mv88e6xxx_chip *chip, int lane) MV88E6393X_ERRATA_4_8_REG, reg); } +static int mv88e6393x_serdes_erratum_5_2(struct mv88e6xxx_chip *chip, int lane, + u8 cmode) +{ + static const struct { + u16 dev, reg, val, mask; + } fixes[] = { + { MDIO_MMD_VEND1, 0x8093, 0xcb5a, 0xffff }, + { MDIO_MMD_VEND1, 0x8171, 0x7088, 0xffff }, + { MDIO_MMD_VEND1, 0x80c9, 0x311a, 0xffff }, + { MDIO_MMD_VEND1, 0x80a2, 0x8000, 0xff7f }, + { MDIO_MMD_VEND1, 0x80a9, 0x0000, 0xfff0 }, + { MDIO_MMD_VEND1, 0x80a3, 0x0000, 0xf8ff }, + { MDIO_MMD_PHYXS, MV88E6393X_SERDES_POC, + MV88E6393X_SERDES_POC_RESET, MV88E6393X_SERDES_POC_RESET }, + }; + int err, i; + u16 reg; + + /* mv88e6393x family errata 5.2: + * For optimal signal integrity the following sequence should be applied + * to SERDES operating in 10G mode. These registers only apply to 10G + * operation and have no effect on other speeds. 
+ */ + if (cmode != MV88E6393X_PORT_STS_CMODE_10GBASER) + return 0; + + for (i = 0; i < ARRAY_SIZE(fixes); ++i) { + err = mv88e6390_serdes_read(chip, lane, fixes[i].dev, + fixes[i].reg, &reg); + if (err) + return err; + + reg &= ~fixes[i].mask; + reg |= fixes[i].val; + + err = mv88e6390_serdes_write(chip, lane, fixes[i].dev, + fixes[i].reg, reg); + if (err) + return err; + } + + return 0; +} + int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, bool on) { @@ -1389,6 +1433,10 @@ int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, if (err) return err; + err = mv88e6393x_serdes_erratum_5_2(chip, lane, cmode); + if (err) + return err; + err = mv88e6393x_serdes_power_lane(chip, lane, true); if (err) return err; From 163000dbc772c1eae9bdfe7c8fe30155db1efd74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:50 +0100 Subject: [PATCH 209/231] net: dsa: mv88e6xxx: Fix inband AN for 2500base-x on 88E6393X family MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inband AN is broken on Amethyst in 2500base-x mode when set by standard mechanism (via cmode). (There probably is some weird setting done by default in the switch for this mode that make it cycle in some state or something, because when the peer is the mvneta controller, it receives link change interrupts every ~0.3ms, but the link is always down.) Get around this by configuring the PCS mode to 1000base-x (where inband AN works), and then changing the SerDes frequency while SerDes transmitter and receiver are disabled, before enabling SerDes PHY. After disabling SerDes PHY, change the PCS mode back to 2500base-x, to avoid confusing the device (if we leave it at 1000base-x PCS mode but with different frequency, and then change cmode to sgmii, the device won't change the frequency because it thinks it already has the correct one). The register which changes the frequency is undocumented.
I discovered it by going through all registers in the ranges 4.f000-4.f100 and 1e.8000-1e.8200 for all SerDes cmodes (sgmii, 1000base-x, 2500base-x, 5gbase-r, 10gbase-r, usxgmii) and filtering out registers that didn't make sense (the value was the same for modes which have different frequency). The result of this was: reg sgmii 1000base-x 2500base-x 5gbase-r 10gbase-r usxgmii 04.f002 005b 0058 0059 005c 005d 005f 04.f076 3000 0000 1000 4000 5000 7000 04.f07c 0950 0950 1850 0550 0150 0150 1e.8000 0059 0059 0058 0055 0051 0051 1e.8140 0e20 0e20 0e28 0e21 0e42 0e42 Register 04.f002 is the documented Port Operational Configuration register, its last 3 bits select PCS type, so changing this register also changes the frequency to the appropriate value. Registers 04.f076 and 04.f07c are not writable. Undocumented register 1e.8000 was the one: changing bits 3:0 from 9 to 8 changed SerDes frequency to 3.125 GHz, while leaving the value of PCS mode in register 04.f002.2:0 at 1000base-x. Inband autonegotiation started working correctly. (I didn't try anything with register 1e.8140 since 1e.8000 solved the problem.) Since I don't have documentation for this register 1e.8000.3:0, I am using the constants without names, but my hypothesis is that this register selects PHY frequency. If in the future I have access to an oscilloscope able to handle these frequencies, I will try to test this hypothesis. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Marek Behún Signed-off-by: David S.
Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 61 +++++++++++++++++++++++++++++- drivers/net/dsa/mv88e6xxx/serdes.h | 1 + 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 9e4f18a4adc2..6f60376b932c 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -1419,6 +1419,54 @@ static int mv88e6393x_serdes_erratum_5_2(struct mv88e6xxx_chip *chip, int lane, return 0; } +static int mv88e6393x_serdes_fix_2500basex_an(struct mv88e6xxx_chip *chip, + int lane, u8 cmode, bool on) +{ + u16 reg; + int err; + + if (cmode != MV88E6XXX_PORT_STS_CMODE_2500BASEX) + return 0; + + /* Inband AN is broken on Amethyst in 2500base-x mode when set by + * standard mechanism (via cmode). + * We can get around this by configuring the PCS mode to 1000base-x + * and then writing value 0x58 to register 1e.8000. (This must be done + * while SerDes receiver and transmitter are disabled, which is, when + * this function is called.) + * It seems that when we do this configuration to 2500base-x mode (by + * changing PCS mode to 1000base-x and frequency to 3.125 GHz from + * 1.25 GHz) and then configure to sgmii or 1000base-x, the device + * thinks that it already has SerDes at 1.25 GHz and does not change + * the 1e.8000 register, leaving SerDes at 3.125 GHz. + * To avoid this, change PCS mode back to 2500base-x when disabling + * SerDes from 2500base-x mode.
+ */ + err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_POC, &reg); + if (err) + return err; + + reg &= ~(MV88E6393X_SERDES_POC_PCS_MASK | MV88E6393X_SERDES_POC_AN); + if (on) + reg |= MV88E6393X_SERDES_POC_PCS_1000BASEX | + MV88E6393X_SERDES_POC_AN; + else + reg |= MV88E6393X_SERDES_POC_PCS_2500BASEX; + reg |= MV88E6393X_SERDES_POC_RESET; + + err = mv88e6390_serdes_write(chip, lane, MDIO_MMD_PHYXS, + MV88E6393X_SERDES_POC, reg); + if (err) + return err; + + err = mv88e6390_serdes_write(chip, lane, MDIO_MMD_VEND1, 0x8000, 0x58); + if (err) + return err; + + return 0; +} + int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, bool on) { @@ -1437,6 +1485,11 @@ int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, if (err) return err; + err = mv88e6393x_serdes_fix_2500basex_an(chip, lane, cmode, + true); + if (err) + return err; + err = mv88e6393x_serdes_power_lane(chip, lane, true); if (err) return err; @@ -1457,8 +1510,14 @@ int mv88e6393x_serdes_power(struct mv88e6xxx_chip *chip, int port, int lane, if (err) return err; - if (!on) + if (!on) { err = mv88e6393x_serdes_power_lane(chip, lane, false); + if (err) + return err; + + err = mv88e6393x_serdes_fix_2500basex_an(chip, lane, cmode, + false); + } return err; } diff --git a/drivers/net/dsa/mv88e6xxx/serdes.h b/drivers/net/dsa/mv88e6xxx/serdes.h index e9292c8beee4..8dd8ed225b45 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.h +++ b/drivers/net/dsa/mv88e6xxx/serdes.h @@ -93,6 +93,7 @@ #define MV88E6393X_SERDES_POC_PCS_MASK 0x0007 #define MV88E6393X_SERDES_POC_RESET BIT(15) #define MV88E6393X_SERDES_POC_PDOWN BIT(5) +#define MV88E6393X_SERDES_POC_AN BIT(3) #define MV88E6393X_SERDES_CTRL1 0xf003 #define MV88E6393X_SERDES_CTRL1_TX_PDOWN BIT(9) #define MV88E6393X_SERDES_CTRL1_RX_PDOWN BIT(8) From ede359d8843a2779d232ed30bc36089d4b5962e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Beh=C3=BAn?= Date: Tue, 30 Nov 2021 18:01:51 +0100 Subject: [PATCH
210/231] net: dsa: mv88e6xxx: Link in pcs_get_state() if AN is bypassed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Function mv88e6xxx_serdes_pcs_get_state() currently does not report link up if AN is enabled, Link bit is set, but Speed and Duplex Resolved bit is not set, which testing shows is the case for when auto-negotiation was bypassed (we have AN enabled but link partner does not). An example of such link partner is Marvell 88X3310 PHY, when put into the mode where host interface changes between 10gbase-r, 5gbase-r, 2500base-x and sgmii according to copper speed. The 88X3310 does not enable AN in 2500base-x, and so SerDes on mv88e6xxx currently does not link with it. Fix this. Fixes: a5a6858b793f ("net: dsa: mv88e6xxx: extend phylink to Serdes PHYs") Signed-off-by: Marek Behún Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/serdes.c | 48 ++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/serdes.c b/drivers/net/dsa/mv88e6xxx/serdes.c index 6f60376b932c..55273013bfb5 100644 --- a/drivers/net/dsa/mv88e6xxx/serdes.c +++ b/drivers/net/dsa/mv88e6xxx/serdes.c @@ -50,11 +50,22 @@ static int mv88e6390_serdes_write(struct mv88e6xxx_chip *chip, } static int mv88e6xxx_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, - u16 status, u16 lpa, + u16 ctrl, u16 status, u16 lpa, struct phylink_link_state *state) { + state->link = !!(status & MV88E6390_SGMII_PHY_STATUS_LINK); + if (status & MV88E6390_SGMII_PHY_STATUS_SPD_DPL_VALID) { - state->link = !!(status & MV88E6390_SGMII_PHY_STATUS_LINK); + /* The Speed and Duplex Resolved register is 1 if AN is enabled + * and complete, or if AN is disabled. So with disabled AN we + * still get here on link up. But we want to set an_complete + * only if AN was enabled, thus we look at BMCR_ANENABLE.
+ * (According to 802.3-2008 section 22.2.4.2.10, we should be + * able to get this same value from BMSR_ANEGCAPABLE, but tests + * show that these Marvell PHYs don't conform to this part of + * the specification - BMSR_ANEGCAPABLE is simply always 1.) + */ + state->an_complete = !!(ctrl & BMCR_ANENABLE); state->duplex = status & MV88E6390_SGMII_PHY_STATUS_DUPLEX_FULL ? DUPLEX_FULL : DUPLEX_HALF; @@ -81,6 +92,18 @@ static int mv88e6xxx_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, dev_err(chip->dev, "invalid PHY speed\n"); return -EINVAL; } + } else if (state->link && + state->interface != PHY_INTERFACE_MODE_SGMII) { + /* If Speed and Duplex Resolved register is 0 and link is up, it + * means that AN was enabled, but link partner had it disabled + * and the PHY invoked the Auto-Negotiation Bypass feature and + * linked anyway. + */ + state->duplex = DUPLEX_FULL; + if (state->interface == PHY_INTERFACE_MODE_2500BASEX) + state->speed = SPEED_2500; + else + state->speed = SPEED_1000; } else { state->link = false; } @@ -168,9 +191,15 @@ int mv88e6352_serdes_pcs_config(struct mv88e6xxx_chip *chip, int port, int mv88e6352_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, int port, int lane, struct phylink_link_state *state) { - u16 lpa, status; + u16 lpa, status, ctrl; int err; + err = mv88e6352_serdes_read(chip, MII_BMCR, &ctrl); + if (err) { + dev_err(chip->dev, "can't read Serdes PHY control: %d\n", err); + return err; + } + err = mv88e6352_serdes_read(chip, 0x11, &status); if (err) { dev_err(chip->dev, "can't read Serdes PHY status: %d\n", err); @@ -183,7 +212,7 @@ int mv88e6352_serdes_pcs_get_state(struct mv88e6xxx_chip *chip, int port, return err; } - return mv88e6xxx_serdes_pcs_get_state(chip, status, lpa, state); + return mv88e6xxx_serdes_pcs_get_state(chip, ctrl, status, lpa, state); } int mv88e6352_serdes_pcs_an_restart(struct mv88e6xxx_chip *chip, int port, @@ -883,9 +912,16 @@ int mv88e6390_serdes_pcs_config(struct mv88e6xxx_chip *chip, int port, static
int mv88e6390_serdes_pcs_get_state_sgmii(struct mv88e6xxx_chip *chip, int port, int lane, struct phylink_link_state *state) { - u16 lpa, status; + u16 lpa, status, ctrl; int err; + err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, + MV88E6390_SGMII_BMCR, &ctrl); + if (err) { + dev_err(chip->dev, "can't read Serdes PHY control: %d\n", err); + return err; + } + err = mv88e6390_serdes_read(chip, lane, MDIO_MMD_PHYXS, MV88E6390_SGMII_PHY_STATUS, &status); if (err) { @@ -900,7 +936,7 @@ static int mv88e6390_serdes_pcs_get_state_sgmii(struct mv88e6xxx_chip *chip, return err; } - return mv88e6xxx_serdes_pcs_get_state(chip, status, lpa, state); + return mv88e6xxx_serdes_pcs_get_state(chip, ctrl, status, lpa, state); } static int mv88e6390_serdes_pcs_get_state_10g(struct mv88e6xxx_chip *chip, From 450fec13d9170127678f991698ac1a5b05c02e2f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 30 Nov 2021 12:31:23 -0500 Subject: [PATCH 211/231] tracing/histograms: String compares should not care about signed values When comparing two strings for the "onmatch" histogram trigger, fields that are strings use string comparisons, which do not care about being signed or not. Do not fail to match two string fields if one is unsigned char array and the other is a signed char array. 
Link: https://lore.kernel.org/all/20211129123043.5cfd687a@gandalf.local.home/ Cc: stable@vger.kernel.org Cc: Tom Zanussi Cc: Yafang Shao Fixes: b05e89ae7cf3b ("tracing: Accept different type for synthetic event fields") Reviewed-by: Masami Hiramatsu Reported-by: Sven Schnelle Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 9555b8e1d1e3..319f9c8ca7e7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3757,7 +3757,7 @@ static int check_synth_field(struct synth_event *event, if (strcmp(field->type, hist_field->type) != 0) { if (field->size != hist_field->size || - field->is_signed != hist_field->is_signed) + (!field->is_string && field->is_signed != hist_field->is_signed)) return -EINVAL; } From f25667e5980a4333729cac3101e5de1bb851f71a Mon Sep 17 00:00:00 2001 From: Chen Jun Date: Wed, 24 Nov 2021 14:08:01 +0000 Subject: [PATCH 212/231] tracing: Fix a kmemleak false positive in tracing_map Doing the command: echo 'hist:key=common_pid.execname,common_timestamp' > /sys/kernel/debug/tracing/events/xxx/trigger Triggers many kmemleak reports: unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 unreferenced object 0xffff0000c7ea4980 (size 128): comm "bash", pid 338, jiffies 4294912626 (age 9339.324s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000f3469921>] kmem_cache_alloc_trace+0x4c0/0x6f0 [<0000000054ca40c3>] hist_trigger_elt_data_alloc+0x140/0x178 [<00000000633bd154>] tracing_map_init+0x1f8/0x268 [<000000007e814ab9>] event_hist_trigger_func+0xca0/0x1ad0 [<00000000bf8520ed>] trigger_process_regex+0xd4/0x128 [<00000000f549355a>] event_trigger_write+0x7c/0x120 [<00000000b80f898d>] vfs_write+0xc4/0x380 [<00000000823e1055>] ksys_write+0x74/0xf8 [<000000008a9374aa>] __arm64_sys_write+0x24/0x30 [<0000000087124017>] do_el0_svc+0x88/0x1c0 [<00000000efd0dcd1>] el0_svc+0x1c/0x28 [<00000000dbfba9b3>] el0_sync_handler+0x88/0xc0 [<00000000e7399680>] el0_sync+0x148/0x180 The reason is elts->pages[i] is alloced by get_zeroed_page. and kmemleak will not scan the area alloced by get_zeroed_page. The address stored in elts->pages will be regarded as leaked. That is, the elts->pages[i] will have pointers loaded onto it as well, and without telling kmemleak about it, those pointers will look like memory without a reference. 
To fix this, call kmemleak_alloc to tell kmemleak to scan elts->pages[i] Link: https://lkml.kernel.org/r/20211124140801.87121-1-chenjun102@huawei.com Signed-off-by: Chen Jun Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 39bb56d2dcbe..9628b5571846 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -15,6 +15,7 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/sort.h> +#include <linux/kmemleak.h> #include "tracing_map.h" #include "trace.h" @@ -307,6 +308,7 @@ static void tracing_map_array_free(struct tracing_map_array *a) for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; + kmemleak_free(a->pages[i]); free_page((unsigned long)a->pages[i]); } @@ -342,6 +344,7 @@ static struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!a->pages[i]) goto free; + kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL); } out: return a; From 6bbfa44116689469267f1a6e3d233b52114139d2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 1 Dec 2021 23:45:50 +0900 Subject: [PATCH 213/231] kprobes: Limit max data_size of the kretprobe instances The 'kprobe::data_size' is unsigned, thus it can not be negative. But if user sets it enough big number (e.g. (size_t)-8), the result of 'data_size + sizeof(struct kretprobe_instance)' becomes smaller than sizeof(struct kretprobe_instance) or zero. In result, the kretprobe_instance are allocated without enough memory, and kretprobe accesses outside of allocated memory. To avoid this issue, introduce a max limitation of the kretprobe::data_size. 4KB per instance should be OK.
Link: https://lkml.kernel.org/r/163836995040.432120.10322772773821182925.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: f47cd9b553aa ("kprobes: kretprobe user entry-handler") Reported-by: zhangyue Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index e974caf39d3e..8c8f7a4d93af 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -153,6 +153,8 @@ struct kretprobe { struct kretprobe_holder *rph; }; +#define KRETPROBE_MAX_DATA_SIZE 4096 + struct kretprobe_instance { union { struct freelist_node freelist; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e9db0c810554..21eccc961bba 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2086,6 +2086,9 @@ int register_kretprobe(struct kretprobe *rp) } } + if (rp->data_size > KRETPROBE_MAX_DATA_SIZE) + return -E2BIG; + rp->kp.pre_handler = pre_handler_kretprobe; rp->kp.post_handler = NULL; From e2dabc4f7e7b60299c20a36d6a7b24ed9bf8e572 Mon Sep 17 00:00:00 2001 From: Zhou Qingyang Date: Tue, 30 Nov 2021 19:08:48 +0800 Subject: [PATCH 214/231] net: qlogic: qlcnic: Fix a NULL pointer dereference in qlcnic_83xx_add_rings() In qlcnic_83xx_add_rings(), the indirect function of ahw->hw_ops->alloc_mbx_args will be called to allocate memory for cmd.req.arg, and there is a dereference of it in qlcnic_83xx_add_rings(), which could lead to a NULL pointer dereference on failure of the indirect function like qlcnic_83xx_alloc_mbx_args(). Fix this bug by adding a check of alloc_mbx_args(), this patch imitates the logic of mbx_cmd()'s failure handling. This bug was found by a static analyzer. 
The analysis employs differential checking to identify inconsistent security operations (e.g., checks or kfrees) between two code paths and confirms that the inconsistent operations are not recovered in the current function or the callers, so they constitute bugs. Note that, as a bug found by static analysis, it can be a false positive or hard to trigger. Multiple researchers have cross-reviewed the bug. Builds with CONFIG_QLCNIC=m show no new warnings, and our static analyzer no longer warns about this code. Fixes: 7f9664525f9c ("qlcnic: 83xx memory map and HW access routine") Signed-off-by: Zhou Qingyang Link: https://lore.kernel.org/r/20211130110848.109026-1-zhou1615@umn.edu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c index d51bac7ba5af..bd0607680329 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c @@ -1077,8 +1077,14 @@ static int qlcnic_83xx_add_rings(struct qlcnic_adapter *adapter) sds_mbx_size = sizeof(struct qlcnic_sds_mbx); context_id = recv_ctx->context_id; num_sds = adapter->drv_sds_rings - QLCNIC_MAX_SDS_RINGS; - ahw->hw_ops->alloc_mbx_args(&cmd, adapter, - QLCNIC_CMD_ADD_RCV_RINGS); + err = ahw->hw_ops->alloc_mbx_args(&cmd, adapter, + QLCNIC_CMD_ADD_RCV_RINGS); + if (err) { + dev_err(&adapter->pdev->dev, + "Failed to alloc mbx args %d\n", err); + return err; + } + cmd.req.arg[1] = 0 | (num_sds << 8) | (context_id << 16); /* set up status rings, mbx 2-81 */ From ee201011c1e1563c114a55c86eb164b236f18e84 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Tue, 30 Nov 2021 11:26:37 -0500 Subject: [PATCH 215/231] vrf: Reset IPCB/IP6CB when processing outbound pkts in vrf dev xmit IPCB/IP6CB need to be initialized when processing outbound v4 or v6 pkts in the 
codepath of vrf device xmit function so that leftover garbage doesn't cause further code that uses the CB to incorrectly process the pkt. One occasion of the issue might occur when MPLS route uses the vrf device as the outgoing device such as when the route is added using "ip -f mpls route add