linux

iv/linux

Author	SHA1	Message	Date
Kees Cook	42bc47b353	treewide: Use array_size() in vmalloc() The vmalloc() function has no 2-factor argument form, so multiplication factors need to be wrapped in array_size(). This patch replaces cases of: vmalloc(a * b) with: vmalloc(array_size(a, b)) as well as handling cases of: vmalloc(a * b * c) with: vmalloc(array3_size(a, b, c)) This does, however, attempt to ignore constant size factors like: vmalloc(4 * 1024) though any constants defined via macros get caught up in the conversion. Any factors with a sizeof() of "unsigned char", "char", and "u8" were dropped, since they're redundant. The Coccinelle script used for this was: // Fix redundant parens around sizeof(). @@ type TYPE; expression THING, E; @@ ( vmalloc( - (sizeof(TYPE)) * E + sizeof(TYPE) * E , ...) \| vmalloc( - (sizeof(THING)) * E + sizeof(THING) * E , ...) ) // Drop single-byte sizes and redundant parens. @@ expression COUNT; typedef u8; typedef __u8; @@ ( vmalloc( - sizeof(u8) * (COUNT) + COUNT , ...) \| vmalloc( - sizeof(__u8) * (COUNT) + COUNT , ...) \| vmalloc( - sizeof(char) * (COUNT) + COUNT , ...) \| vmalloc( - sizeof(unsigned char) * (COUNT) + COUNT , ...) \| vmalloc( - sizeof(u8) * COUNT + COUNT , ...) \| vmalloc( - sizeof(__u8) * COUNT + COUNT , ...) \| vmalloc( - sizeof(char) * COUNT + COUNT , ...) \| vmalloc( - sizeof(unsigned char) * COUNT + COUNT , ...) ) // 2-factor product with sizeof(type/expression) and identifier or constant. @@ type TYPE; expression THING; identifier COUNT_ID; constant COUNT_CONST; @@ ( vmalloc( - sizeof(TYPE) * (COUNT_ID) + array_size(COUNT_ID, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * COUNT_ID + array_size(COUNT_ID, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * COUNT_CONST + array_size(COUNT_CONST, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(THING) * (COUNT_ID) + array_size(COUNT_ID, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * COUNT_ID + array_size(COUNT_ID, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * (COUNT_CONST) + array_size(COUNT_CONST, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * COUNT_CONST + array_size(COUNT_CONST, sizeof(THING)) , ...) ) // 2-factor product, only identifiers. @@ identifier SIZE, COUNT; @@ vmalloc( - SIZE * COUNT + array_size(COUNT, SIZE) , ...) // 3-factor product with 1 sizeof(type) or sizeof(expression), with // redundant parens removed. @@ expression THING; identifier STRIDE, COUNT; type TYPE; @@ ( vmalloc( - sizeof(TYPE) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(TYPE) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(TYPE)) , ...) \| vmalloc( - sizeof(THING) * (COUNT) * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * (COUNT) * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * COUNT * (STRIDE) + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) \| vmalloc( - sizeof(THING) * COUNT * STRIDE + array3_size(COUNT, STRIDE, sizeof(THING)) , ...) ) // 3-factor product with 2 sizeof(variable), with redundant parens removed. @@ expression THING1, THING2; identifier COUNT; type TYPE1, TYPE2; @@ ( vmalloc( - sizeof(TYPE1) * sizeof(TYPE2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) \| vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2)) , ...) \| vmalloc( - sizeof(THING1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) \| vmalloc( - sizeof(THING1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(THING1), sizeof(THING2)) , ...) \| vmalloc( - sizeof(TYPE1) * sizeof(THING2) * COUNT + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) \| vmalloc( - sizeof(TYPE1) * sizeof(THING2) * (COUNT) + array3_size(COUNT, sizeof(TYPE1), sizeof(THING2)) , ...) ) // 3-factor product, only identifiers, with redundant parens removed. @@ identifier STRIDE, SIZE, COUNT; @@ ( vmalloc( - (COUNT) * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - COUNT * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - COUNT * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - (COUNT) * (STRIDE) * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - COUNT * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - (COUNT) * STRIDE * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - (COUNT) * (STRIDE) * (SIZE) + array3_size(COUNT, STRIDE, SIZE) , ...) \| vmalloc( - COUNT * STRIDE * SIZE + array3_size(COUNT, STRIDE, SIZE) , ...) ) // Any remaining multi-factor products, first at least 3-factor products // when they're not all constants... @@ expression E1, E2, E3; constant C1, C2, C3; @@ ( vmalloc(C1 * C2 * C3, ...) \| vmalloc( - E1 * E2 * E3 + array3_size(E1, E2, E3) , ...) ) // And then all remaining 2 factors products when they're not all constants. @@ expression E1, E2; constant C1, C2; @@ ( vmalloc(C1 * C2, ...) \| vmalloc( - E1 * E2 + array_size(E1, E2) , ...) ) Signed-off-by: Kees Cook <keescook@chromium.org>	2018-06-12 16:19:22 -07:00
Peter Zijlstra	f64c6013a2	rcu/x86: Provide early rcu_cpu_starting() callback The x86/mtrr code does horrific things because hardware. It uses stop_machine_from_inactive_cpu(), which does a wakeup (of the stopper thread on another CPU), which uses RCU, all before the CPU is onlined. RCU complains about this, because wakeups use RCU and RCU does (rightfully) not consider offline CPUs for grace-periods. Fix this by initializing RCU way early in the MTRR case. Tested-by: Mike Galbraith <efault@gmx.de> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> [ paulmck: Add !SMP support, per 0day Test Robot report. ]	2018-05-22 16:12:26 -07:00
Paul E. McKenney	22df7316ac	Merge branches 'exp.2018.05.15a', 'fixes.2018.05.15a', 'lock.2018.05.15a' and 'torture.2018.05.15a' into HEAD exp.2018.05.15a: Parallelize expedited grace-period initialization. fixes.2018.05.15a: Miscellaneous fixes. lock.2018.05.15a: Decrease lock contention on root rcu_node structure, which is a step towards merging RCU flavors. torture.2018.05.15a: Torture-test updates.	2018-05-15 10:33:05 -07:00
Paul E. McKenney	034777d7f5	rcutorture: Print end-of-test state This commit adds end-of-test state printout to help check whether RCU shut down nicely. Note that this printout only helps for flavors of RCU that are not used much by the kernel. In particular, for normal RCU having a grace period in progress is expected behavior. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:32:08 -07:00
Paul E. McKenney	a458360af6	rcu: Drop early GP request check from rcu_gp_kthread() Now that grace-period requests use funnel locking and now that they set ->gp_flags to RCU_GP_FLAG_INIT even when the RCU grace-period kthread has not yet started, rcu_gp_kthread() no longer needs to check need_any_future_gp() at startup time. This commit therefore removes this check. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:31:04 -07:00
Paul E. McKenney	c1935209df	rcu: Simplify and inline cpu_needs_another_gp() Now that RCU no longer relies on failsafe checks, cpu_needs_another_gp() can be greatly simplified. This simplification eliminates the last call to rcu_future_needs_gp() and to rcu_segcblist_future_gp_needed(), both of which which can then be eliminated. And then, because cpu_needs_another_gp() is called only from __rcu_pending(), it can be inlined and eliminated. This commit carries out the simplification, inlining, and elimination called out above. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:59 -07:00
Paul E. McKenney	384f77f4cb	rcu: The rcu_gp_cleanup() function does not need cpu_needs_another_gp() All of the cpu_needs_another_gp() function's checks (except for newly arrived callbacks) have been subsumed into the rcu_gp_cleanup() function's scan of the rcu_node tree. This commit therefore drops the call to cpu_needs_another_gp(). The check for newly arrived callbacks is supplied by rcu_accelerate_cbs(). Any needed advancing (as in the earlier rcu_advance_cbs() call) will be supplied when the corresponding CPU becomes aware of the end of the now-completed grace period. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:54 -07:00
Paul E. McKenney	665f08f1ce	rcu: Make rcu_start_this_gp() check for out-of-range requests If rcu_start_this_gp() is invoked with a requested grace period more than three in the future, then either the ->need_future_gp[] array needs to be bigger or the caller needs to be repaired. This commit therefore adds a WARN_ON_ONCE() checking for this condition. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:48 -07:00
Paul E. McKenney	360e0da67e	rcu: Add funnel locking to rcu_start_this_gp() The rcu_start_this_gp() function had a simple form of funnel locking that used only the leaves and root of the rcu_node tree, which is fine for systems with only a few hundred CPUs, but sub-optimal for systems having thousands of CPUs. This commit therefore adds full-tree funnel locking. This variant of funnel locking is unusual in the following ways: 1. The leaf-level rcu_node structure's ->lock is held throughout. Other funnel-locking implementations drop the leaf-level lock before progressing to the next level of the tree. 2. Funnel locking can be started at the root, which is convenient for code that already holds the root rcu_node structure's ->lock. Other funnel-locking implementations start at the leaves. 3. If an rcu_node structure other than the initial one believes that a grace period is in progress, it is not necessary to go further up the tree. This is because grace-period cleanup scans the full tree, so that marking the need for a subsequent grace period anywhere in the tree suffices -- but only if a grace period is currently in progress. 4. It is possible that the RCU grace-period kthread has not yet started, and this case must be handled appropriately. However, the general approach of using a tree to control lock contention is still in place. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:37 -07:00
Paul E. McKenney	41e80595ab	rcu: Make rcu_start_future_gp() caller select grace period The rcu_accelerate_cbs() function selects a grace-period target, which it uses to have rcu_segcblist_accelerate() assign numbers to recently queued callbacks. Then it invokes rcu_start_future_gp(), which selects a grace-period target again, which is a bit pointless. This commit therefore changes rcu_start_future_gp() to take the grace-period target as a parameter, thus avoiding double selection. This commit also changes the name of rcu_start_future_gp() to rcu_start_this_gp() to reflect this change in functionality, and also makes a similar change to the name of trace_rcu_future_gp(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:32 -07:00
Paul E. McKenney	d5cd96851d	rcu: Inline rcu_start_gp_advanced() into rcu_start_future_gp() The rcu_start_gp_advanced() is invoked only from rcu_start_future_gp() and much of its code is redundant when invoked from that context. This commit therefore inlines rcu_start_gp_advanced() into rcu_start_future_gp(), then removes rcu_start_gp_advanced(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:27 -07:00
Paul E. McKenney	a824a287f6	rcu: Clear request other than RCU_GP_FLAG_INIT at GP end Once the grace period has ended, any RCU_GP_FLAG_FQS requests are irrelevant: The grace period has ended, so there is no longer any point in forcing quiescent states in order to try to make it end sooner. This commit therefore causes rcu_gp_cleanup() to clear any bits other than RCU_GP_FLAG_INIT from ->gp_flags at the end of the grace period. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:20 -07:00
Paul E. McKenney	a508aa597e	rcu: Cleanup, don't put ->completed into an int It is true that currently only the low-order two bits are used, so there should be no problem given modern machines and compilers, but good hygiene and maintainability dictates use of an unsigned long instead of an int. This commit therefore makes this change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:15 -07:00
Paul E. McKenney	bd7af8463b	rcu: Switch __rcu_process_callbacks() to rcu_accelerate_cbs() The __rcu_process_callbacks() function currently checks to see if the current CPU needs a grace period and also if there is any other reason to kick off a new grace period. This is one of the fail-safe checks that has been rendered unnecessary by the changes that increase the accuracy of rcu_gp_cleanup()'s estimate as to whether another grace period is required. Because this particular fail-safe involved acquiring the root rcu_node structure's ->lock, which has seen excessive contention in real life, this fail-safe needs to go. However, one check must remain, namely the check for newly arrived RCU callbacks that have not yet been associated with a grace period. One might hope that the checks in __note_gp_changes(), which is invoked indirectly from rcu_check_quiescent_state(), would suffice, but this function won't be invoked at all if RCU is idle. It is therefore necessary to replace the fail-safe checks with a simpler check for newly arrived callbacks during an RCU idle period, which is exactly what this commit does. This change removes the final call to rcu_start_gp(), so this function is removed as well. Note that lockless use of cpu_needs_another_gp() is racy, but that these races are harmless in this case. If RCU really is idle, the values will not change, so the return value from cpu_needs_another_gp() will be correct. If RCU is not idle, the resulting redundant call to rcu_accelerate_cbs() will be harmless, and might even have the benefit of reducing grace-period latency a bit. This commit also moves interrupt disabling into the "if" statement to improve real-time response a bit. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:30:03 -07:00
Paul E. McKenney	a6058d85a2	rcu: Avoid __call_rcu_core() root rcu_node ->lock acquisition When __call_rcu_core() notices excessive numbers of callbacks pending on the current CPU, we know that at least one of them is not yet classified, namely the one that was just now queued. Therefore, it is not necessary to invoke rcu_start_gp() and thus not necessary to acquire the root rcu_node structure's ->lock. This commit therefore replaces the rcu_start_gp() with rcu_accelerate_cbs(), thus replacing an acquisition of the root rcu_node structure's ->lock with that of this CPU's leaf rcu_node structure. This decreases contention on the root rcu_node structure's ->lock. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:57 -07:00
Paul E. McKenney	ec4eaccef4	rcu: Make rcu_migrate_callbacks wake GP kthread when needed The rcu_migrate_callbacks() function invokes rcu_advance_cbs() twice, ignoring the return value. This is OK at pressent because of failsafe code that does the wakeup when needed. However, this failsafe code acquires the root rcu_node structure's lock frequently, while rcu_migrate_callbacks() does so only once per CPU-offline operation. This commit therefore makes rcu_migrate_callbacks() wake up the RCU GP kthread when either call to rcu_advance_cbs() returns true, thus removing need for the failsafe code. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:51 -07:00
Paul E. McKenney	6f576e2816	rcu: Convert ->need_future_gp[] array to boolean There is no longer any need for ->need_future_gp[] to count the number of requests for future grace periods, so this commit converts the additions to assignments to "true" and reduces the size of each element to one byte. While we are in the area, fix an obsolete comment. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:46 -07:00
Paul E. McKenney	0ae94e00ce	rcu: Make rcu_future_needs_gp() check all ->need_future_gps[] elements Currently, the rcu_future_needs_gp() function checks only the current element of the ->need_future_gps[] array, which might miss elements that were offset from the expected element, for example, due to races with the start or the end of a grace period. This commit therefore makes rcu_future_needs_gp() use the need_any_future_gp() macro to check all of the elements of this array. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:41 -07:00
Paul E. McKenney	51af970d19	rcu: Avoid losing ->need_future_gp[] values due to GP start/end races The rcu_cbs_completed() function provides the value of ->completed at which new callbacks can safely be invoked. This is recorded in two-element ->need_future_gp[] arrays in the rcu_node structure, and the elements of these arrays corresponding to the just-completed grace period are zeroed at the end of that grace period. However, the rcu_cbs_completed() function can return the current ->completed value plus either one or two, so it is possible for the corresponding ->need_future_gp[] entry to be cleared just after it was set, thus losing a request for a future grace period. This commit avoids this race by expanding ->need_future_gp[] to four elements. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:33 -07:00
Paul E. McKenney	fb31340f8a	rcu: Make rcu_gp_cleanup() more accurately predict need for new GP Currently, rcu_gp_cleanup() scans the rcu_node tree in order to reset state to reflect the end of the grace period. It also checks to see whether a new grace period is needed, but in a number of cases, rather than directly cause the new grace period to be immediately started, it instead leaves the grace-period-needed state where various fail-safes can find it. This works fine, but results in higher contention on the root rcu_node structure's ->lock, which is undesirable, and contention on that lock has recently become noticeable. This commit therefore makes rcu_gp_cleanup() immediately start a new grace period if there is any need for one. It is quite possible that it will later be necessary to throttle the grace-period rate, but that can be dealt with when and if. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:28 -07:00
Paul E. McKenney	5fe0a56298	rcu: Make rcu_gp_kthread() check for early-boot activity The rcu_gp_kthread() function immediately sleeps waiting to be notified of the need for a new grace period, which currently works because there are a number of code sequences that will provide the needed wakeup later. However, some of these code sequences need to acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_gp_kthread() check for early-boot activity when it starts up, omitting the initial sleep in that case. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:24 -07:00
Paul E. McKenney	c91a8675b9	rcu: Add accessor macros for the ->need_future_gp[] array Accessors for the ->need_future_gp[] array are currently open-coded, which makes them difficult to change. To improve maintainability, this commit adds need_future_gp_mask() to compute the indexing mask from the array size, need_future_gp_element() to access the element corresponding to the specified grace-period number, and need_any_future_gp() to determine if any future grace period is needed. This commit also applies need_future_gp_element() to existing open-coded single-element accesses. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:18 -07:00
Paul E. McKenney	825a9911f6	rcu: Make rcu_start_future_gp()'s grace-period check more precise The rcu_start_future_gp() function uses a sloppy check for a grace period being in progress, which works today because there are a number of code sequences that resolve the resulting races. However, some of these race-resolution code sequences must acquire the root rcu_node structure's ->lock, and contention on that lock has started manifesting. This commit therefore makes rcu_start_future_gp() check more precise, eliminating the sloppy lockless check of the rcu_state structure's ->gpnum and ->completed fields. The effect is that rcu_start_future_gp() will sometimes unnecessarily attempt to start a new grace period, but this overhead will be reduced later using funnel locking. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:13 -07:00
Paul E. McKenney	9036c2ffd5	rcu: Improve non-root rcu_cbs_completed() accuracy When rcu_cbs_completed() is invoked on a non-root rcu_node structure, it unconditionally assumes that two grace periods must complete before the callbacks at hand can be invoked. This is overly conservative because if that non-root rcu_node structure believes that no grace period is in progress, and if the corresponding rcu_state structure's ->gpnum field has not yet been incremented, then these callbacks may safely be invoked after only one grace period has completed. This change is required to permit grace-period start requests to use funnel locking, which is in turn permitted to reduce root rcu_node ->lock contention, which has been observed by Nick Piggin. Furthermore, such contention will likely be increased by the merging of RCU-bh, RCU-preempt, and RCU-sched, so it makes sense to take steps to decrease it. This commit therefore improves the accuracy of rcu_cbs_completed() when invoked on a non-root rcu_node structure as described above. Reported-by: Nicholas Piggin <npiggin@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:29:07 -07:00
Paul E. McKenney	5b4c11d54b	rcu: Add leaf-node macros This commit adds rcu_first_leaf_node() that returns a pointer to the first leaf rcu_node structure in the specified RCU flavor and an rcu_is_leaf_node() that returns true iff the specified rcu_node structure is a leaf. This commit also uses these macros where appropriate. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:28:07 -07:00
Paul E. McKenney	f7194ac32c	srcu: Add cleanup_srcu_struct_quiesced() The current cleanup_srcu_struct() flushes work, which prevents it from being invoked from some workqueue contexts, as well as from atomic (non-blocking) contexts. This patch therefore introduced a cleanup_srcu_struct_quiesced(), which can be invoked only after all activity on the specified srcu_struct has completed. This restriction allows cleanup_srcu_struct_quiesced() to be invoked from workqueue contexts as well as from atomic contexts. Suggested-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nitzan Carmi <nitzanc@mellanox.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:56 -07:00
Yury Norov	17672480fb	rcu: Declare rcu_eqs_special_set() in public header Because rcu_eqs_special_set() is declared only in internal header kernel/rcu/tree.h and stubbed in include/linux/rcutiny.h, it is inaccessible outside of the RCU implementation. This patch therefore moves the rcu_eqs_special_set() declaration to include/linux/rcutree.h, which allows it to be used in non-rcu kernel code. Signed-off-by: Yury Norov <ynorov@caviumnetworks.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:51 -07:00
Paul E. McKenney	265f5f28f0	rcu: Update rcu_bind_gp_kthread() header comment The header comment for rcu_bind_gp_kthread() refers to sysidle, which is no longer with us. However, it is still important to bind RCU's grace-period kthreads to the housekeeping CPU(s), so rather than remove rcu_bind_gp_kthread(), this commit updates the comment. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:46 -07:00
Paul E. McKenney	0e5da22e3f	rcu: Move __rcu_read_lock() and __rcu_read_unlock() to tree_plugin.h The __rcu_read_lock() and __rcu_read_unlock() functions were moved to kernel/rcu/update.c in order to implement tiny preemptible RCU. However, tiny preemptible RCU was removed from the kernel a long time ago, so this commit belatedly moves them back into the only remaining preemptible-RCU code. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:41 -07:00
Paul E. McKenney	cee4393989	rcu: Rename cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs() Commit `e31d28b6ab` ("trace: Eliminate cond_resched_rcu_qs() in favor of cond_resched()") substituted cond_resched() for the earlier call to cond_resched_rcu_qs(). However, the new-age cond_resched() does not do anything to help RCU-tasks grace periods because (1) RCU-tasks is only enabled when CONFIG_PREEMPT=y and (2) cond_resched() is a complete no-op when preemption is enabled. This situation results in hangs when running the trace benchmarks. A number of potential fixes were discussed on LKML (https://lkml.kernel.org/r/20180224151240.0d63a059@vmware.local.home), including making cond_resched() not be a no-op; making cond_resched() not be a no-op, but only when running tracing benchmarks; reverting the aforementioned commit (which works because cond_resched_rcu_qs() does provide an RCU-tasks quiescent state; and adding a call to the scheduler/RCU rcu_note_voluntary_context_switch() function. All were deemed unsatisfactory, either due to added cond_resched() overhead or due to magic functions inviting cargo culting. This commit renames cond_resched_rcu_qs() to cond_resched_tasks_rcu_qs(), which provides a clear hint as to what this function is doing and why and where it should be used, and then replaces the call to cond_resched() with cond_resched_tasks_rcu_qs() in the trace benchmark's benchmark_event_kthread() function. Reported-by: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:29 -07:00
Byungchul Park	6fba2b3767	rcu: Remove deprecated RCU debugfs tracing code Commit `ae91aa0adb` ("rcu: Remove debugfs tracing") removed the RCU debugfs tracing code, but did not remove the no-longer used ->exp_workdone{0,1,2,3} fields in the srcu_data structure. This commit therefore removes these fields along with the code that uselessly updates them. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:23 -07:00
Byungchul Park	efcd2d5436	rcu: Call wake_nocb_leader_defer() with 'FORCE' when nocb_q_count is high If an excessive number of callbacks have been queued, but the NOCB leader kthread's wakeup must be deferred, then we should wake up the leader unconditionally once it is safe to do so. This was handled correctly in commit `fbce7497ee` ("rcu: Parallelize and economize NOCB kthread wakeups"), but then commit `8be6e1b15c` ("rcu: Use timer as backstop for NOCB deferred wakeups") passed RCU_NOCB_WAKE instead of the correct RCU_NOCB_WAKE_FORCE to wake_nocb_leader_defer(). As an interesting aside, RCU_NOCB_WAKE_FORCE is never passed to anything, which should have been taken as a hint. ;-) This commit therefore passes RCU_NOCB_WAKE_FORCE instead of RCU_NOCB_WAKE to wake_nocb_leader_defer() when a callback is queued onto a NOCB CPU that already has an excessive number of callbacks pending. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:18 -07:00
Paul E. McKenney	ef12620626	rcu: Don't allocate rcu_nocb_mask if no one needs it Commit `44c65ff2e3` ("rcu: Eliminate NOCBs CPU-state Kconfig options") made allocation of rcu_nocb_mask depend only on the rcu_nocbs=, nohz_full=, or isolcpus= kernel boot parameters. However, it failed to change the initial value of rcu_init_nohz()'s local variable need_rcu_nocb_mask to false, which can result in useless allocation of an all-zero rcu_nocb_mask. This commit therefore fixes this bug by changing the initial value of need_rcu_nocb_mask to false. While we are in the area, also correct the error message that is printed when someone specifies that can-never-exist CPUs should be NOCBs CPUs. Reported-by: Byungchul Park <byungchul.park@lge.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Byungchul Park <byungchul.park@lge.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:11 -07:00
Byungchul Park	be01b4cab1	rcu: Inline rcu_preempt_do_callback() into its sole caller The rcu_preempt_do_callbacks() function was introduced in commit 09223371dea(rcu: Use softirq to address performance regression), where it was necessary to handle kernel builds both containing and not containing RCU-preempt. Since then, various changes (most notably `f8b7fc6b51` ("rcu: use softirq instead of kthreads except when RCU_BOOST=y")) have resulted in this function being invoked only from rcu_kthread_do_work(), which is present only in kernels containing RCU-preempt, which in turn means that the rcu_preempt_do_callbacks() function is no longer needed. This commit therefore inlines rcu_preempt_do_callbacks() into its sole remaining caller and also removes the rcu_state_p and rcu_data_p indirection for added clarity. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> [ paulmck: Remove the rcu_state_p and rcu_data_p indirection. ] Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:27:00 -07:00
Boqun Feng	55ebfce060	rcu: exp: Protect all sync_rcu_preempt_exp_done() with rcu_node lock Currently some callsites of sync_rcu_preempt_exp_done() are not called with the corresponding rcu_node's ->lock held, which could introduces bugs as per Paul: o CPU 0 in sync_rcu_preempt_exp_done() reads ->exp_tasks and sees that it is NULL. o CPU 1 blocks within an RCU read-side critical section, so it enqueues the task and points ->exp_tasks at it and clears CPU 1's bit in ->expmask. o All other CPUs clear their bits in ->expmask. o CPU 0 reads ->expmask, sees that it is zero, so incorrectly concludes that all quiescent states have completed, despite the fact that ->exp_tasks is non-NULL. To fix this, sync_rcu_preempt_exp_unlocked() is introduced to replace lockless callsites of sync_rcu_preempt_exp_done(). Further, a lockdep annotation is added into sync_rcu_preempt_exp_done() to prevent mis-use in the future. Signed-off-by: Boqun Feng <boqun.feng@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:26:07 -07:00
Boqun Feng	7be8c56f8f	rcu: exp: Fix "must hold exp_mutex" comments for QS reporting functions Since commit `d9a3da0699` ("rcu: Add expedited grace-period support for preemptible RCU"), there are comments for some funtions in rcu_report_exp_rnp()'s call-chain saying that exp_mutex or its predecessors needs to be held. However, exp_mutex and its predecessors were used only to synchronize between GPs, and it is clear that all variables visited by those functions are under the protection of rcu_node's ->lock. Moreover, those functions are currently called without held exp_mutex, and seems that doesn't introduce any trouble. So this patch fixes this problem by updating the comments to match the current code. Signed-off-by: Boqun Feng <boqun.feng@gmail.com> Fixes: `d9a3da0699` ("rcu: Add expedited grace-period support for preemptible RCU") Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:26:01 -07:00
Paul E. McKenney	25f3d7effa	rcu: Parallelize expedited grace-period initialization The latency of RCU expedited grace periods grows with increasing numbers of CPUs, eventually failing to be all that expedited. Much of the growth in latency is in the initialization phase, so this commit uses workqueues to carry out this initialization concurrently on a rcu_node-by-rcu_node basis. This change makes use of a new rcu_par_gp_wq because flushing a work item from another work item running from the same workqueue can result in deadlock. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Nicholas Piggin <npiggin@gmail.com>	2018-05-15 10:25:44 -07:00
Paul E. McKenney	338c46403f	Merge branches 'fixes.2018.02.23a', 'srcu.2018.02.20a' and 'torture.2018.02.20a' into HEAD fixes.2018.02.23a: Miscellaneous fixes srcu.2018.02.20a: SRCU updates torture.2018.02.20a: Torture-test updates	2018-02-23 15:15:41 -08:00
Paul E. McKenney	ad7c946b35	rcu: Create RCU-specific workqueues with rescuers RCU's expedited grace periods can participate in out-of-memory deadlocks due to all available system_wq kthreads being blocked and there not being memory available to create more. This commit prevents such deadlocks by allocating an RCU-specific workqueue_struct at early boot time, and providing it with a rescuer to ensure forward progress. This uses the shiny new init_rescuer() function provided by Tejun (but indirectly). This commit also causes SRCU to use this new RCU-specific workqueue_struct. Note that SRCU's use of workqueues never blocks them waiting for readers, so this should be safe from a forward-progress viewpoint. Note that this moves SRCU from system_power_efficient_wq to a normal workqueue. In the unlikely event that this results in measurable degradation, a separate power-efficient workqueue will be creates for SRCU. Reported-by: Prateek Sood <prsood@codeaurora.org> Reported-by: Tejun Heo <tj@kernel.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Tejun Heo <tj@kernel.org>	2018-02-23 15:14:40 -08:00
Paul E. McKenney	85ba6bfe8b	torture: Provide more sensible nreader/nwriter defaults for rcuperf The default values for nreader and nwriter are apparently not all that user-friendly, resulting in people doing scalability tests that ran all runs at large scale. This commit therefore makes both the nreaders and nwriters module default to the number of CPUs, and adds a comment to rcuperf.c stating that the number of CPUs should be specified using the nr_cpus kernel boot parameter. This commit also eliminates the redundant rcuperf scripting specification of default values for these parameters. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:22:01 -08:00
Paul E. McKenney	db0c1a8aba	rcutorture: Record which grace-period primitives are tested The rcu_torture_writer() function adapts to requested testing from module parameters as well as the function pointers in the structure referenced by cur_ops. However, as long as the module parameters do not conflict with the function pointers, this adaptation is silent. This silence can result in confusion as to exactly what was tested, which could in turn result in untested RCU code making its way into mainline. This commit therefore makes rcu_torture_writer() announce exactly which portions of RCU's API it ends up testing. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:58 -08:00
Paul E. McKenney	f7c0e6ad4b	rcutorture: Re-enable testing of dynamic expediting During boot, normal grace periods are processed as expedited. When rcutorture is built into the kernel, it starts during boot and thus detects that normal grace periods are unconditionally expedited. Therefore, rcutorture concludes that there is no point in trying to dynamically enable expediting, do it disables this aspect of testing, which is a bit of an overreaction to the temporary boot-time expediting. This commit therefore rechecks forced expediting throughout the test, enabling dynamic expediting if normal grace periods are processed normally at any point. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:57 -08:00
Paul E. McKenney	eb0339934f	rcutorture: Avoid fake-writer use of undefined primitives Currently the rcu_torture_fakewriter() function invokes cur_ops->sync() and cur_ops->exp_sync() without first checking to see if they are in fact non-NULL. This results in kernel NULL pointer dereferences when testing RCU implementations that choose not to provide the full set of primitives. Given that it is perfectly reasonable to have specialized RCU implementations that provide only a subset of the RCU API, this is a bug in rcutorture. This commit therefore makes rcu_torture_fakewriter() check function pointers before invoking them, thus allowing it to test subsetted RCU implementations. Reported-by: Lihao Liang <lianglihao@huawei.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:56 -08:00
Paul E. McKenney	e0d31a34c6	rcutorture: Abstract function and module names This commit moves to __func__ for function names and for KBUILD_MODNAME for module names, all in the name of better resilience to change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:56 -08:00
Paul E. McKenney	68a675d433	rcutorture: Replace multi-instance kzalloc() with kcalloc() This commit replaces array-allocation calls to kzalloc() with equivalent calls to kcalloc(). Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:55 -08:00
Paul E. McKenney	6308f34775	rcu: Remove SRCU throttling The code in srcu_gp_end() inserts a delay every 0x3ff grace periods in order to prevent SRCU grace-period work from consuming an entire CPU when there is a long sequence of expedited SRCU grace-period requests. However, all of SRCU's grace-period work is carried out in workqueues, which are in turn within kthreads, which are automatically throttled as needed by the scheduler. In particular, if there is plenty of idle time, there is no point in throttling. This commit therefore removes the expedited SRCU grace-period throttling. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:13 -08:00
Byungchul Park	a72da917f1	srcu: Remove dead code in srcu_gp_end() Of course, compilers will optimize out a dead code. Anyway, remove any dead code for better readibility. Signed-off-by: Byungchul Park <byungchul.park@lge.com> Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:12 -08:00
Ildar Ismagilov	8ddbd8832d	srcu: Reduce scans of srcu_data in counter wrap check Currently, given a multi-level srcu_node tree, SRCU can scan the full set of srcu_data structures at each level when cleaning up after a grace period. This, though harmless otherwise, represents pointless overhead. This commit therefore eliminates this overhead by scanning the srcu_data structures only when traversing the leaf srcu_node structures. Signed-off-by: Ildar Ismagilov <devix84@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:12 -08:00
Ildar Ismagilov	a35d13ec36	srcu: Prevent sdp->srcu_gp_seq_needed_exp counter wrap SRCU checks each srcu_data structure's grace-period number for counter wrap four times per cycle by default. This frequency guarantees that normal comparisons will detect potential wrap. However, the expedited grace-period number is not checked. The consquences are not too horrible (a failure to expedite a grace period when requested), but it would be good to avoid such things. This commit therefore adds this check to the expedited grace-period number. Signed-off-by: Ildar Ismagilov <devix84@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:11 -08:00
Paul E. McKenney	cb4081cd4e	srcu: Abstract function name This commit moves to __func__ for function names in the name of better resilience to change. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2018-02-20 16:21:11 -08:00

1 2 3 4 5 ...

826 Commits