net/rds: Get rid of "wait_clean_list_grace" and add locking
Waiting for activity on the "clean_list" to quiesce is no substitute for proper locking. We can have multiple threads competing for "llist_del_first" via "rds_ib_reuse_mr", and a single thread competing for "llist_del_all" and "llist_del_first" via "rds_ib_flush_mr_pool". Since "llist_del_first" depends on "list->first->next" not to change in the midst of the operation, simply waiting for all current calls to "rds_ib_reuse_mr" to quiesce across all CPUs is woefully inadequate: By the time "wait_clean_list_grace" is done iterating over all CPUs to see that there is no concurrent caller to "rds_ib_reuse_mr", a new caller may have just shown up on the first CPU. Furthermore, <linux/llist.h> explicitly calls out the need for locking: * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, * and llist_del_first or llist_del_all used in other consumers, * then a lock is needed. Also, while at it, drop the unused "pool" parameter from "list_to_llist_nodes". Signed-off-by: Gerd Rausch <gerd.rausch@oracle.com> Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
2c7da8e6b0
commit
c9467447fc
@ -98,6 +98,7 @@ struct rds_ib_mr_pool {
|
|||||||
struct llist_head free_list; /* unused MRs */
|
struct llist_head free_list; /* unused MRs */
|
||||||
struct llist_head clean_list; /* unused & unmapped MRs */
|
struct llist_head clean_list; /* unused & unmapped MRs */
|
||||||
wait_queue_head_t flush_wait;
|
wait_queue_head_t flush_wait;
|
||||||
|
spinlock_t clean_lock; /* "clean_list" concurrency */
|
||||||
|
|
||||||
atomic_t free_pinned; /* memory pinned by free MRs */
|
atomic_t free_pinned; /* memory pinned by free MRs */
|
||||||
unsigned long max_items;
|
unsigned long max_items;
|
||||||
|
@ -40,9 +40,6 @@
|
|||||||
|
|
||||||
struct workqueue_struct *rds_ib_mr_wq;
|
struct workqueue_struct *rds_ib_mr_wq;
|
||||||
|
|
||||||
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
|
|
||||||
#define CLEAN_LIST_BUSY_BIT 0
|
|
||||||
|
|
||||||
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
|
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
|
||||||
{
|
{
|
||||||
struct rds_ib_device *rds_ibdev;
|
struct rds_ib_device *rds_ibdev;
|
||||||
@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
|
|||||||
{
|
{
|
||||||
struct rds_ib_mr *ibmr = NULL;
|
struct rds_ib_mr *ibmr = NULL;
|
||||||
struct llist_node *ret;
|
struct llist_node *ret;
|
||||||
unsigned long *flag;
|
unsigned long flags;
|
||||||
|
|
||||||
preempt_disable();
|
spin_lock_irqsave(&pool->clean_lock, flags);
|
||||||
flag = this_cpu_ptr(&clean_list_grace);
|
|
||||||
set_bit(CLEAN_LIST_BUSY_BIT, flag);
|
|
||||||
ret = llist_del_first(&pool->clean_list);
|
ret = llist_del_first(&pool->clean_list);
|
||||||
|
spin_unlock_irqrestore(&pool->clean_lock, flags);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
|
ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
|
||||||
if (pool->pool_type == RDS_IB_MR_8K_POOL)
|
if (pool->pool_type == RDS_IB_MR_8K_POOL)
|
||||||
@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
|
|||||||
rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
|
rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
|
||||||
}
|
}
|
||||||
|
|
||||||
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
|
|
||||||
preempt_enable();
|
|
||||||
return ibmr;
|
return ibmr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void wait_clean_list_grace(void)
|
|
||||||
{
|
|
||||||
int cpu;
|
|
||||||
unsigned long *flag;
|
|
||||||
|
|
||||||
for_each_online_cpu(cpu) {
|
|
||||||
flag = &per_cpu(clean_list_grace, cpu);
|
|
||||||
while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
|
|
||||||
cpu_relax();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void rds_ib_sync_mr(void *trans_private, int direction)
|
void rds_ib_sync_mr(void *trans_private, int direction)
|
||||||
{
|
{
|
||||||
struct rds_ib_mr *ibmr = trans_private;
|
struct rds_ib_mr *ibmr = trans_private;
|
||||||
@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
|
|||||||
* of clusters. Each cluster has linked llist nodes of
|
* of clusters. Each cluster has linked llist nodes of
|
||||||
* MR_CLUSTER_SIZE mrs that are ready for reuse.
|
* MR_CLUSTER_SIZE mrs that are ready for reuse.
|
||||||
*/
|
*/
|
||||||
static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
|
static void list_to_llist_nodes(struct list_head *list,
|
||||||
struct list_head *list,
|
|
||||||
struct llist_node **nodes_head,
|
struct llist_node **nodes_head,
|
||||||
struct llist_node **nodes_tail)
|
struct llist_node **nodes_tail)
|
||||||
{
|
{
|
||||||
@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
|
|||||||
*/
|
*/
|
||||||
dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
|
dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
|
||||||
dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
|
dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
|
||||||
if (free_all)
|
if (free_all) {
|
||||||
|
unsigned long flags;
|
||||||
|
|
||||||
|
spin_lock_irqsave(&pool->clean_lock, flags);
|
||||||
llist_append_to_list(&pool->clean_list, &unmap_list);
|
llist_append_to_list(&pool->clean_list, &unmap_list);
|
||||||
|
spin_unlock_irqrestore(&pool->clean_lock, flags);
|
||||||
|
}
|
||||||
|
|
||||||
free_goal = rds_ib_flush_goal(pool, free_all);
|
free_goal = rds_ib_flush_goal(pool, free_all);
|
||||||
|
|
||||||
@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
|
|||||||
rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
|
rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
|
||||||
|
|
||||||
if (!list_empty(&unmap_list)) {
|
if (!list_empty(&unmap_list)) {
|
||||||
/* we have to make sure that none of the things we're about
|
unsigned long flags;
|
||||||
* to put on the clean list would race with other cpus trying
|
|
||||||
* to pull items off. The llist would explode if we managed to
|
|
||||||
* remove something from the clean list and then add it back again
|
|
||||||
* while another CPU was spinning on that same item in llist_del_first.
|
|
||||||
*
|
|
||||||
* This is pretty unlikely, but just in case wait for an llist grace period
|
|
||||||
* here before adding anything back into the clean list.
|
|
||||||
*/
|
|
||||||
wait_clean_list_grace();
|
|
||||||
|
|
||||||
list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
|
list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
|
||||||
if (ibmr_ret) {
|
if (ibmr_ret) {
|
||||||
*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
|
*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
|
||||||
clean_nodes = clean_nodes->next;
|
clean_nodes = clean_nodes->next;
|
||||||
}
|
}
|
||||||
/* more than one entry in llist nodes */
|
/* more than one entry in llist nodes */
|
||||||
if (clean_nodes)
|
if (clean_nodes) {
|
||||||
|
spin_lock_irqsave(&pool->clean_lock, flags);
|
||||||
llist_add_batch(clean_nodes, clean_tail,
|
llist_add_batch(clean_nodes, clean_tail,
|
||||||
&pool->clean_list);
|
&pool->clean_list);
|
||||||
|
spin_unlock_irqrestore(&pool->clean_lock, flags);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
atomic_sub(unpinned, &pool->free_pinned);
|
atomic_sub(unpinned, &pool->free_pinned);
|
||||||
@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
|
|||||||
init_llist_head(&pool->free_list);
|
init_llist_head(&pool->free_list);
|
||||||
init_llist_head(&pool->drop_list);
|
init_llist_head(&pool->drop_list);
|
||||||
init_llist_head(&pool->clean_list);
|
init_llist_head(&pool->clean_list);
|
||||||
|
spin_lock_init(&pool->clean_lock);
|
||||||
mutex_init(&pool->flush_lock);
|
mutex_init(&pool->flush_lock);
|
||||||
init_waitqueue_head(&pool->flush_wait);
|
init_waitqueue_head(&pool->flush_wait);
|
||||||
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
|
INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
|
||||||
|
Loading…
Reference in New Issue
Block a user