e52a293264
This change avoids a race that could result in a NULL pointer dereference following a WARNing from kobject_add_internal, "don't try to register things with the same name in the same directory." The problem was found with a test that forgets and discovers an aoe device in a loop: while test ! -r /tmp/stop; do aoe-flush -a; aoe-discover; done The race was between aoedev_flush taking aoedevs out of the devlist, allowing a new discovery of the same AoE target to take place before the driver gets around to calling sysfs_remove_group. Fixing that one revealed another race between do_open and add_disk, and this patch avoids that, too. The fix required some care, because for flushing (forgetting) an aoedev, some of the steps must be performed under lock and some must be able to sleep. Also, for discovering a new aoedev, some steps might sleep. The check for a bad aoedev pointer remains from a time when about half of this patch was done, and it was possible for the bdev->bd_disk->private_data to become corrupted. The check should be removed eventually, but it is not expected to add significant overhead, occurring in the aoeblk_open routine. Signed-off-by: Ed Cashin <ecashin@coraid.com> Cc: Jens Axboe <axboe@kernel.dk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
521 lines
11 KiB
C
521 lines
11 KiB
C
/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
|
|
/*
|
|
* aoedev.c
|
|
* AoE device utility functions; maintains device list.
|
|
*/
|
|
|
|
#include <linux/hdreg.h>
|
|
#include <linux/blkdev.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/delay.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/bitmap.h>
|
|
#include <linux/kdev_t.h>
|
|
#include <linux/moduleparam.h>
|
|
#include "aoe.h"
|
|
|
|
static void dummy_timer(ulong);
|
|
static void freetgt(struct aoedev *d, struct aoetgt *t);
|
|
static void skbpoolfree(struct aoedev *d);
|
|
|
|
static int aoe_dyndevs = 1;
|
|
module_param(aoe_dyndevs, int, 0644);
|
|
MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
|
|
|
|
/*
 * devlist is the head of the singly linked (via ->next) list of known
 * AoE devices; devlist_lock is taken around every walk or update of it.
 */
static DEFINE_SPINLOCK(devlist_lock);
static struct aoedev *devlist;
|
|
|
|
/* Because some systems will have one, many, or no
|
|
* - partitions,
|
|
* - slots per shelf,
|
|
* - or shelves,
|
|
* we need some flexibility in the way the minor numbers
|
|
* are allocated. So they are dynamic.
|
|
*/
|
|
#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
|
|
|
|
static DEFINE_SPINLOCK(used_minors_lock);
|
|
static DECLARE_BITMAP(used_minors, N_DEVS);
|
|
|
|
static int
|
|
minor_get_dyn(ulong *sysminor)
|
|
{
|
|
ulong flags;
|
|
ulong n;
|
|
int error = 0;
|
|
|
|
spin_lock_irqsave(&used_minors_lock, flags);
|
|
n = find_first_zero_bit(used_minors, N_DEVS);
|
|
if (n < N_DEVS)
|
|
set_bit(n, used_minors);
|
|
else
|
|
error = -1;
|
|
spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
|
|
*sysminor = n * AOE_PARTITIONS;
|
|
return error;
|
|
}
|
|
|
|
static int
|
|
minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
|
|
{
|
|
ulong flags;
|
|
ulong n;
|
|
int error = 0;
|
|
enum {
|
|
/* for backwards compatibility when !aoe_dyndevs,
|
|
* a static number of supported slots per shelf */
|
|
NPERSHELF = 16,
|
|
};
|
|
|
|
if (aoemin >= NPERSHELF) {
|
|
pr_err("aoe: %s %d slots per shelf\n",
|
|
"static minor device numbers support only",
|
|
NPERSHELF);
|
|
error = -1;
|
|
goto out;
|
|
}
|
|
|
|
n = aoemaj * NPERSHELF + aoemin;
|
|
if (n >= N_DEVS) {
|
|
pr_err("aoe: %s with e%ld.%d\n",
|
|
"cannot use static minor device numbers",
|
|
aoemaj, aoemin);
|
|
error = -1;
|
|
goto out;
|
|
}
|
|
|
|
spin_lock_irqsave(&used_minors_lock, flags);
|
|
if (test_bit(n, used_minors)) {
|
|
pr_err("aoe: %s %lu\n",
|
|
"existing device already has static minor number",
|
|
n);
|
|
error = -1;
|
|
} else
|
|
set_bit(n, used_minors);
|
|
spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
*sysminor = n * AOE_PARTITIONS;
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
static int
|
|
minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
|
|
{
|
|
if (aoe_dyndevs)
|
|
return minor_get_dyn(sysminor);
|
|
else
|
|
return minor_get_static(sysminor, aoemaj, aoemin);
|
|
}
|
|
|
|
static void
|
|
minor_free(ulong minor)
|
|
{
|
|
ulong flags;
|
|
|
|
minor /= AOE_PARTITIONS;
|
|
BUG_ON(minor >= N_DEVS);
|
|
|
|
spin_lock_irqsave(&used_minors_lock, flags);
|
|
BUG_ON(!test_bit(minor, used_minors));
|
|
clear_bit(minor, used_minors);
|
|
spin_unlock_irqrestore(&used_minors_lock, flags);
|
|
}
|
|
|
|
/*
|
|
* Users who grab a pointer to the device with aoedev_by_aoeaddr
|
|
* automatically get a reference count and must be responsible
|
|
* for performing a aoedev_put. With the addition of async
|
|
* kthread processing I'm no longer confident that we can
|
|
* guarantee consistency in the face of device flushes.
|
|
*
|
|
* For the time being, we only bother to add extra references for
|
|
* frames sitting on the iocq. When the kthreads finish processing
|
|
* these frames, they will aoedev_put the device.
|
|
*/
|
|
|
|
void
|
|
aoedev_put(struct aoedev *d)
|
|
{
|
|
ulong flags;
|
|
|
|
spin_lock_irqsave(&devlist_lock, flags);
|
|
d->ref--;
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
}
|
|
|
|
static void
|
|
dummy_timer(ulong vp)
|
|
{
|
|
struct aoedev *d;
|
|
|
|
d = (struct aoedev *)vp;
|
|
if (d->flags & DEVFL_TKILL)
|
|
return;
|
|
d->timer.expires = jiffies + HZ;
|
|
add_timer(&d->timer);
|
|
}
|
|
|
|
static void
|
|
aoe_failip(struct aoedev *d)
|
|
{
|
|
struct request *rq;
|
|
struct bio *bio;
|
|
unsigned long n;
|
|
|
|
aoe_failbuf(d, d->ip.buf);
|
|
|
|
rq = d->ip.rq;
|
|
if (rq == NULL)
|
|
return;
|
|
while ((bio = d->ip.nxbio)) {
|
|
clear_bit(BIO_UPTODATE, &bio->bi_flags);
|
|
d->ip.nxbio = bio->bi_next;
|
|
n = (unsigned long) rq->special;
|
|
rq->special = (void *) --n;
|
|
}
|
|
if ((unsigned long) rq->special == 0)
|
|
aoe_end_request(d, rq, 0);
|
|
}
|
|
|
|
static void
|
|
downdev_frame(struct list_head *pos)
|
|
{
|
|
struct frame *f;
|
|
|
|
f = list_entry(pos, struct frame, head);
|
|
list_del(pos);
|
|
if (f->buf) {
|
|
f->buf->nframesout--;
|
|
aoe_failbuf(f->t->d, f->buf);
|
|
}
|
|
aoe_freetframe(f);
|
|
}
|
|
|
|
void
|
|
aoedev_downdev(struct aoedev *d)
|
|
{
|
|
struct aoetgt *t, **tt, **te;
|
|
struct list_head *head, *pos, *nx;
|
|
struct request *rq;
|
|
int i;
|
|
|
|
d->flags &= ~DEVFL_UP;
|
|
|
|
/* clean out active and to-be-retransmitted buffers */
|
|
for (i = 0; i < NFACTIVE; i++) {
|
|
head = &d->factive[i];
|
|
list_for_each_safe(pos, nx, head)
|
|
downdev_frame(pos);
|
|
}
|
|
head = &d->rexmitq;
|
|
list_for_each_safe(pos, nx, head)
|
|
downdev_frame(pos);
|
|
|
|
/* reset window dressings */
|
|
tt = d->targets;
|
|
te = tt + NTARGETS;
|
|
for (; tt < te && (t = *tt); tt++) {
|
|
aoecmd_wreset(t);
|
|
t->nout = 0;
|
|
}
|
|
|
|
/* clean out the in-process request (if any) */
|
|
aoe_failip(d);
|
|
|
|
/* fast fail all pending I/O */
|
|
if (d->blkq) {
|
|
while ((rq = blk_peek_request(d->blkq))) {
|
|
blk_start_request(rq);
|
|
aoe_end_request(d, rq, 1);
|
|
}
|
|
}
|
|
|
|
if (d->gd)
|
|
set_capacity(d->gd, 0);
|
|
}
|
|
|
|
/* return whether the user asked for this particular
|
|
* device to be flushed
|
|
*/
|
|
static int
|
|
user_req(char *s, size_t slen, struct aoedev *d)
|
|
{
|
|
char *p;
|
|
size_t lim;
|
|
|
|
if (!d->gd)
|
|
return 0;
|
|
p = strrchr(d->gd->disk_name, '/');
|
|
if (!p)
|
|
p = d->gd->disk_name;
|
|
else
|
|
p += 1;
|
|
lim = sizeof(d->gd->disk_name);
|
|
lim -= p - d->gd->disk_name;
|
|
if (slen < lim)
|
|
lim = slen;
|
|
|
|
return !strncmp(s, p, lim);
|
|
}
|
|
|
|
static void
|
|
freedev(struct aoedev *d)
|
|
{
|
|
struct aoetgt **t, **e;
|
|
int freeing = 0;
|
|
unsigned long flags;
|
|
|
|
spin_lock_irqsave(&d->lock, flags);
|
|
if (d->flags & DEVFL_TKILL
|
|
&& !(d->flags & DEVFL_FREEING)) {
|
|
d->flags |= DEVFL_FREEING;
|
|
freeing = 1;
|
|
}
|
|
spin_unlock_irqrestore(&d->lock, flags);
|
|
if (!freeing)
|
|
return;
|
|
|
|
del_timer_sync(&d->timer);
|
|
if (d->gd) {
|
|
aoedisk_rm_sysfs(d);
|
|
del_gendisk(d->gd);
|
|
put_disk(d->gd);
|
|
blk_cleanup_queue(d->blkq);
|
|
}
|
|
t = d->targets;
|
|
e = t + NTARGETS;
|
|
for (; t < e && *t; t++)
|
|
freetgt(d, *t);
|
|
if (d->bufpool)
|
|
mempool_destroy(d->bufpool);
|
|
skbpoolfree(d);
|
|
minor_free(d->sysminor);
|
|
|
|
spin_lock_irqsave(&d->lock, flags);
|
|
d->flags |= DEVFL_FREED;
|
|
spin_unlock_irqrestore(&d->lock, flags);
|
|
}
|
|
|
|
/* Values for the "exiting" argument of flush(). */
enum flush_parms {
	NOT_EXITING = 0,	/* flush requested through aoedev_flush() */
	EXITING = 1,		/* flush requested through aoedev_exit() */
};
|
|
|
|
static int
|
|
flush(const char __user *str, size_t cnt, int exiting)
|
|
{
|
|
ulong flags;
|
|
struct aoedev *d, **dd;
|
|
char buf[16];
|
|
int all = 0;
|
|
int specified = 0; /* flush a specific device */
|
|
unsigned int skipflags;
|
|
|
|
skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
|
|
|
|
if (!exiting && cnt >= 3) {
|
|
if (cnt > sizeof buf)
|
|
cnt = sizeof buf;
|
|
if (copy_from_user(buf, str, cnt))
|
|
return -EFAULT;
|
|
all = !strncmp(buf, "all", 3);
|
|
if (!all)
|
|
specified = 1;
|
|
}
|
|
|
|
flush_scheduled_work();
|
|
/* pass one: without sleeping, do aoedev_downdev */
|
|
spin_lock_irqsave(&devlist_lock, flags);
|
|
for (d = devlist; d; d = d->next) {
|
|
spin_lock(&d->lock);
|
|
if (exiting) {
|
|
/* unconditionally take each device down */
|
|
} else if (specified) {
|
|
if (!user_req(buf, cnt, d))
|
|
goto cont;
|
|
} else if ((!all && (d->flags & DEVFL_UP))
|
|
|| d->flags & skipflags
|
|
|| d->nopen
|
|
|| d->ref)
|
|
goto cont;
|
|
|
|
aoedev_downdev(d);
|
|
d->flags |= DEVFL_TKILL;
|
|
cont:
|
|
spin_unlock(&d->lock);
|
|
}
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
|
|
/* pass two: call freedev, which might sleep,
|
|
* for aoedevs marked with DEVFL_TKILL
|
|
*/
|
|
restart:
|
|
spin_lock_irqsave(&devlist_lock, flags);
|
|
for (d = devlist; d; d = d->next) {
|
|
spin_lock(&d->lock);
|
|
if (d->flags & DEVFL_TKILL
|
|
&& !(d->flags & DEVFL_FREEING)) {
|
|
spin_unlock(&d->lock);
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
freedev(d);
|
|
goto restart;
|
|
}
|
|
spin_unlock(&d->lock);
|
|
}
|
|
|
|
/* pass three: remove aoedevs marked with DEVFL_FREED */
|
|
for (dd = &devlist, d = *dd; d; d = *dd) {
|
|
struct aoedev *doomed = NULL;
|
|
|
|
spin_lock(&d->lock);
|
|
if (d->flags & DEVFL_FREED) {
|
|
*dd = d->next;
|
|
doomed = d;
|
|
} else {
|
|
dd = &d->next;
|
|
}
|
|
spin_unlock(&d->lock);
|
|
kfree(doomed);
|
|
}
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
|
|
return 0;
|
|
}
|
|
|
|
int
|
|
aoedev_flush(const char __user *str, size_t cnt)
|
|
{
|
|
return flush(str, cnt, NOT_EXITING);
|
|
}
|
|
|
|
/* This has been confirmed to occur once with Tms=3*1000 due to the
|
|
* driver changing link and not processing its transmit ring. The
|
|
* problem is hard enough to solve by returning an error that I'm
|
|
* still punting on "solving" this.
|
|
*/
|
|
static void
|
|
skbfree(struct sk_buff *skb)
|
|
{
|
|
enum { Sms = 250, Tms = 30 * 1000};
|
|
int i = Tms / Sms;
|
|
|
|
if (skb == NULL)
|
|
return;
|
|
while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
|
|
msleep(Sms);
|
|
if (i < 0) {
|
|
printk(KERN_ERR
|
|
"aoe: %s holds ref: %s\n",
|
|
skb->dev ? skb->dev->name : "netif",
|
|
"cannot free skb -- memory leaked.");
|
|
return;
|
|
}
|
|
skb->truesize -= skb->data_len;
|
|
skb_shinfo(skb)->nr_frags = skb->data_len = 0;
|
|
skb_trim(skb, 0);
|
|
dev_kfree_skb(skb);
|
|
}
|
|
|
|
static void
|
|
skbpoolfree(struct aoedev *d)
|
|
{
|
|
struct sk_buff *skb, *tmp;
|
|
|
|
skb_queue_walk_safe(&d->skbpool, skb, tmp)
|
|
skbfree(skb);
|
|
|
|
__skb_queue_head_init(&d->skbpool);
|
|
}
|
|
|
|
/* find it or allocate it */
|
|
struct aoedev *
|
|
aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
|
|
{
|
|
struct aoedev *d;
|
|
int i;
|
|
ulong flags;
|
|
ulong sysminor = 0;
|
|
|
|
spin_lock_irqsave(&devlist_lock, flags);
|
|
|
|
for (d=devlist; d; d=d->next)
|
|
if (d->aoemajor == maj && d->aoeminor == min) {
|
|
spin_lock(&d->lock);
|
|
if (d->flags & DEVFL_TKILL) {
|
|
spin_unlock(&d->lock);
|
|
d = NULL;
|
|
goto out;
|
|
}
|
|
d->ref++;
|
|
spin_unlock(&d->lock);
|
|
break;
|
|
}
|
|
if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
|
|
goto out;
|
|
d = kcalloc(1, sizeof *d, GFP_ATOMIC);
|
|
if (!d)
|
|
goto out;
|
|
INIT_WORK(&d->work, aoecmd_sleepwork);
|
|
spin_lock_init(&d->lock);
|
|
skb_queue_head_init(&d->skbpool);
|
|
init_timer(&d->timer);
|
|
d->timer.data = (ulong) d;
|
|
d->timer.function = dummy_timer;
|
|
d->timer.expires = jiffies + HZ;
|
|
add_timer(&d->timer);
|
|
d->bufpool = NULL; /* defer to aoeblk_gdalloc */
|
|
d->tgt = d->targets;
|
|
d->ref = 1;
|
|
for (i = 0; i < NFACTIVE; i++)
|
|
INIT_LIST_HEAD(&d->factive[i]);
|
|
INIT_LIST_HEAD(&d->rexmitq);
|
|
d->sysminor = sysminor;
|
|
d->aoemajor = maj;
|
|
d->aoeminor = min;
|
|
d->rttavg = RTTAVG_INIT;
|
|
d->rttdev = RTTDEV_INIT;
|
|
d->next = devlist;
|
|
devlist = d;
|
|
out:
|
|
spin_unlock_irqrestore(&devlist_lock, flags);
|
|
return d;
|
|
}
|
|
|
|
static void
|
|
freetgt(struct aoedev *d, struct aoetgt *t)
|
|
{
|
|
struct frame *f;
|
|
struct list_head *pos, *nx, *head;
|
|
struct aoeif *ifp;
|
|
|
|
for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
|
|
if (!ifp->nd)
|
|
break;
|
|
dev_put(ifp->nd);
|
|
}
|
|
|
|
head = &t->ffree;
|
|
list_for_each_safe(pos, nx, head) {
|
|
list_del(pos);
|
|
f = list_entry(pos, struct frame, head);
|
|
skbfree(f->skb);
|
|
kfree(f);
|
|
}
|
|
kfree(t);
|
|
}
|
|
|
|
void
|
|
aoedev_exit(void)
|
|
{
|
|
flush_scheduled_work();
|
|
aoe_flush_iocq();
|
|
flush(NULL, 0, EXITING);
|
|
}
|
|
|
|
int __init
|
|
aoedev_init(void)
|
|
{
|
|
return 0;
|
|
}
|