/*
 * device-mapper.c
 *
 * Copyright (C) 2001 Sistina Software
 *
 * This software is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU CC; see the file COPYING.  If not, write to
 * the Free Software Foundation, 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * Changelog
 *
 *    14/08/2001 - First Version [Joe Thornber]
 */

#include "dm.h"

/* defines for blk.h */
#define MAJOR_NR DM_BLK_MAJOR
#define DEVICE_NR(device) MINOR(device)	/* has no partition bits */
#define DEVICE_NAME "device-mapper"	/* name for messaging */
#define DEVICE_NO_RANDOM		/* no entropy to contribute */
#define DEVICE_OFF(d)			/* do-nothing */

#include <linux/blk.h>
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/lvm.h>
#include <linux/kmod.h>

#define MAX_DEVICES 64
#define DEFAULT_READ_AHEAD 64

const char *_name = "device-mapper";
int _version[3] = { 0, 1, 0 };

/*
 * Per-buffer_head bookkeeping.  While a bh is in flight through a
 * target, bh->b_private points at one of these so that dec_pending()
 * can restore the original b_end_io / b_private and find the table
 * whose pending count must be dropped.
 */
struct io_hook {
	struct dm_table *table;	/* table the io was mapped through */
	struct target *target;	/* target that mapped the io */
	int rw;			/* rw value passed to the map function */

	/* the submitter's original completion routine and context */
	void (*end_io) (struct buffer_head * bh, int uptodate);
	void *context;
};

kmem_cache_t *_io_hook_cache;

/* protects _devs[] and the mapped_device state reachable from it */
struct rw_semaphore _dev_lock;
static struct mapped_device *_devs[MAX_DEVICES];

/* block device arrays */
static int _block_size[MAX_DEVICES];
static int _blksize_size[MAX_DEVICES];
static int _hardsect_size[MAX_DEVICES];

const char *_fs_dir = "device-mapper";
static devfs_handle_t _dev_dir;

static int request(request_queue_t *q, int rw, struct buffer_head *bh);
static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);

/*
 * setup and teardown the driver
 */

/*
 * Module initialisation: creates the io_hook slab cache, initialises
 * dmfs, the target registry and the blkdev layer, installs the
 * per-major block arrays and registers the block device.
 *
 * Returns 0 on success or a negative errno.  On failure everything
 * that was set up before the failing step is torn down again, using
 * the same teardown calls dm_exit() makes.
 */
static int dm_init(void)
{
	int ret;

	init_rwsem(&_dev_lock);

	_io_hook_cache = kmem_cache_create("dm io hooks",
					   sizeof (struct io_hook),
					   0, 0, NULL, NULL);
	if (!_io_hook_cache)
		return -ENOMEM;

	if ((ret = dmfs_init()))
		goto bad_cache;

	/*
	 * NOTE(review): no teardown counterpart for dm_target_init() is
	 * visible in this file (dm_exit() does not call one either), so
	 * none is invoked on the error paths below.
	 */
	if ((ret = dm_target_init()))
		goto bad_dmfs;

	if ((ret = dm_init_blkdev()))
		goto bad_dmfs;

	/* set up the arrays */
	read_ahead[MAJOR_NR] = DEFAULT_READ_AHEAD;
	blk_size[MAJOR_NR] = _block_size;
	blksize_size[MAJOR_NR] = _blksize_size;
	hardsect_size[MAJOR_NR] = _hardsect_size;

	if (devfs_register_blkdev(MAJOR_NR, _name, &dm_blk_dops) < 0) {
		printk(KERN_ERR "%s -- register_blkdev failed\n", _name);
		ret = -EIO;
		goto bad_blkdev;
	}

	blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), request);

	_dev_dir = devfs_mk_dir(0, _fs_dir, NULL);

	printk(KERN_INFO "%s %d.%d.%d initialised\n", _name,
	       _version[0], _version[1], _version[2]);
	return 0;

      bad_blkdev:
	read_ahead[MAJOR_NR] = 0;
	blk_size[MAJOR_NR] = 0;
	blksize_size[MAJOR_NR] = 0;
	hardsect_size[MAJOR_NR] = 0;
	dm_cleanup_blkdev();

      bad_dmfs:
	dmfs_exit();

      bad_cache:
	kmem_cache_destroy(_io_hook_cache);
	return ret;
}

/*
 * Module teardown: destroys the io_hook cache, shuts down dmfs and the
 * blkdev layer, unregisters the block device and clears the per-major
 * block arrays.
 */
static void dm_exit(void)
{
	if (kmem_cache_destroy(_io_hook_cache))
		WARN("it looks like there are still some io_hooks allocated");

	dmfs_exit();
	dm_cleanup_blkdev();

	if (devfs_unregister_blkdev(MAJOR_NR, _name) < 0)
		printk(KERN_ERR "%s -- unregister_blkdev failed\n", _name);

	read_ahead[MAJOR_NR] = 0;
	blk_size[MAJOR_NR] = 0;
	blksize_size[MAJOR_NR] = 0;
	hardsect_size[MAJOR_NR] = 0;

	printk(KERN_INFO "%s %d.%d.%d cleaned up\n", _name,
	       _version[0], _version[1], _version[2]);
}

/*
 * block device functions
 */

/*
 * open: succeeds only for a minor that has an active mapped_device;
 * bumps the device's use_count and the module use count.
 * Returns 0 or -ENXIO.
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
	int minor = MINOR(inode->i_rdev);
	struct mapped_device *md;

	if (minor >= MAX_DEVICES)
		return -ENXIO;

	down_write(&_dev_lock);
	md = _devs[minor];

	if (!md || !is_active(md)) {
		up_write(&_dev_lock);
		return -ENXIO;
	}

	md->use_count++;
	up_write(&_dev_lock);

	MOD_INC_USE_COUNT;
	return 0;
}

/*
 * release: drops the use_count taken in dm_blk_open() and the module
 * use count.  Returns -ENXIO (after a warning) if there is no device
 * for the minor or its use_count is already below one.
 */
static int dm_blk_close(struct inode *inode, struct file *file)
{
	int minor = MINOR(inode->i_rdev);
	struct mapped_device *md;

	if (minor >= MAX_DEVICES)
		return -ENXIO;

	down_write(&_dev_lock);
	md = _devs[minor];
	if (!md || md->use_count < 1) {
		WARN("reference count in mapped_device incorrect");
		up_write(&_dev_lock);
		return -ENXIO;
	}

	md->use_count--;
	up_write(&_dev_lock);

	MOD_DEC_USE_COUNT;
	return 0;
}

/* In 512-byte units */
#define VOLUME_SIZE(minor) (_block_size[(minor)] >> 1)

/*
 * ioctl handler for the mapped block devices.
 *
 * 'a' is the raw ioctl argument; for most commands it is a user-space
 * pointer.  Returns 0 on success or a negative errno; unknown commands
 * are logged and rejected with -EINVAL.
 */
static int dm_blk_ioctl(struct inode *inode, struct file *file,
			uint command, ulong a)
{
	int minor = MINOR(inode->i_rdev);
	long size;

	if (minor >= MAX_DEVICES)
		return -ENXIO;

	switch (command) {
	case BLKSSZGET:
	case BLKROGET:
	case BLKROSET:
#if 0
	case BLKELVSET:
	case BLKELVGET:
#endif
		/*
		 * Forward using i_rdev, the device this node refers to
		 * (as every other case here does).  i_dev would name the
		 * filesystem holding the device node instead.
		 */
		return blk_ioctl(inode->i_rdev, command, a);

	case HDIO_GETGEO:
		{
			/* fake geometry: 64 heads, 32 sectors per track */
			struct hd_geometry tmp = { heads:64, sectors:32 };

			tmp.cylinders = VOLUME_SIZE(minor) / tmp.heads /
			    tmp.sectors;

			if (copy_to_user((char *) a, &tmp, sizeof (tmp)))
				return -EFAULT;
			break;
		}

	case HDIO_GETGEO_BIG:
		{
			struct hd_big_geometry tmp = { heads:64, sectors:32 };

			tmp.cylinders = VOLUME_SIZE(minor) / tmp.heads /
			    tmp.sectors;

			if (copy_to_user((char *) a, &tmp, sizeof (tmp)))
				return -EFAULT;
			break;
		}

	case BLKGETSIZE:
		size = VOLUME_SIZE(minor);
		if (copy_to_user((void *) a, &size, sizeof (long)))
			return -EFAULT;
		break;

	case BLKFLSBUF:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		fsync_dev(inode->i_rdev);
		invalidate_buffers(inode->i_rdev);
		return 0;

	case BLKRAGET:
		{
			/*
			 * User space receives a long.  Go through a long
			 * temporary rather than copying sizeof(long)
			 * bytes straight out of read_ahead[], whose
			 * elements may be narrower than a long.
			 */
			long ra = read_ahead[MAJOR(inode->i_rdev)];

			if (copy_to_user((void *) a, &ra, sizeof (ra)))
				return -EFAULT;
			return 0;
		}

	case BLKRASET:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
		read_ahead[MAJOR(inode->i_rdev)] = a;
		return 0;

	case BLKRRPART:
		return -EINVAL;

	case LV_BMAP:
		return dm_user_bmap(inode, (struct lv_bmap *) a);

	default:
		printk(KERN_WARNING "%s - unknown block ioctl %d",
		       _name, command);
		return -EINVAL;
	}

	return 0;
}

/* allocate an io_hook from its slab cache; may return NULL */
static inline struct io_hook *alloc_io_hook(void)
{
	return kmem_cache_alloc(_io_hook_cache, GFP_NOIO);
}

/* return an io_hook to its slab cache */
static inline void free_io_hook(struct io_hook *ih)
{
	kmem_cache_free(_io_hook_cache, ih);
}

/*
 * FIXME: need to decide if deferred_io's need
 * their own slab, I say no for now since they are
 * only used when the device is suspended.
 */
static inline struct deferred_io *alloc_deferred(void)
{
	return kmalloc(sizeof (struct deferred_io), GFP_NOIO);
}

static inline void free_deferred(struct deferred_io *di)
{
	kfree(di);
}

/*
 * bh->b_end_io routine that decrements the
 * pending count and then calls the original
 * bh->b_end_io fn.
 *
 * On failed io the target's err handler (if any) is consulted first;
 * when it returns non-zero this routine returns immediately, leaving
 * the hook installed and the pending count untouched.
 */
static void dec_pending(struct buffer_head *bh, int uptodate)
{
	struct io_hook *ih = bh->b_private;

	if (!uptodate && ih->target->type->err) {
		if (ih->target->type->err(bh, ih->rw, ih->target->private))
			return;
	}

	if (atomic_dec_and_test(&ih->table->pending))
		/* nudge anyone waiting on suspend queue */
		wake_up(&ih->table->wait);

	/* restore the submitter's completion routine and context */
	bh->b_end_io = ih->end_io;
	bh->b_private = ih->context;
	free_io_hook(ih);

	bh->b_end_io(bh, uptodate);
}

/*
 * add the bh to the list of deferred io.
 *
 * Returns  1 if the bh was queued on md->deferred,
 *          0 if the device turned out to be active (nothing queued,
 *            the caller should map the bh itself),
 *         -ENOMEM if no deferred_io could be allocated.
 */
static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
{
	struct deferred_io *di = alloc_deferred();

	if (!di)
		return -ENOMEM;

	down_write(&_dev_lock);

	if (test_bit(DM_ACTIVE, &md->state)) {
		up_write(&_dev_lock);
		/* not queued after all: don't leak the descriptor */
		free_deferred(di);
		return 0;
	}

	di->bh = bh;
	di->rw = rw;
	di->next = md->deferred;
	md->deferred = di;

	up_write(&_dev_lock);
	return 1;
}

/*
 * do the bh mapping for a given leaf
 *
 * Calls the map function of target number 'leaf' in md->map.  The
 * map function's result decides what happens next:
 *   > 0  the completion hook dec_pending() is installed on the bh and
 *        the table's pending count is incremented,
 *   == 0 no hook is needed,
 *   < 0  failure.
 *
 * Returns 1 on success, 0 on failure (map error or no io_hook).
 */
static inline int __map_buffer(struct mapped_device *md,
			       struct buffer_head *bh, int rw, int leaf)
{
	dm_map_fn fn;
	void *context;
	struct io_hook *ih = NULL;
	int r;
	struct target *ti = md->map->targets + leaf;

	fn = ti->type->map;
	context = ti->private;

	ih = alloc_io_hook();

	if (!ih)
		return 0;

	/* remember the submitter's completion details before mapping */
	ih->table = md->map;
	ih->rw = rw;
	ih->target = ti;
	ih->end_io = bh->b_end_io;
	ih->context = bh->b_private;

	r = fn(bh, rw, context);

	if (r > 0) {
		/* hook the end io request fn */
		atomic_inc(&md->map->pending);
		bh->b_end_io = dec_pending;
		bh->b_private = ih;

	} else if (r == 0)
		/* we don't need to hook */
		free_io_hook(ih);

	else if (r < 0) {
		free_io_hook(ih);
		return 0;
	}

	return 1;
}

/*
 * search the btree for the correct target.
 *
 * Descends t->depth levels; at each level the first key that is
 * >= bh->b_rsector selects the child to follow.  The value returned
 * is used by the callers as an index into t->targets.
 */
static inline int __find_node(struct dm_table *t, struct buffer_head *bh)
{
	int l, n = 0, k = 0;
	offset_t *node;

	for (l = 0; l < t->depth; l++) {
		n = get_child(n, k);
		node = get_node(t, l, n);

		for (k = 0; k < KEYS_PER_NODE; k++)
			if (node[k] >= bh->b_rsector)
				break;
	}

	return (KEYS_PER_NODE * n) + k;
}

/*
 * LV_BMAP ioctl: translate a block number on the mapped device into
 * the device and block it maps to.
 *
 * Reads lvb->lv_block from user space, pushes a dummy buffer_head
 * through the map function of the target covering that block (only
 * targets flagged TF_BMAP are supported) and writes the resulting
 * device and block back to lvb->lv_dev and lvb->lv_block.
 *
 * Returns 0 on success, -ENXIO if there is no such device, -EFAULT on
 * a bad user pointer and -EINVAL if the block cannot be mapped.
 */
static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
{
	struct buffer_head bh;
	struct mapped_device *md;
	unsigned long block;
	int minor = MINOR(inode->i_rdev);
	int err;

	if (minor >= MAX_DEVICES)
		return -ENXIO;

	if (get_user(block, &lvb->lv_block))
		return -EFAULT;

	memset(&bh, 0, sizeof (bh));
	bh.b_blocknr = block;
	bh.b_dev = bh.b_rdev = inode->i_rdev;
	bh.b_size = _blksize_size[minor];
	bh.b_rsector = block * (bh.b_size >> 9);

	err = -EINVAL;
	down_read(&_dev_lock);

	/* look the device up under the lock that protects _devs[] */
	md = _devs[minor];
	if (!md)
		err = -ENXIO;

	else if (test_bit(DM_ACTIVE, &md->state)) {
		struct target *t = md->map->targets +
		    __find_node(md->map, &bh);
		struct target_type *target = t->type;

		if (target->flags & TF_BMAP) {
			err = target->map(&bh, READ, t->private);

			if (bh.b_private) {
				struct io_hook *ih =
				    (struct io_hook *) bh.b_private;
				free_io_hook(ih);
			}

			/*
			 * Only a positive result means the bh was
			 * remapped (see __map_buffer()); zero and
			 * negative results are both failures here.
			 */
			err = (err > 0) ? 0 : -EINVAL;
		}
	}
	up_read(&_dev_lock);

	if (err == 0) {
		if (put_user(kdev_t_to_nr(bh.b_rdev), &lvb->lv_dev))
			return -EFAULT;

		/* the mapped block goes in lv_block, not lv_dev */
		if (put_user(bh.b_rsector / (bh.b_size >> 9),
			     &lvb->lv_block))
			return -EFAULT;
	}

	return err;
}

/*
 * make_request function for the device-mapper major.
 *
 * Returns 1 when the bh was remapped through a target, 0 when the bh
 * was either deferred (device suspended) or failed with
 * buffer_IO_error().
 */
static int request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	struct mapped_device *md;
	int r, minor = MINOR(bh->b_rdev);

	if (minor >= MAX_DEVICES)
		goto bad_no_lock;

	down_read(&_dev_lock);
	md = _devs[minor];

	if (!md || !md->map)
		goto bad;

	/* if we're suspended we have to queue this io for later */
	if (!test_bit(DM_ACTIVE, &md->state)) {
		up_read(&_dev_lock);
		r = queue_io(md, bh, rw);

		if (r < 0)
			goto bad_no_lock;

		else if (r > 0)
			return 0;	/* deferred successfully */

		down_read(&_dev_lock);	/* FIXME: there's still a race here */

		/*
		 * The lock was dropped, so the device or its table may
		 * have gone away in the meantime: look both up again
		 * rather than dereferencing stale pointers.
		 */
		md = _devs[minor];
		if (!md || !md->map)
			goto bad;
	}

	if (!__map_buffer(md, bh, rw, __find_node(md->map, bh)))
		goto bad;

	up_read(&_dev_lock);
	return 1;

      bad:
	up_read(&_dev_lock);

      bad_no_lock:
	buffer_IO_error(bh);
	return 0;
}

/*
 * see if the device with a specific minor # is
 * free.
 *
 * Returns the minor if its slot in _devs[] is empty, -1 otherwise
 * (including when the minor is out of range).
 */
static inline int __specific_dev(int minor)
{
	if (minor >= MAX_DEVICES) {
		WARN("request for a mapped_device > than MAX_DEVICES");
		/* 0 is a valid minor, so failure must be negative */
		return -1;
	}

	if (!_devs[minor])
		return minor;

	return -1;
}

/*
 * find the first free device.
 *
 * Returns the lowest minor with an empty slot in _devs[], or -1.
 */
static inline int __any_old_dev(void)
{
	int i;

	for (i = 0; i < MAX_DEVICES; i++)
		if (!_devs[i])
			return i;

	return -1;
}

/*
 * allocate and initialise a blank device.
 *
 * A negative 'minor' means "pick any free minor".  The new device is
 * entered into _devs[] before returning.  Returns NULL if memory or a
 * free minor could not be obtained.
 */
static struct mapped_device *alloc_dev(int minor)
{
	struct mapped_device *md = kmalloc(sizeof (*md), GFP_KERNEL);

	if (!md)
		return NULL;

	memset(md, 0, sizeof (*md));

	down_write(&_dev_lock);
	minor = (minor < 0) ? __any_old_dev() : __specific_dev(minor);

	if (minor < 0) {
		WARN("no free devices available");
		up_write(&_dev_lock);
		kfree(md);
		return NULL;
	}

	md->dev = MKDEV(DM_BLK_MAJOR, minor);
	md->name[0] = '\0';
	md->state = 0;

	_devs[minor] = md;
	up_write(&_dev_lock);

	return md;
}

/*
 * Look up the mapped_device registered for 'minor'.
 * Returns NULL if the minor is out of range or has no device.
 */
struct mapped_device *dm_find_by_minor(int minor)
{
	struct mapped_device *md;

	if (minor < 0 || minor >= MAX_DEVICES)
		return NULL;

	down_read(&_dev_lock);
	md = _devs[minor];
	up_read(&_dev_lock);

	return md;
}

/* create the devfs node for md; returns 0 or -ENOMEM */
static int register_device(struct mapped_device *md)
{
	md->devfs_entry =
	    devfs_register(_dev_dir, md->name, DEVFS_FL_CURRENT_OWNER,
			   MAJOR(md->dev), MINOR(md->dev),
			   S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
			   &dm_blk_dops, NULL);

	if (!md->devfs_entry)
		return -ENOMEM;

	return 0;
}

/* remove the devfs node for md; always returns 0 */
static int unregister_device(struct mapped_device *md)
{
	devfs_unregister(md->devfs_entry);
	return 0;
}

#ifdef CONFIG_HOTPLUG
/*
 * Run the hotplug helper (hotplug_path) with the argument "devmap",
 * passing the device name in DMNAME and ACTION=add or ACTION=remove
 * depending on 'create'.  Does nothing if no hotplug path is set or
 * the current task has no fs root.
 */
static void dm_sbin_hotplug(struct mapped_device *md, int create)
{
	int i;
	char *argv[3];
	char *envp[5];
	char name[DM_NAME_LEN + 16];

	if (!hotplug_path[0])
		return;

	if (!current->fs->root)
		return;

	sprintf(name, "DMNAME=%s\n", md->name);

	i = 0;
	argv[i++] = hotplug_path;
	argv[i++] = "devmap";
	argv[i] = 0;

	i = 0;
	envp[i++] = "HOME=/";
	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[i++] = name;
	if (create)
		envp[i++] = "ACTION=add";
	else
		envp[i++] = "ACTION=remove";
	envp[i] = 0;

	call_usermodehelper(argv[0], argv, envp);
}
#else
#define dm_sbin_hotplug(md, create) do { } while(0)
#endif				/* CONFIG_HOTPLUG */

/*
 * constructor for a new device
 *
 * Allocates a device on 'minor' (negative: any free minor), names it
 * and registers its devfs node.  Returns the new device or an
 * ERR_PTR(): -ENXIO if no device could be allocated, -ENAMETOOLONG if
 * 'name' does not fit in md->name, or the register_device() error.
 * On failure the minor is released and the device freed.
 */
struct mapped_device *dm_create(const char *name, int minor)
{
	int r;
	struct mapped_device *md;

	if (minor >= MAX_DEVICES)
		return ERR_PTR(-ENXIO);

	if (!(md = alloc_dev(minor)))
		return ERR_PTR(-ENXIO);

	down_write(&_dev_lock);

	/* md->name is a fixed-size array: refuse names that overflow it */
	if (strlen(name) >= sizeof (md->name)) {
		r = -ENAMETOOLONG;
		goto bad;
	}
	strcpy(md->name, name);

	if ((r = register_device(md)))
		goto bad;

	up_write(&_dev_lock);

	dm_sbin_hotplug(md, 1);
	return md;

      bad:
	/* undo alloc_dev(): give the minor back and free the device */
	_devs[MINOR(md->dev)] = 0;
	up_write(&_dev_lock);
	kfree(md);
	return ERR_PTR(r);
}

/*
 * destructor for the device.  md->map is
 * deliberately not destroyed, dm-fs should manage
 * table objects.
 *
 * Returns -EPERM while the device is still open, otherwise 0 (or the
 * unregister_device() error).
 */
int dm_remove(struct mapped_device *md)
{
	int minor, r;

	down_write(&_dev_lock);
	if (md->use_count) {
		up_write(&_dev_lock);
		return -EPERM;
	}

	if ((r = unregister_device(md))) {
		up_write(&_dev_lock);
		return r;
	}

	minor = MINOR(md->dev);
	_devs[minor] = 0;
	up_write(&_dev_lock);

	dm_sbin_hotplug(md, 0);
	kfree(md);

	return 0;
}

/*
 * Bind a table to the device.
 *
 * Publishes the table's size, block size and hard sector size in the
 * per-minor block arrays.  The table must have at least one target
 * (dm_activate() checks this before calling).
 */
void __bind(struct mapped_device *md, struct dm_table *t)
{
	int minor = MINOR(md->dev);

	md->map = t;

	/* In 1024-byte units */
	_block_size[minor] = (t->highs[t->num_targets - 1] + 1) >> 1;

	_blksize_size[minor] = t->blksize_size;
	_hardsect_size[minor] = t->hardsect_size;
	register_disk(NULL, md->dev, 1, &dm_blk_dops, _block_size[minor]);
}

/*
 * requeue the deferred buffer_heads by calling
 * generic_make_request.
 */
static void __flush_deferred_io(struct mapped_device *md)
{
	struct deferred_io *c, *n;

	/* detach the whole list first, then resubmit entry by entry */
	for (c = md->deferred, md->deferred = 0; c; c = n) {
		n = c->next;
		generic_make_request(c->rw, c->bh);
		free_deferred(c);
	}
}

/*
 * make the device available for use, if was
 * previously suspended rather than newly created
 * then all queued io is flushed
 *
 * Returns -EINVAL for a table without targets, -EPERM if the device
 * is already active, 0 on success.
 */
int dm_activate(struct mapped_device *md, struct dm_table *table)
{
	/* check that the mapping has at least been loaded. */
	if (!table->num_targets)
		return -EINVAL;

	down_write(&_dev_lock);

	/* you must be deactivated first */
	if (is_active(md)) {
		up_write(&_dev_lock);
		return -EPERM;
	}

	__bind(md, table);

	set_bit(DM_ACTIVE, &md->state);
	__flush_deferred_io(md);
	up_write(&_dev_lock);

	return 0;
}

/*
 * Deactivate the device, the device must not be
 * opened by anyone.
 *
 * Returns -EPERM if the device is in use, 0 otherwise.
 */
int dm_deactivate(struct mapped_device *md)
{
	down_read(&_dev_lock);
	if (md->use_count) {
		up_read(&_dev_lock);
		return -EPERM;
	}

	fsync_dev(md->dev);

	up_read(&_dev_lock);

	/* the lock is dropped between read and write: check again */
	down_write(&_dev_lock);
	if (md->use_count) {
		/* drat, somebody got in quick ... */
		up_write(&_dev_lock);
		return -EPERM;
	}

	md->map = 0;
	clear_bit(DM_ACTIVE, &md->state);
	up_write(&_dev_lock);

	return 0;
}

/*
 * We need to be able to change a mapping table
 * under a mounted filesystem.  for example we
 * might want to move some data in the background.
 * Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to
 * flush any in flight buffer_heads and ensure
 * that any further io gets deferred.
 *
 * NOTE(review): the task state is set to TASK_UNINTERRUPTIBLE once,
 * before the loop, and down_write() is called inside it; whether the
 * state is still TASK_UNINTERRUPTIBLE at each schedule() call should
 * be confirmed.  Note also that md->map is cleared on exit, and
 * request() fails io for a device with no map.
 */
void dm_suspend(struct mapped_device *md)
{
	DECLARE_WAITQUEUE(wait, current);

	down_write(&_dev_lock);
	if (!is_active(md)) {
		up_write(&_dev_lock);
		return;
	}

	clear_bit(DM_ACTIVE, &md->state);
	up_write(&_dev_lock);

	/* wait for all the pending io to flush */
	add_wait_queue(&md->map->wait, &wait);
	current->state = TASK_UNINTERRUPTIBLE;
	do {
		down_write(&_dev_lock);
		if (!atomic_read(&md->map->pending))
			break;	/* leaves the loop holding _dev_lock */

		up_write(&_dev_lock);
		schedule();

	} while (1);

	current->state = TASK_RUNNING;
	remove_wait_queue(&md->map->wait, &wait);
	md->map = 0;
	up_write(&_dev_lock);
}

struct block_device_operations dm_blk_dops = {
	open:	  dm_blk_open,
	release:  dm_blk_close,
	ioctl:	  dm_blk_ioctl
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

MODULE_DESCRIPTION("device-mapper driver");
MODULE_AUTHOR("Joe Thornber <thornber@btconnect.com>");

/*
 * Local variables:
 * c-file-style: "linux"
 * End:
 */