1
0
mirror of git://sourceware.org/git/lvm2.git synced 2025-01-04 09:18:36 +03:00
lvm2/lib/device/dev-io.c
Zdenek Kabelac 70ad633638 devcache: add reason and always log_error
With these read errors it's useful to know the reason.
Also avoid to log error just once so we know exactly
how many times we did failing read.

On the other hand reduce repeated log_error() on code 'backtrace'
path and change severity of message to just log_debug() so the
actual read error is printed once for one read.
2018-03-15 10:50:28 +01:00

1288 lines
32 KiB
C

/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2012 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
* of the GNU Lesser General Public License v.2.1.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "lib.h"
#include "device.h"
#include "metadata.h"
#include "lvmcache.h"
#include "memlock.h"
#include "locking.h"
#include <limits.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#ifdef __linux__
# define u64 uint64_t /* Missing without __KERNEL__ */
# undef WNOHANG /* Avoid redefinition */
# undef WUNTRACED /* Avoid redefinition */
# include <linux/fs.h> /* For block ioctl definitions */
# define BLKSIZE_SHIFT SECTOR_SHIFT
# ifndef BLKGETSIZE64 /* fs.h out-of-date */
# define BLKGETSIZE64 _IOR(0x12, 114, size_t)
# endif /* BLKGETSIZE64 */
# ifndef BLKDISCARD
# define BLKDISCARD _IO(0x12,119)
# endif
#else
# include <sys/disk.h>
# define BLKBSZGET DKIOCGETBLOCKSIZE
# define BLKSSZGET DKIOCGETBLOCKSIZE
# define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
# define BLKFLSBUF DKIOCSYNCHRONIZECACHE
# define BLKSIZE_SHIFT 0
#endif
#ifdef O_DIRECT_SUPPORT
# ifndef O_DIRECT
# error O_DIRECT support configured but O_DIRECT definition not found in headers
# endif
#endif
/*
* Always read at least 8k from disk.
* This seems to be a good compromise for the existing LVM2 metadata layout.
*/
#define MIN_READ_SIZE (8 * 1024)
static DM_LIST_INIT(_open_devices);
static unsigned _dev_size_seqno = 1;
static const char *_reasons[] = {
"dev signatures",
"PV labels",
"VG metadata header",
"VG metadata content",
"extra VG metadata header",
"extra VG metadata content",
"LVM1 metadata",
"pool metadata",
"LV content",
"logging",
};
static const char *_reason_text(dev_io_reason_t reason)
{
return _reasons[(unsigned) reason];
}
/*
* Release the memory holding the last data we read
*/
static void _release_devbuf(struct device_buffer *devbuf)
{
dm_free(devbuf->malloc_address);
devbuf->malloc_address = NULL;
}
void devbufs_release(struct device *dev)
{
if ((dev->flags & DEV_REGULAR))
return;
_release_devbuf(&dev->last_devbuf);
_release_devbuf(&dev->last_extra_devbuf);
}
#ifdef AIO_SUPPORT
# include <libaio.h>
static io_context_t _aio_ctx = 0;
static struct io_event *_aio_events = NULL;
static int _aio_max = 0;
static int64_t _aio_memory_max = 0;
static int _aio_must_queue = 0; /* Have we reached AIO capacity? */
static DM_LIST_INIT(_aio_queue);
#define DEFAULT_AIO_COLLECTION_EVENTS 32
int dev_async_setup(struct cmd_context *cmd)
{
int r;
_aio_max = find_config_tree_int(cmd, devices_aio_max_CFG, NULL);
_aio_memory_max = find_config_tree_int(cmd, devices_aio_memory_CFG, NULL) * INT64_C(1024 * 1024);
/* Threshold is zero? */
if (!_aio_max || !_aio_memory_max) {
if (_aio_ctx)
dev_async_exit();
return 1;
}
/* Already set up? */
if (_aio_ctx)
return 1;
log_debug_io("Setting up aio context for up to %" PRId64 " MB across %d events.", _aio_memory_max, _aio_max);
if (!_aio_events && !(_aio_events = dm_zalloc(sizeof(*_aio_events) * DEFAULT_AIO_COLLECTION_EVENTS))) {
log_error("Failed to allocate io_event array for asynchronous I/O.");
return 0;
}
if ((r = io_setup(_aio_max, &_aio_ctx)) < 0) {
/*
* Possible errors:
* ENOSYS - aio not available in current kernel
* EAGAIN - _aio_max is too big
* EFAULT - invalid pointer
* EINVAL - _aio_ctx != 0 or kernel aio limits exceeded
* ENOMEM
*/
log_warn("WARNING: Asynchronous I/O setup for %d events failed: %s", _aio_max, strerror(-r));
log_warn("WARNING: Using only synchronous I/O.");
dm_free(_aio_events);
_aio_events = NULL;
_aio_ctx = 0;
return 0;
}
return 1;
}
/* Reset aio context after fork */
int dev_async_reset(struct cmd_context *cmd)
{
log_debug_io("Resetting asynchronous I/O context.");
_aio_ctx = 0;
dm_free(_aio_events);
_aio_events = NULL;
return dev_async_setup(cmd);
}
/*
* Track the amount of in-flight async I/O.
* If it exceeds the defined threshold set _aio_must_queue.
*/
static void _update_aio_counters(int nr, ssize_t bytes)
{
static int64_t aio_bytes = 0;
static int aio_count = 0;
aio_bytes += bytes;
aio_count += nr;
if (aio_count >= _aio_max || aio_bytes > _aio_memory_max)
_aio_must_queue = 1;
else
_aio_must_queue = 0;
}
static int _io(struct device_buffer *devbuf, unsigned ioflags);
int dev_async_getevents(void)
{
struct device_buffer *devbuf, *tmp;
lvm_callback_fn_t dev_read_callback_fn;
void *dev_read_callback_context;
int r, event_nr;
if (!_aio_ctx)
return 1;
do {
/* FIXME Add timeout - currently NULL - waits for ever for at least 1 item */
r = io_getevents(_aio_ctx, 1, DEFAULT_AIO_COLLECTION_EVENTS, _aio_events, NULL);
if (r > 0)
break;
if (!r)
return 1; /* Timeout elapsed */
if (r == -EINTR)
continue;
if (r == -EAGAIN) {
usleep(100);
return 1; /* Give the caller the opportunity to do other work before repeating */
}
/*
* ENOSYS - not supported by kernel
* EFAULT - memory invalid
* EINVAL - _aio_ctx invalid or min_nr/nr/timeout out of range
*/
log_error("Asynchronous event collection failed: %s", strerror(-r));
return 0;
} while (1);
for (event_nr = 0; event_nr < r; event_nr++) {
devbuf = _aio_events[event_nr].obj->data;
dm_free(_aio_events[event_nr].obj);
_update_aio_counters(-1, -devbuf->where.size);
dev_read_callback_fn = devbuf->dev_read_callback_fn;
dev_read_callback_context = devbuf->dev_read_callback_context;
/* Clear the callbacks as a precaution */
devbuf->dev_read_callback_context = NULL;
devbuf->dev_read_callback_fn = NULL;
if (_aio_events[event_nr].res == devbuf->where.size) {
if (dev_read_callback_fn)
dev_read_callback_fn(0, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, (char *)devbuf->buf + devbuf->data_offset);
} else {
/* FIXME If partial read is possible, resubmit remainder */
log_error("%s: asynchronous read only I/O failed (" FMTd64 ") of " FMTu64 " bytes at " FMTu64 " (for %s): %s",
dev_name(devbuf->where.dev), _aio_events[event_nr].res,
(uint64_t) devbuf->where.size, (uint64_t) devbuf->where.start,
_reason_text(devbuf->reason),
((int64_t)_aio_events[event_nr].res < 0) ? strerror(-(int64_t)_aio_events[event_nr].res) : 0);
_release_devbuf(devbuf);
if (dev_read_callback_fn)
dev_read_callback_fn(1, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, NULL);
else
r = 0;
}
}
/* Submit further queued events if we can */
dm_list_iterate_items_gen_safe(devbuf, tmp, &_aio_queue, aio_queued) {
if (_aio_must_queue)
break;
dm_list_del(&devbuf->aio_queued);
_io(devbuf, 1);
}
return 1;
}
static int _io_async(struct device_buffer *devbuf)
{
struct device_area *where = &devbuf->where;
struct iocb *iocb;
int r;
_update_aio_counters(1, devbuf->where.size);
if (!(iocb = dm_malloc(sizeof(*iocb)))) {
log_error("Failed to allocate I/O control block array for asynchronous I/O.");
return 0;
}
io_prep_pread(iocb, dev_fd(where->dev), devbuf->buf, where->size, where->start);
iocb->data = devbuf;
do {
r = io_submit(_aio_ctx, 1L, &iocb);
if (r ==1)
break; /* Success */
if (r == -EAGAIN) {
/* Try to release some resources then retry */
usleep(100);
if (dev_async_getevents())
return_0;
/* FIXME Add counter/timeout so we can't get stuck here for ever */
continue;
}
/*
* Possible errors:
* EFAULT - invalid data
* ENOSYS - no aio support in kernel
* EBADF - bad file descriptor in iocb
* EINVAL - invalid _aio_ctx / iocb not initialised / invalid operation for this fd
*/
log_error("Asynchronous event submission failed: %s", strerror(-r));
return 0;
} while (1);
return 1;
}
void dev_async_exit(void)
{
struct device_buffer *devbuf, *tmp;
lvm_callback_fn_t dev_read_callback_fn;
void *dev_read_callback_context;
int r;
if (!_aio_ctx)
return;
/* Discard any queued requests */
dm_list_iterate_items_gen_safe(devbuf, tmp, &_aio_queue, aio_queued) {
dm_list_del(&devbuf->aio_queued);
_update_aio_counters(-1, -devbuf->where.size);
dev_read_callback_fn = devbuf->dev_read_callback_fn;
dev_read_callback_context = devbuf->dev_read_callback_context;
_release_devbuf(devbuf);
if (dev_read_callback_fn)
dev_read_callback_fn(1, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, NULL);
}
log_debug_io("Destroying aio context.");
if ((r = io_destroy(_aio_ctx)) < 0)
/* Returns -ENOSYS if aio not in kernel or -EINVAL if _aio_ctx invalid */
log_error("Failed to destroy asynchronous I/O context: %s", strerror(-r));
dm_free(_aio_events);
_aio_events = NULL;
_aio_ctx = 0;
}
static void _queue_aio(struct device_buffer *devbuf)
{
dm_list_add(&_aio_queue, &devbuf->aio_queued);
log_debug_io("Queueing aio.");
}
#else
static int _aio_ctx = 0;
static int _aio_must_queue = 0;
int dev_async_setup(struct cmd_context *cmd)
{
return 1;
}
int dev_async_reset(struct cmd_context *cmd)
{
return 1;
}
int dev_async_getevents(void)
{
return 1;
}
void dev_async_exit(void)
{
}
static int _io_async(struct device_buffer *devbuf)
{
return 0;
}
static void _queue_aio(struct device_buffer *devbuf)
{
}
#endif /* AIO_SUPPORT */
/*-----------------------------------------------------------------
* The standard io loop that keeps submitting an io until it's
* all gone.
*---------------------------------------------------------------*/
static int _io_sync(struct device_buffer *devbuf)
{
struct device_area *where = &devbuf->where;
int fd = dev_fd(where->dev);
char *buffer = devbuf->buf;
ssize_t n = 0;
size_t total = 0;
if (lseek(fd, (off_t) where->start, SEEK_SET) == (off_t) -1) {
log_error("%s: lseek %" PRIu64 " failed: %s",
dev_name(where->dev), (uint64_t) where->start,
strerror(errno));
return 0;
}
while (total < (size_t) where->size) {
do
n = devbuf->write ?
write(fd, buffer, (size_t) where->size - total) :
read(fd, buffer, (size_t) where->size - total);
while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
if (n < 0)
log_error("%s: synchronous %s failed after %" PRIu64 " of %" PRIu64
" at %" PRIu64 " (for %s): %s", dev_name(where->dev),
devbuf->write ? "write" : "read",
(uint64_t) total,
(uint64_t) where->size, (uint64_t) where->start,
_reason_text(devbuf->reason),
strerror(errno));
if (n <= 0)
break;
total += n;
buffer += n;
}
return (total == (size_t) where->size);
}
static int _io(struct device_buffer *devbuf, unsigned ioflags)
{
struct device_area *where = &devbuf->where;
int fd = dev_fd(where->dev);
int async = (!devbuf->write && _aio_ctx && aio_supported_code_path(ioflags) && devbuf->dev_read_callback_fn) ? 1 : 0;
if (fd < 0) {
log_error("Attempt to read an unopened device (%s).",
dev_name(where->dev));
return 0;
}
if (!devbuf->buf && !(devbuf->malloc_address = devbuf->buf = dm_malloc_aligned((size_t) devbuf->where.size, 0))) {
log_error("I/O buffer malloc failed");
return 0;
}
log_debug_io("%s %s(fd %d):%8" PRIu64 " bytes (%ssync) at %" PRIu64 "%s (for %s)",
devbuf->write ? "Write" : "Read ", dev_name(where->dev), fd,
where->size, async ? "a" : "", (uint64_t) where->start,
(devbuf->write && test_mode()) ? " (test mode - suppressed)" : "", _reason_text(devbuf->reason));
/*
* Skip all writes in test mode.
*/
if (devbuf->write && test_mode())
return 1;
if (where->size > SSIZE_MAX) {
log_error("Read size too large: %" PRIu64, where->size);
return 0;
}
return async ? _io_async(devbuf) : _io_sync(devbuf);
}
/*-----------------------------------------------------------------
* LVM2 uses O_DIRECT when performing metadata io, which requires
* block size aligned accesses. If any io is not aligned we have
* to perform the io via a bounce buffer, obviously this is quite
* inefficient.
*---------------------------------------------------------------*/
/*
* Get the physical and logical block size for a device.
*/
int dev_get_block_size(struct device *dev, unsigned int *physical_block_size, unsigned int *block_size)
{
const char *name = dev_name(dev);
int needs_open;
int r = 1;
needs_open = (!dev->open_count && (dev->phys_block_size == -1 || dev->block_size == -1));
if (needs_open && !dev_open_readonly(dev))
return_0;
if (dev->block_size == -1) {
if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
log_sys_error("ioctl BLKBSZGET", name);
r = 0;
goto out;
}
log_debug_devs("%s: Block size is %u bytes", name, dev->block_size);
}
#ifdef BLKPBSZGET
/* BLKPBSZGET is available in kernel >= 2.6.32 only */
if (dev->phys_block_size == -1) {
if (ioctl(dev_fd(dev), BLKPBSZGET, &dev->phys_block_size) < 0) {
log_sys_error("ioctl BLKPBSZGET", name);
r = 0;
goto out;
}
log_debug_devs("%s: Physical block size is %u bytes", name, dev->phys_block_size);
}
#elif defined (BLKSSZGET)
/* if we can't get physical block size, just use logical block size instead */
if (dev->phys_block_size == -1) {
if (ioctl(dev_fd(dev), BLKSSZGET, &dev->phys_block_size) < 0) {
log_sys_error("ioctl BLKSSZGET", name);
r = 0;
goto out;
}
log_debug_devs("%s: Physical block size can't be determined: Using logical block size of %u bytes", name, dev->phys_block_size);
}
#else
/* if even BLKSSZGET is not available, use default 512b */
if (dev->phys_block_size == -1) {
dev->phys_block_size = 512;
log_debug_devs("%s: Physical block size can't be determined: Using block size of %u bytes instead", name, dev->phys_block_size);
}
#endif
*physical_block_size = (unsigned int) dev->phys_block_size;
*block_size = (unsigned int) dev->block_size;
out:
if (needs_open && !dev_close(dev))
stack;
return r;
}
/*
* Widens a region to be an aligned region.
*/
static void _widen_region(unsigned int block_size, struct device_area *region,
struct device_area *result)
{
uint64_t mask = block_size - 1, delta;
memcpy(result, region, sizeof(*result));
/* adjust the start */
delta = result->start & mask;
if (delta) {
result->start -= delta;
result->size += delta;
}
/* adjust the end */
delta = (result->start + result->size) & mask;
if (delta)
result->size += block_size - delta;
}
static int _aligned_io(struct device_area *where, char *write_buffer,
int should_write, dev_io_reason_t reason,
unsigned ioflags, lvm_callback_fn_t dev_read_callback_fn, void *dev_read_callback_context)
{
unsigned int physical_block_size = 0;
unsigned int block_size = 0;
unsigned buffer_was_widened = 0;
uintptr_t mask;
struct device_area widened;
struct device_buffer *devbuf;
int r = 0;
if (!(where->dev->flags & DEV_REGULAR) &&
!dev_get_block_size(where->dev, &physical_block_size, &block_size))
return_0;
if (!block_size)
block_size = lvm_getpagesize();
/* Apply minimum read size */
if (!should_write && block_size < MIN_READ_SIZE)
block_size = MIN_READ_SIZE;
mask = block_size - 1;
_widen_region(block_size, where, &widened);
/* Did we widen the buffer? When writing, this means means read-modify-write. */
if (where->size != widened.size || where->start != widened.start) {
buffer_was_widened = 1;
log_debug_io("Widening request for %" PRIu64 " bytes at %" PRIu64 " to %" PRIu64 " bytes at %" PRIu64 " on %s (for %s)",
where->size, (uint64_t) where->start, widened.size, (uint64_t) widened.start, dev_name(where->dev), _reason_text(reason));
}
devbuf = DEV_DEVBUF(where->dev, reason);
_release_devbuf(devbuf);
devbuf->where.dev = where->dev;
devbuf->where.start = widened.start;
devbuf->where.size = widened.size;
devbuf->write = should_write;
devbuf->reason = reason;
devbuf->dev_read_callback_fn = dev_read_callback_fn;
devbuf->dev_read_callback_context = dev_read_callback_context;
/* Store location of requested data relative to start of buf */
devbuf->data_offset = where->start - devbuf->where.start;
if (should_write && !buffer_was_widened && !((uintptr_t) write_buffer & mask))
/* Perform the I/O directly. */
devbuf->buf = write_buffer;
else if (!should_write)
/* Postpone buffer allocation until we're about to issue the I/O */
devbuf->buf = NULL;
else {
/* Allocate a bounce buffer with an extra block */
if (!(devbuf->malloc_address = devbuf->buf = dm_malloc((size_t) devbuf->where.size + block_size))) {
log_error("Bounce buffer malloc failed");
return 0;
}
/*
* Realign start of bounce buffer (using the extra sector)
*/
if (((uintptr_t) devbuf->buf) & mask)
devbuf->buf = (char *) ((((uintptr_t) devbuf->buf) + mask) & ~mask);
}
/* If we've reached our concurrent AIO limit, add this request to the queue */
if (!devbuf->write && _aio_ctx && aio_supported_code_path(ioflags) && dev_read_callback_fn && _aio_must_queue) {
_queue_aio(devbuf);
return 1;
}
devbuf->write = 0;
/* Do we need to read into the bounce buffer? */
if ((!should_write || buffer_was_widened) && !_io(devbuf, ioflags)) {
if (!should_write)
goto_bad;
/* FIXME Handle errors properly! */
/* FIXME pre-extend the file */
memset(devbuf->buf, '\n', devbuf->where.size);
}
if (!should_write)
return 1;
/* writes */
if (devbuf->malloc_address) {
memcpy((char *) devbuf->buf + devbuf->data_offset, write_buffer, (size_t) where->size);
log_debug_io("Overwriting %" PRIu64 " bytes at %" PRIu64 " (for %s)", where->size,
(uint64_t) where->start, _reason_text(devbuf->reason));
}
/* ... then we write */
devbuf->write = 1;
if (!(r = _io(devbuf, 0)))
goto_bad;
_release_devbuf(devbuf);
return 1;
bad:
_release_devbuf(devbuf);
return 0;
}
static int _dev_get_size_file(struct device *dev, uint64_t *size)
{
const char *name = dev_name(dev);
struct stat info;
if (dev->size_seqno == _dev_size_seqno) {
log_very_verbose("%s: using cached size %" PRIu64 " sectors",
name, dev->size);
*size = dev->size;
return 1;
}
if (stat(name, &info)) {
log_sys_error("stat", name);
return 0;
}
*size = info.st_size;
*size >>= SECTOR_SHIFT; /* Convert to sectors */
dev->size = *size;
dev->size_seqno = _dev_size_seqno;
log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
return 1;
}
static int _dev_get_size_dev(struct device *dev, uint64_t *size)
{
const char *name = dev_name(dev);
if (dev->size_seqno == _dev_size_seqno) {
log_very_verbose("%s: using cached size %" PRIu64 " sectors",
name, dev->size);
*size = dev->size;
return 1;
}
if (!dev_open_readonly(dev))
return_0;
if (ioctl(dev_fd(dev), BLKGETSIZE64, size) < 0) {
log_sys_error("ioctl BLKGETSIZE64", name);
if (!dev_close(dev))
log_sys_error("close", name);
return 0;
}
*size >>= BLKSIZE_SHIFT; /* Convert to sectors */
dev->size = *size;
dev->size_seqno = _dev_size_seqno;
if (!dev_close(dev))
log_sys_error("close", name);
log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
return 1;
}
static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
{
long read_ahead_long;
if (dev->read_ahead != -1) {
*read_ahead = (uint32_t) dev->read_ahead;
return 1;
}
if (!dev_open_readonly(dev))
return_0;
if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
log_sys_error("ioctl BLKRAGET", dev_name(dev));
if (!dev_close(dev))
stack;
return 0;
}
*read_ahead = (uint32_t) read_ahead_long;
dev->read_ahead = read_ahead_long;
log_very_verbose("%s: read_ahead is %u sectors",
dev_name(dev), *read_ahead);
if (!dev_close(dev))
stack;
return 1;
}
static int _dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64_t size_bytes)
{
uint64_t discard_range[2];
if (!dev_open(dev))
return_0;
discard_range[0] = offset_bytes;
discard_range[1] = size_bytes;
log_debug_devs("Discarding %" PRIu64 " bytes offset %" PRIu64 " bytes on %s.",
size_bytes, offset_bytes, dev_name(dev));
if (ioctl(dev->fd, BLKDISCARD, &discard_range) < 0) {
log_error("%s: BLKDISCARD ioctl at offset %" PRIu64 " size %" PRIu64 " failed: %s.",
dev_name(dev), offset_bytes, size_bytes, strerror(errno));
if (!dev_close(dev))
stack;
/* It doesn't matter if discard failed, so return success. */
return 1;
}
if (!dev_close(dev))
stack;
return 1;
}
/*-----------------------------------------------------------------
* Public functions
*---------------------------------------------------------------*/
void dev_size_seqno_inc(void)
{
_dev_size_seqno++;
}
int dev_get_size(struct device *dev, uint64_t *size)
{
if (!dev)
return 0;
if ((dev->flags & DEV_REGULAR))
return _dev_get_size_file(dev, size);
return _dev_get_size_dev(dev, size);
}
int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
{
if (!dev)
return 0;
if (dev->flags & DEV_REGULAR) {
*read_ahead = 0;
return 1;
}
return _dev_read_ahead_dev(dev, read_ahead);
}
int dev_discard_blocks(struct device *dev, uint64_t offset_bytes, uint64_t size_bytes)
{
if (!dev)
return 0;
if (dev->flags & DEV_REGULAR)
return 1;
return _dev_discard_blocks(dev, offset_bytes, size_bytes);
}
void dev_flush(struct device *dev)
{
if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
return;
if (fsync(dev->fd) >= 0)
return;
sync();
}
int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
{
struct stat buf;
const char *name;
int need_excl = 0, need_rw = 0;
if ((flags & O_ACCMODE) == O_RDWR)
need_rw = 1;
if ((flags & O_EXCL))
need_excl = 1;
if (dev->fd >= 0) {
if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
dev->open_count++;
return 1;
}
if (dev->open_count && !need_excl)
log_debug_devs("%s: Already opened read-only. Upgrading "
"to read-write.", dev_name(dev));
/* dev_close_immediate will decrement this */
dev->open_count++;
dev_close_immediate(dev);
// FIXME: dev with DEV_ALLOCED is released
// but code is referencing it
}
if (critical_section())
/* FIXME Make this log_error */
log_verbose("dev_open(%s) called while suspended",
dev_name(dev));
if (!(name = dev_name_confirmed(dev, quiet)))
return_0;
#ifdef O_DIRECT_SUPPORT
if (direct) {
if (!(dev->flags & DEV_O_DIRECT_TESTED))
dev->flags |= DEV_O_DIRECT;
if ((dev->flags & DEV_O_DIRECT))
flags |= O_DIRECT;
}
#endif
#ifdef O_NOATIME
/* Don't update atime on device inodes */
if (!(dev->flags & DEV_REGULAR) && !(dev->flags & DEV_NOT_O_NOATIME))
flags |= O_NOATIME;
#endif
if ((dev->fd = open(name, flags, 0777)) < 0) {
#ifdef O_NOATIME
if ((errno == EPERM) && (flags & O_NOATIME)) {
flags &= ~O_NOATIME;
dev->flags |= DEV_NOT_O_NOATIME;
if ((dev->fd = open(name, flags, 0777)) >= 0) {
log_debug_devs("%s: Not using O_NOATIME", name);
goto opened;
}
}
#endif
#ifdef O_DIRECT_SUPPORT
if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
flags &= ~O_DIRECT;
if ((dev->fd = open(name, flags, 0777)) >= 0) {
dev->flags &= ~DEV_O_DIRECT;
log_debug_devs("%s: Not using O_DIRECT", name);
goto opened;
}
}
#endif
if (quiet)
log_sys_debug("open", name);
else
log_sys_error("open", name);
dev->flags |= DEV_OPEN_FAILURE;
return 0;
}
#ifdef O_DIRECT_SUPPORT
opened:
if (direct)
dev->flags |= DEV_O_DIRECT_TESTED;
#endif
dev->open_count++;
dev->flags &= ~DEV_ACCESSED_W;
if (need_rw)
dev->flags |= DEV_OPENED_RW;
else
dev->flags &= ~DEV_OPENED_RW;
if (need_excl)
dev->flags |= DEV_OPENED_EXCL;
else
dev->flags &= ~DEV_OPENED_EXCL;
if (!(dev->flags & DEV_REGULAR) &&
((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
log_error("%s: fstat failed: Has device name changed?", name);
dev_close_immediate(dev);
return 0;
}
#ifndef O_DIRECT_SUPPORT
if (!(dev->flags & DEV_REGULAR))
dev_flush(dev);
#endif
if ((flags & O_CREAT) && !(flags & O_TRUNC))
dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);
dm_list_add(&_open_devices, &dev->open_list);
log_debug_devs("Opened %s %s%s%s", dev_name(dev),
dev->flags & DEV_OPENED_RW ? "RW" : "RO",
dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");
dev->flags &= ~DEV_OPEN_FAILURE;
return 1;
}
int dev_open_quiet(struct device *dev)
{
return dev_open_flags(dev, O_RDWR, 1, 1);
}
int dev_open(struct device *dev)
{
return dev_open_flags(dev, O_RDWR, 1, 0);
}
int dev_open_readonly(struct device *dev)
{
return dev_open_flags(dev, O_RDONLY, 1, 0);
}
int dev_open_readonly_buffered(struct device *dev)
{
return dev_open_flags(dev, O_RDONLY, 0, 0);
}
int dev_open_readonly_quiet(struct device *dev)
{
return dev_open_flags(dev, O_RDONLY, 1, 1);
}
int dev_test_excl(struct device *dev)
{
int flags;
int r;
flags = vg_write_lock_held() ? O_RDWR : O_RDONLY;
flags |= O_EXCL;
r = dev_open_flags(dev, flags, 1, 1);
if (r)
dev_close_immediate(dev);
return r;
}
static void _close(struct device *dev)
{
if (close(dev->fd))
log_sys_error("close", dev_name(dev));
dev->fd = -1;
dev->phys_block_size = -1;
dev->block_size = -1;
dm_list_del(&dev->open_list);
devbufs_release(dev);
log_debug_devs("Closed %s", dev_name(dev));
if (dev->flags & DEV_ALLOCED)
dev_destroy_file(dev);
}
static int _dev_close(struct device *dev, int immediate)
{
if (dev->fd < 0) {
log_error("Attempt to close device '%s' "
"which is not open.", dev_name(dev));
return 0;
}
#ifndef O_DIRECT_SUPPORT
if (dev->flags & DEV_ACCESSED_W)
dev_flush(dev);
#endif
if (dev->open_count > 0)
dev->open_count--;
if (immediate && dev->open_count)
log_debug_devs("%s: Immediate close attempt while still referenced",
dev_name(dev));
/* Close unless device is known to belong to a locked VG */
if (immediate ||
(dev->open_count < 1 && !lvmcache_pvid_is_locked(dev->pvid)))
_close(dev);
return 1;
}
int dev_close(struct device *dev)
{
return _dev_close(dev, 0);
}
int dev_close_immediate(struct device *dev)
{
return _dev_close(dev, 1);
}
void dev_close_all(void)
{
struct dm_list *doh, *doht;
struct device *dev;
dm_list_iterate_safe(doh, doht, &_open_devices) {
dev = dm_list_struct_base(doh, struct device, open_list);
if (dev->open_count < 1)
_close(dev);
}
}
static inline int _dev_is_valid(struct device *dev)
{
return (dev->max_error_count == NO_DEV_ERROR_COUNT_LIMIT ||
dev->error_count < dev->max_error_count);
}
static void _dev_inc_error_count(struct device *dev)
{
if (++dev->error_count == dev->max_error_count)
log_warn("WARNING: Error counts reached a limit of %d. "
"Device %s was disabled",
dev->max_error_count, dev_name(dev));
}
/*
* Data is returned (read-only) at DEV_DEVBUF_DATA(dev, reason).
* If dev_read_callback_fn is supplied, we always return 1 and take
* responsibility for calling it exactly once. This might happen before the
* function returns (if there's an error or the I/O is synchronous) or after.
* Any error is passed to that function, which must track it if required.
*/
static int _dev_read_callback(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason,
unsigned ioflags, lvm_callback_fn_t dev_read_callback_fn, void *callback_context)
{
struct device_area where;
struct device_buffer *devbuf;
uint64_t buf_end;
int cached = 0;
int ret = 0;
if (!dev->open_count) {
log_error(INTERNAL_ERROR "Attempt to access device %s while closed.", dev_name(dev));
goto out;
}
if (!_dev_is_valid(dev))
goto_out;
/*
* Can we satisfy this from data we stored last time we read?
*/
if ((devbuf = DEV_DEVBUF(dev, reason)) && devbuf->malloc_address) {
buf_end = devbuf->where.start + devbuf->where.size - 1;
if (offset >= devbuf->where.start && offset <= buf_end && offset + len - 1 <= buf_end) {
/* Reuse this buffer */
cached = 1;
devbuf->data_offset = offset - devbuf->where.start;
log_debug_io("Cached read for %" PRIu64 " bytes at %" PRIu64 " on %s (for %s)",
(uint64_t) len, (uint64_t) offset, dev_name(dev), _reason_text(reason));
ret = 1;
goto out;
}
}
where.dev = dev;
where.start = offset;
where.size = len;
ret = _aligned_io(&where, NULL, 0, reason, ioflags, dev_read_callback_fn, callback_context);
if (!ret) {
log_debug("Read from %s failed (for %s).", dev_name(dev), _reason_text(reason));
_dev_inc_error_count(dev);
}
out:
/* If we had an error or this was sync I/O, pass the result to any callback fn */
if ((!ret || !_aio_ctx || !aio_supported_code_path(ioflags) || cached) && dev_read_callback_fn) {
dev_read_callback_fn(!ret, ioflags, callback_context, DEV_DEVBUF_DATA(dev, reason));
return 1;
}
return ret;
}
void dev_read_callback(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason,
unsigned ioflags, lvm_callback_fn_t dev_read_callback_fn, void *callback_context)
{
/* Always returns 1 if callback fn is supplied */
if (!_dev_read_callback(dev, offset, len, reason, ioflags, dev_read_callback_fn, callback_context))
log_error(INTERNAL_ERROR "_dev_read_callback failed");
}
/* Returns pointer to read-only buffer. Caller does not free it. */
const char *dev_read(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason)
{
if (!_dev_read_callback(dev, offset, len, reason, 0, NULL, NULL))
return_NULL;
return DEV_DEVBUF_DATA(dev, reason);
}
/* Read into supplied retbuf owned by the caller. */
int dev_read_buf(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason, void *retbuf)
{
if (!_dev_read_callback(dev, offset, len, reason, 0, NULL, NULL))
return_0;
memcpy(retbuf, DEV_DEVBUF_DATA(dev, reason), len);
return 1;
}
/*
* Read from 'dev' in 2 distinct regions, denoted by (offset,len) and (offset2,len2).
* Caller is responsible for dm_free().
*/
const char *dev_read_circular(struct device *dev, uint64_t offset, size_t len,
uint64_t offset2, size_t len2, dev_io_reason_t reason)
{
char *buf = NULL;
if (!(buf = dm_malloc(len + len2))) {
log_error("Buffer allocation failed for split metadata.");
return NULL;
}
if (!dev_read_buf(dev, offset, len, reason, buf)) {
log_error("Read from %s failed", dev_name(dev));
dm_free(buf);
return NULL;
}
if (!dev_read_buf(dev, offset2, len2, reason, buf + len)) {
log_error("Circular read from %s failed", dev_name(dev));
dm_free(buf);
return NULL;
}
return buf;
}
/* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
* But fails if concurrent processes writing
*/
/* FIXME pre-extend the file */
int dev_append(struct device *dev, size_t len, dev_io_reason_t reason, char *buffer)
{
int r;
if (!dev->open_count)
return_0;
r = dev_write(dev, dev->end, len, reason, buffer);
dev->end += (uint64_t) len;
#ifndef O_DIRECT_SUPPORT
dev_flush(dev);
#endif
return r;
}
int dev_write(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason, void *buffer)
{
struct device_area where;
int ret;
if (!dev->open_count)
return_0;
if (!_dev_is_valid(dev))
return 0;
if (!len) {
log_error(INTERNAL_ERROR "Attempted to write 0 bytes to %s at " FMTu64, dev_name(dev), offset);
return 0;
}
where.dev = dev;
where.start = offset;
where.size = len;
dev->flags |= DEV_ACCESSED_W;
ret = _aligned_io(&where, buffer, 1, reason, 0, NULL, NULL);
if (!ret)
_dev_inc_error_count(dev);
return ret;
}
int dev_set(struct device *dev, uint64_t offset, size_t len, dev_io_reason_t reason, int value)
{
size_t s;
char buffer[4096] __attribute__((aligned(4096)));
if (!dev_open(dev))
return_0;
if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
log_debug_devs("Wiping %s at %" PRIu64 " length %" PRIsize_t,
dev_name(dev), offset, len);
else
log_debug_devs("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
" sectors", dev_name(dev), offset >> SECTOR_SHIFT,
len >> SECTOR_SHIFT);
memset(buffer, value, sizeof(buffer));
while (1) {
s = len > sizeof(buffer) ? sizeof(buffer) : len;
if (!dev_write(dev, offset, s, reason, buffer))
break;
len -= s;
if (!len)
break;
offset += s;
}
dev->flags |= DEV_ACCESSED_W;
if (!dev_close(dev))
stack;
return (len == 0);
}