
[io paths] Unpick agk's aio stuff

Joe Thornber 2018-04-20 10:43:50 -05:00 committed by David Teigland
parent d51429254f
commit 00f1b208a1
46 changed files with 584 additions and 1978 deletions


@ -59,22 +59,6 @@ devices {
# This configuration option is advanced.
scan = [ "/dev" ]
# Configuration option devices/use_aio.
# Use linux asynchronous I/O for parallel device access where possible.
# This configuration option has an automatic default value.
# use_aio = 1
# Configuration option devices/aio_max.
# Maximum number of asynchronous I/Os to issue concurrently.
# This configuration option has an automatic default value.
# aio_max = 128
# Configuration option devices/aio_memory.
# Approximate maximum total amount of memory (in MB) used
# for asynchronous I/O buffers.
# This configuration option has an automatic default value.
# aio_memory = 10
# Configuration option devices/obtain_device_list_from_udev.
# Obtain the list of available devices from udev.
# This avoids opening or using any inapplicable non-block devices or


@ -39,7 +39,6 @@ case "$host_os" in
LDDEPS="$LDDEPS .export.sym"
LIB_SUFFIX=so
DEVMAPPER=yes
AIO=yes
BUILD_LVMETAD=no
BUILD_LVMPOLLD=no
LOCKDSANLOCK=no
@ -59,7 +58,6 @@ case "$host_os" in
CLDNOWHOLEARCHIVE=
LIB_SUFFIX=dylib
DEVMAPPER=yes
AIO=no
ODIRECT=no
DM_IOCTLS=no
SELINUX=no
@ -1124,24 +1122,6 @@ if test "$DEVMAPPER" = yes; then
AC_DEFINE([DEVMAPPER_SUPPORT], 1, [Define to 1 to enable LVM2 device-mapper interaction.])
fi
################################################################################
dnl -- Disable aio
AC_MSG_CHECKING(whether to use asynchronous I/O)
AC_ARG_ENABLE(aio,
AC_HELP_STRING([--disable-aio],
[disable asynchronous I/O]),
AIO=$enableval)
AC_MSG_RESULT($AIO)
if test "$AIO" = yes; then
AC_CHECK_LIB(aio, io_setup,
[AC_DEFINE([AIO_SUPPORT], 1, [Define to 1 if aio is available.])
AIO_LIBS="-laio"
AIO_SUPPORT=yes],
[AIO_LIBS=
AIO_SUPPORT=no ])
fi
################################################################################
dnl -- Build lvmetad
AC_MSG_CHECKING(whether to build LVMetaD)
@ -2081,11 +2061,9 @@ AC_SUBST(DEFAULT_USE_LVMETAD)
AC_SUBST(DEFAULT_USE_LVMPOLLD)
AC_SUBST(DEFAULT_USE_LVMLOCKD)
AC_SUBST(DEVMAPPER)
AC_SUBST(AIO)
AC_SUBST(DLM_CFLAGS)
AC_SUBST(DLM_LIBS)
AC_SUBST(DL_LIBS)
AC_SUBST(AIO_LIBS)
AC_SUBST(DMEVENTD_PATH)
AC_SUBST(DM_LIB_PATCHLEVEL)
AC_SUBST(ELDFLAGS)


@ -1,215 +0,0 @@
Introducing asynchronous I/O to LVM
===================================
Issuing I/O asynchronously means instructing the kernel to perform specific
I/O and return immediately without waiting for it to complete. The data
is collected from the kernel later.
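To make the mechanism concrete, here is a minimal sketch of the Linux libaio
pattern this document builds on: describe an I/O, submit it, carry on with
other work, then collect the completion later. It is illustrative only, not
LVM2 code; it assumes libaio is installed (link with -laio), and the path
argument, buffer size and error handling are placeholders.

#define _GNU_SOURCE                     /* for O_DIRECT */
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd, r;

	if (argc < 2 || (fd = open(argv[1], O_RDONLY | O_DIRECT)) < 0)
		return 1;

	/* O_DIRECT requires block-aligned buffers, offsets and sizes. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	if ((r = io_setup(128, &ctx)) < 0) {            /* create the aio context */
		fprintf(stderr, "io_setup: %s\n", strerror(-r));
		return 1;
	}

	io_prep_pread(&cb, fd, buf, 4096, 0);           /* describe one 4k read at offset 0 */

	if (io_submit(ctx, 1, cbs) != 1)                /* issue it; returns without waiting */
		return 1;

	/* ... the program is free to do other work here ... */

	if (io_getevents(ctx, 1, 1, &ev, NULL) == 1)    /* collect the completion */
		printf("read returned %lld bytes\n", (long long) ev.res);

	io_destroy(ctx);
	close(fd);
	free(buf);
	return 0;
}

Something like "cc aio_demo.c -laio" builds it; pointing it at a readable
block device shows the submit/collect split that the rest of this document
exploits.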
Advantages
----------
A1. While waiting for the I/O to happen, the program could perform other
operations.
A2. When LVM is searching for its Physical Volumes, it issues a small amount of
I/O to a large number of disks. If this were issued in parallel the overall
runtime might be shorter, with little effect on the CPU time.
A3. If timeouts occur while accessing more than one device, the waits can
overlap, again reducing the runtime. This applies globally, not just while
the code is searching for Physical Volumes, so reading, writing and
committing the metadata may occasionally benefit to some extent too, and
there are probably maintenance advantages in using the same method of I/O
throughout the main body of the code.
A4. By introducing a simple callback function mechanism, the conversion can be
performed largely incrementally by first refactoring and continuing to
use synchronous I/O with the callbacks performed immediately. This allows the
callbacks to be introduced without changing the running sequence of the code
initially. Future projects could refactor some of the calling sites to
simplify the code structure and even eliminate some of the nesting.
This allows each part of what might ultimately amount to a large change to be
introduced and tested independently.
Disadvantages
-------------
D1. The resulting code may be more complex with more failure modes to
handle. Mitigate by thorough auditing and testing, rolling out
gradually, and offering a simple switch to revert to the old behaviour.
D2. The Linux asynchronous I/O implementation is less mature than
its synchronous I/O implementation and might expose problems that
depend on the version of the kernel or library used. Fixes or
workarounds for some of these might require kernel changes. For
example, there are suggestions that despite being supposedly async,
there are still cases where system calls can block. There might be
resource dependencies on other processes running on the system that make
it unsuitable for use while any devices are suspended. Mitigation
as for D1.
D3. The error handling within callbacks becomes more complicated.
However we know that existing call paths can already sometimes discard
errors, sometimes deliberately, sometimes not, so this aspect is in need
of a complete review anyway and the new approach will make the error
handling more transparent. Aim initially for overall behaviour that is
no worse than that of the existing code, then work on improving it
later.
D4. The work will take a few weeks to code and test. This leads to a
significant opportunity cost when compared against other enhancements
that could be achieved in that time. However, the proof-of-concept work
performed while writing this design has satisfied me that the work could
proceed and be committed incrementally as a background task.
Observations regarding LVM's I/O Architecture
---------------------------------------------
H1. All device, metadata and config file I/O is constrained to pass through a
single route in lib/device.
H2. The first step of the analysis was to instrument this code path with
log_debug messages. I/O is split into the following categories:
"dev signatures",
"PV labels",
"VG metadata header",
"VG metadata content",
"extra VG metadata header",
"extra VG metadata content",
"LVM1 metadata",
"pool metadata",
"LV content",
"logging",
H3. A bounce buffer is used for most I/O.
H4. Most callers finish using the supplied data before any further I/O is
issued. The few that don't could be converted trivially to do so.
H5. There is one stream of I/O per metadata area on each device.
H6. Some reads fall at offsets close to immediately preceding reads, so it's
possible to avoid these by caching one "block" per metadata area I/O stream.
H7. Simple analysis suggests a minimum aligned read size of 8k would deliver
immediate gains from this caching (a sketch follows this list). A larger size
might perform worse because almost all the time the extra data read would not
be used, but this can be re-examined and tuned after the code is in place.
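Below is a minimal sketch of the single-block caching idea in H6/H7: one
aligned 8k block is kept per metadata area I/O stream and nearby reads are
satisfied from it. The names (block_cache, cached_read, read_fn_t) and the
fall-back behaviour are illustrative assumptions, not the eventual LVM2
interface.

#include <stdint.h>
#include <stddef.h>

#define CACHE_BLOCK_SIZE (8 * 1024)     /* the 8k minimum read size from H7 */

/* Stand-in for whatever low-level aligned read the device layer provides. */
typedef int (*read_fn_t)(int fd, uint64_t offset, size_t len, char *buf);

struct block_cache {
	int valid;
	uint64_t start;                 /* device offset of the cached block */
	char buf[CACHE_BLOCK_SIZE];
};

static const char *cached_read(struct block_cache *c, int fd, read_fn_t read_fn,
			       uint64_t offset, size_t len)
{
	uint64_t block_start = offset & ~(uint64_t) (CACHE_BLOCK_SIZE - 1);

	/* Requests spanning blocks would fall back to an ordinary read. */
	if (offset + len > block_start + CACHE_BLOCK_SIZE)
		return NULL;

	if (!c->valid || c->start != block_start) {
		if (!read_fn(fd, block_start, CACHE_BLOCK_SIZE, c->buf))
			return NULL;
		c->start = block_start;
		c->valid = 1;
	}

	/* Hand back read-only data inside the cached block (cf. P4 below). */
	return c->buf + (offset - block_start);
}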
Proposal
--------
P1. Retain the "single I/O path" but offer an asynchronous option.
P2. Eliminate the bounce buffer in most cases by improving alignment.
P3. Reduce the number of reads by always reading a minimum of an aligned
8k block.
P4. Eliminate repeated reads by caching the last block read and changing
the lib/device interface to return a pointer to read-only data within
this block.
P5. Only perform these interface changes for code on the critical path
for now by converting other code sites to use wrappers around the new
interface.
P6. Treat asynchronous I/O as the interface of choice and optimise only
for this case.
P7. Convert the callers on the critical path to pass callback functions
to the device layer. These functions will be called later with the
read-only data, a context pointer and a success/failure indicator.
Where an existing function performs a sequence of I/O, this has the
advantage of breaking up the large function into smaller ones and
wrapping the parameters used into structures. While this might look
rather messy and ad-hoc in the short-term, it's a first step towards
breaking up confusingly long functions into component parts and wrapping
the existing long parameter lists into more appropriate structures and
refactoring these parts of the code.
P8. Limit the resources used by the asynchronous I/O by using two
tunable parameters, one limiting the number of outstanding I/Os issued
and another limiting the total amount of memory used.
P9. Provide a fallback option if asynchronous I/O is unavailable by
sharing the code paths but issuing the I/O synchronously and calling the
callback immediately.
P10. Only allocate the buffer for the I/O at the point where the I/O is
about to be issued.
P11. If the thresholds are exceeded, add the request to a simple queue
and process it later after some I/O has completed, as sketched below.
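The following minimal sketch shows how P7-P11 could fit together: the
callback type from P7, the two resource thresholds from P8, the synchronous
fallback from P9 and the queue from P11. All names, limits and helper
functions (sync_read, submit_async_read) are illustrative placeholders, not
the actual lib/device interface.

#include <stdint.h>
#include <stddef.h>

typedef void (*io_callback_fn)(int failed, void *context, const void *data);

struct io_request {
	uint64_t offset;
	size_t len;
	io_callback_fn callback;
	void *context;
	struct io_request *next;        /* chained while over a threshold (P11) */
};

/* Placeholders for whatever the device layer really provides. */
static const void *sync_read(uint64_t offset, size_t len) { (void) offset; (void) len; return NULL; }
static void submit_async_read(struct io_request *rq) { (void) rq; }

static int ios_in_flight, max_ios = 128;                        /* P8: outstanding I/O limit */
static int64_t bytes_in_flight, max_bytes = 10 * 1024 * 1024;   /* P8: buffer memory limit */
static struct io_request *queue_head;

static void issue_or_queue(struct io_request *rq, int async_available)
{
	if (!async_available) {
		/* P9: synchronous fallback - do the read now and call back immediately. */
		const void *data = sync_read(rq->offset, rq->len);
		rq->callback(data ? 0 : 1, rq->context, data);
		return;
	}

	if (ios_in_flight >= max_ios || bytes_in_flight >= max_bytes) {
		/* P11: over a threshold, so park the request until completions free resources. */
		rq->next = queue_head;
		queue_head = rq;
		return;
	}

	ios_in_flight++;
	bytes_in_flight += rq->len;
	submit_async_read(rq);          /* P10: the real code would only allocate the buffer here */
}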
Future work
-----------
F1. Perform a complete review of the error tracking so that device
failures are handled and reported more cleanly, extending the existing
basic error counting mechanism.
F2. Consider whether some of the nested callbacks can be eliminated,
which would allow for additional simplifications.
F3. Adjust the contents of the adhoc context structs into more logical
arrangements and use them more widely.
F4. Perform wider refactoring of these areas of code.
Testing considerations
----------------------
T1. The changes touch code on the device path, so a thorough re-test of
the device layer is required. The new code needs a full audit down
through the library layer into the kernel to check that all the error
conditions that are currently implemented (such as EAGAIN) are handled
sensibly. (LVM's I/O layer needs to remain as solid as we can make it.)
T2. The current test suite provides a reasonably broad range of coverage
of this area but is far from comprehensive.
Acceptance criteria
-------------------
A1. The current test suite should pass to the same extent as before the
changes.
A2. When all debugging and logging is disabled, strace -c must show
improvements, e.g. the expected reduction in the number of reads.
A3. Running a range of commands under valgrind must not reveal any
new leaks due to the changes.
A4. All new Coverity reports arising from the change must be addressed.
A5. CPU time should be similar to that before, as the same work
is being done overall, just in a different order.
A6. Tests need to show improved behaviour in targeted areas. For example,
if several devices are slow and time out, the delays should occur
in parallel and the elapsed time should be less than before.
Release considerations
----------------------
R1. Async I/O should be widely available and largely reliable on Linux
nowadays (even though parts of its interface and implementation remain a
matter of controversy), so we should try to make its use the default
wherever it is supported. If certain types of systems have problems we
should try to detect those cases and disable it automatically there.
R2. Because the implications of an unexpected problem in the new code
could be severe for the people affected, the roll-out needs to be gentle,
without a deadline, to allow us plenty of time to gain confidence in the
new code. Our own testing will only be able to cover a tiny fraction of
the different setups our users have, so we need to look proactively for
problems it causes and encourage people to test it on their own
systems and report back. It must go into the tree near the start of a
release cycle rather than at the end to provide time for our confidence
in it to grow.


@ -1,8 +1,5 @@
/* include/configure.h.in. Generated from configure.in by autoheader. */
/* Define to 1 if aio is available. */
#undef AIO_SUPPORT
/* Define to 1 to use libblkid detection of signatures when wiping. */
#undef BLKID_WIPING_SUPPORT

lib/cache/lvmcache.c

@ -141,8 +141,6 @@ void lvmcache_seed_infos_from_lvmetad(struct cmd_context *cmd)
/* Volume Group metadata cache functions */
static void _free_cached_vgmetadata(struct lvmcache_vginfo *vginfo)
{
struct lvmcache_info *info;
if (!vginfo || !vginfo->vgmetadata)
return;
@ -156,10 +154,6 @@ static void _free_cached_vgmetadata(struct lvmcache_vginfo *vginfo)
vginfo->cft = NULL;
}
/* Invalidate any cached device buffers */
dm_list_iterate_items(info, &vginfo->infos)
devbufs_release(info->dev);
log_debug_cache("lvmcache: VG %s wiped.", vginfo->vgname);
release_vg(vginfo->cached_vg);
@ -548,6 +542,7 @@ const struct format_type *lvmcache_fmt_from_vgname(struct cmd_context *cmd,
{
struct lvmcache_vginfo *vginfo;
struct lvmcache_info *info;
struct label *label;
struct dm_list *devh, *tmp;
struct dm_list devs;
struct device_list *devl;
@ -592,7 +587,7 @@ const struct format_type *lvmcache_fmt_from_vgname(struct cmd_context *cmd,
dm_list_iterate_safe(devh, tmp, &devs) {
devl = dm_list_item(devh, struct device_list);
(void) label_read(devl->dev, NULL, UINT64_C(0));
(void) label_read(devl->dev, &label, UINT64_C(0));
dm_list_del(&devl->list);
dm_free(devl);
}
@ -773,8 +768,10 @@ char *lvmcache_vgname_from_pvid(struct cmd_context *cmd, const char *pvid)
static void _rescan_entry(struct lvmcache_info *info)
{
struct label *label;
if (info->status & CACHE_INVALID)
(void) label_read(info->dev, NULL, UINT64_C(0));
(void) label_read(info->dev, &label, UINT64_C(0));
}
static int _scan_invalid(void)
@ -1096,31 +1093,17 @@ next:
goto next;
}
/* Track the number of outstanding label reads */
/* FIXME Switch to struct and also track failed */
static void _process_label_data(int failed, unsigned ioflags, void *context, const void *data)
{
int *nr_labels_outstanding = context;
if (!*nr_labels_outstanding) {
log_error(INTERNAL_ERROR "_process_label_data called too many times");
return;
}
(*nr_labels_outstanding)--;
}
int lvmcache_label_scan(struct cmd_context *cmd)
{
struct dm_list del_cache_devs;
struct dm_list add_cache_devs;
struct lvmcache_info *info;
struct device_list *devl;
struct label *label;
struct dev_iter *iter;
struct device *dev;
struct format_type *fmt;
int dev_count = 0;
int nr_labels_outstanding = 0;
int r = 0;
@ -1159,22 +1142,13 @@ int lvmcache_label_scan(struct cmd_context *cmd)
_destroy_duplicate_device_list(&_found_duplicate_devs);
while ((dev = dev_iter_get(iter))) {
log_debug_io("Scanning device %s", dev_name(dev));
nr_labels_outstanding++;
if (!label_read_callback(dev, UINT64_C(0), AIO_SUPPORTED_CODE_PATH, _process_label_data, &nr_labels_outstanding))
nr_labels_outstanding--;
(void) label_read(dev, &label, UINT64_C(0));
dev_count++;
}
dev_iter_destroy(iter);
while (nr_labels_outstanding) {
log_very_verbose("Scanned %d device labels (%d outstanding)", dev_count, nr_labels_outstanding);
if (!dev_async_getevents())
return_0;
}
log_very_verbose("Scanned %d device labels (%d outstanding)", dev_count, nr_labels_outstanding);
log_very_verbose("Scanned %d device labels", dev_count);
/*
* _choose_preferred_devs() returns:
@ -1208,7 +1182,7 @@ int lvmcache_label_scan(struct cmd_context *cmd)
dm_list_iterate_items(devl, &add_cache_devs) {
log_debug_cache("Rescan preferred device %s for lvmcache", dev_name(devl->dev));
(void) label_read(devl->dev, NULL, UINT64_C(0));
(void) label_read(devl->dev, &label, UINT64_C(0));
}
dm_list_splice(&_unused_duplicate_devs, &del_cache_devs);
@ -1228,7 +1202,7 @@ int lvmcache_label_scan(struct cmd_context *cmd)
*/
if (_force_label_scan && cmd->is_long_lived &&
cmd->dump_filter && cmd->full_filter && cmd->full_filter->dump &&
!cmd->full_filter->dump(cmd->full_filter, cmd->mem, 0))
!cmd->full_filter->dump(cmd->full_filter, 0))
stack;
r = 1;
@ -1529,6 +1503,7 @@ const char *lvmcache_pvid_from_devname(struct cmd_context *cmd,
const char *devname)
{
struct device *dev;
struct label *label;
if (!(dev = dev_cache_get(devname, cmd->filter))) {
log_error("%s: Couldn't find device. Check your filters?",
@ -1536,7 +1511,7 @@ const char *lvmcache_pvid_from_devname(struct cmd_context *cmd,
return NULL;
}
if (!(label_read(dev, NULL, UINT64_C(0))))
if (!(label_read(dev, &label, UINT64_C(0))))
return NULL;
return dev->pvid;
@ -2001,7 +1976,7 @@ int lvmcache_add_orphan_vginfo(const char *vgname, struct format_type *fmt)
return _lvmcache_update_vgname(NULL, vgname, vgname, 0, "", fmt);
}
int lvmcache_update_vgname_and_id(struct lvmcache_info *info, const struct lvmcache_vgsummary *vgsummary)
int lvmcache_update_vgname_and_id(struct lvmcache_info *info, struct lvmcache_vgsummary *vgsummary)
{
const char *vgname = vgsummary->vgname;
const char *vgid = (char *)&vgsummary->vgid;


@ -85,7 +85,7 @@ void lvmcache_del(struct lvmcache_info *info);
/* Update things */
int lvmcache_update_vgname_and_id(struct lvmcache_info *info,
const struct lvmcache_vgsummary *vgsummary);
struct lvmcache_vgsummary *vgsummary);
int lvmcache_update_vg(struct volume_group *vg, unsigned precommitted);
void lvmcache_lock_vgname(const char *vgname, int read_only);

lib/cache/lvmetad.c

@ -1771,7 +1771,7 @@ static int _lvmetad_pvscan_single(struct metadata_area *mda, void *baton)
struct volume_group *vg;
if (mda_is_ignored(mda) ||
!(vg = mda->ops->vg_read(b->fid, "", mda, NULL, NULL, 1, 0)))
!(vg = mda->ops->vg_read(b->fid, "", mda, NULL, NULL, 1)))
return 1;
/* FIXME Also ensure contents match etc. */


@ -636,16 +636,6 @@ static int _process_config(struct cmd_context *cmd)
*/
cmd->default_settings.udev_fallback = udev_disabled ? 1 : -1;
#ifdef AIO_SUPPORT
cmd->use_aio = find_config_tree_bool(cmd, devices_use_aio_CFG, NULL);
#else
cmd->use_aio = 0;
#endif
if (cmd->use_aio && !dev_async_setup(cmd))
cmd->use_aio = 0;
log_debug_io("%ssing asynchronous I/O.", cmd->use_aio ? "U" : "Not u");
init_retry_deactivation(find_config_tree_bool(cmd, activation_retry_deactivation_CFG, NULL));
init_activation_checks(find_config_tree_bool(cmd, activation_checks_CFG, NULL));
@ -1298,7 +1288,7 @@ int init_filters(struct cmd_context *cmd, unsigned load_persistent_cache)
lvm_stat_ctim(&ts, &st);
cts = config_file_timestamp(cmd->cft);
if (timespeccmp(&ts, &cts, >) &&
!persistent_filter_load(cmd->mem, cmd->filter, NULL))
!persistent_filter_load(cmd->filter, NULL))
log_verbose("Failed to load existing device cache from %s",
dev_cache);
}
@ -2160,8 +2150,6 @@ int refresh_toolcontext(struct cmd_context *cmd)
cmd->lib_dir = NULL;
label_init();
if (!_init_lvm_conf(cmd))
return_0;
@ -2249,7 +2237,7 @@ void destroy_toolcontext(struct cmd_context *cmd)
int flags;
if (cmd->dump_filter && cmd->filter && cmd->filter->dump &&
!cmd->filter->dump(cmd->filter, cmd->mem, 1))
!cmd->filter->dump(cmd->filter, 1))
stack;
archive_exit(cmd);


@ -165,7 +165,6 @@ struct cmd_context {
unsigned vg_notify:1;
unsigned lv_notify:1;
unsigned pv_notify:1;
unsigned use_aio:1;
unsigned activate_component:1; /* command activates component LV */
unsigned process_component_lvs:1; /* command processes also component LVs */


@ -1,6 +1,6 @@
/*
* Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
* Copyright (C) 2004-2018 Red Hat, Inc. All rights reserved.
* Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
*
* This file is part of LVM2.
*
@ -279,7 +279,7 @@ struct dm_config_tree *config_file_open_and_read(const char *config_file,
}
log_very_verbose("Loading config file: %s", config_file);
if (!config_file_read(cmd->mem, cft)) {
if (!config_file_read(cft)) {
log_error("Failed to load config file %s", config_file);
goto bad;
}
@ -489,102 +489,32 @@ int override_config_tree_from_profile(struct cmd_context *cmd,
return 0;
}
struct process_config_file_params {
struct dm_config_tree *cft;
struct device *dev;
off_t offset;
size_t size;
off_t offset2;
size_t size2;
checksum_fn_t checksum_fn;
uint32_t checksum;
int checksum_only;
int no_dup_node_check;
lvm_callback_fn_t config_file_read_fd_callback;
void *config_file_read_fd_context;
int ret;
};
static void _process_config_file_buffer(int failed, unsigned ioflags, void *context, const void *data)
{
struct process_config_file_params *pcfp = context;
const char *fb = data, *fe;
if (failed) {
pcfp->ret = 0;
goto_out;
}
if (pcfp->checksum_fn && pcfp->checksum !=
(pcfp->checksum_fn(pcfp->checksum_fn(INITIAL_CRC, (const uint8_t *)fb, pcfp->size),
(const uint8_t *)(fb + pcfp->size), pcfp->size2))) {
log_error("%s: Checksum error at offset %" PRIu64, dev_name(pcfp->dev), (uint64_t) pcfp->offset);
pcfp->ret = 0;
goto out;
}
if (!pcfp->checksum_only) {
fe = fb + pcfp->size + pcfp->size2;
if (pcfp->no_dup_node_check) {
if (!dm_config_parse_without_dup_node_check(pcfp->cft, fb, fe))
pcfp->ret = 0;
} else if (!dm_config_parse(pcfp->cft, fb, fe))
pcfp->ret = 0;
}
out:
if (pcfp->config_file_read_fd_callback)
pcfp->config_file_read_fd_callback(!pcfp->ret, ioflags, pcfp->config_file_read_fd_context, NULL);
}
/*
* When checksum_only is set, the checksum of buffer is only matched
* and function avoids parsing of mda into config tree which
* remains unmodified and should not be used.
*/
int config_file_read_fd(struct dm_pool *mem, struct dm_config_tree *cft, struct device *dev, dev_io_reason_t reason,
int config_file_read_fd(struct dm_config_tree *cft, struct device *dev, dev_io_reason_t reason,
off_t offset, size_t size, off_t offset2, size_t size2,
checksum_fn_t checksum_fn, uint32_t checksum,
int checksum_only, int no_dup_node_check, unsigned ioflags,
lvm_callback_fn_t config_file_read_fd_callback, void *config_file_read_fd_context)
int checksum_only, int no_dup_node_check)
{
char *fb;
char *fb, *fe;
int r = 0;
off_t mmap_offset = 0;
int use_mmap = 1;
const char *buf = NULL;
unsigned circular = size2 ? 1 : 0; /* Wrapped around end of disk metadata buffer? */
off_t mmap_offset = 0;
char *buf = NULL;
struct config_source *cs = dm_config_get_custom(cft);
struct process_config_file_params *pcfp;
if (!_is_file_based_config_source(cs->type)) {
log_error(INTERNAL_ERROR "config_file_read_fd: expected file, special file "
"or profile config source, found %s config source.",
_config_source_names[cs->type]);
goto bad;
return 0;
}
if (!(pcfp = dm_pool_zalloc(mem, sizeof(*pcfp)))) {
log_debug("config_file_read_fd: process_config_file_params struct allocation failed");
goto bad;
}
pcfp->cft = cft;
pcfp->dev = dev;
pcfp->offset = offset;
pcfp->size = size;
pcfp->offset2 = offset2;
pcfp->size2 = size2;
pcfp->checksum_fn = checksum_fn;
pcfp->checksum = checksum;
pcfp->checksum_only = checksum_only;
pcfp->no_dup_node_check = no_dup_node_check;
pcfp->config_file_read_fd_callback = config_file_read_fd_callback;
pcfp->config_file_read_fd_context = config_file_read_fd_context;
pcfp->ret = 1;
/* Only use mmap with regular files */
if (!(dev->flags & DEV_REGULAR) || circular)
if (!(dev->flags & DEV_REGULAR) || size2)
use_mmap = 0;
if (use_mmap) {
@ -594,40 +524,56 @@ int config_file_read_fd(struct dm_pool *mem, struct dm_config_tree *cft, struct
MAP_PRIVATE, dev_fd(dev), offset - mmap_offset);
if (fb == (caddr_t) (-1)) {
log_sys_error("mmap", dev_name(dev));
goto bad;
goto out;
}
_process_config_file_buffer(0, ioflags, pcfp, fb + mmap_offset);
r = pcfp->ret;
fb = fb + mmap_offset;
} else {
if (!(buf = dm_malloc(size + size2))) {
log_error("Failed to allocate circular buffer.");
return 0;
}
if (!dev_read_circular(dev, (uint64_t) offset, size,
(uint64_t) offset2, size2, reason, buf)) {
goto out;
}
fb = buf;
}
if (checksum_fn && checksum !=
(checksum_fn(checksum_fn(INITIAL_CRC, (const uint8_t *)fb, size),
(const uint8_t *)(fb + size), size2))) {
log_error("%s: Checksum error at offset %" PRIu64, dev_name(dev), (uint64_t) offset);
goto out;
}
if (!checksum_only) {
fe = fb + size + size2;
if (no_dup_node_check) {
if (!dm_config_parse_without_dup_node_check(cft, fb, fe))
goto_out;
} else {
if (!dm_config_parse(cft, fb, fe))
goto_out;
}
}
r = 1;
out:
if (!use_mmap)
dm_free(buf);
else {
/* unmap the file */
if (munmap(fb, size + mmap_offset)) {
if (munmap(fb - mmap_offset, size + mmap_offset)) {
log_sys_error("munmap", dev_name(dev));
r = 0;
}
} else {
if (circular) {
if (!(buf = dev_read_circular(dev, (uint64_t) offset, size, (uint64_t) offset2, size2, reason)))
goto_out;
_process_config_file_buffer(0, ioflags, pcfp, buf);
dm_free((void *)buf);
} else {
dev_read_callback(dev, (uint64_t) offset, size, reason, ioflags, _process_config_file_buffer, pcfp);
if (config_file_read_fd_callback)
return 1;
}
r = pcfp->ret;
}
out:
return r;
bad:
if (config_file_read_fd_callback)
config_file_read_fd_callback(1, ioflags, config_file_read_fd_context, NULL);
return 0;
}
int config_file_read(struct dm_pool *mem, struct dm_config_tree *cft)
int config_file_read(struct dm_config_tree *cft)
{
const char *filename = NULL;
struct config_source *cs = dm_config_get_custom(cft);
@ -655,8 +601,8 @@ int config_file_read(struct dm_pool *mem, struct dm_config_tree *cft)
}
}
r = config_file_read_fd(mem, cft, cf->dev, DEV_IO_MDA_CONTENT, 0, (size_t) info.st_size, 0, 0,
(checksum_fn_t) NULL, 0, 0, 0, 0, NULL, NULL);
r = config_file_read_fd(cft, cf->dev, DEV_IO_MDA_CONTENT, 0, (size_t) info.st_size, 0, 0,
(checksum_fn_t) NULL, 0, 0, 0);
if (!cf->keep_open) {
if (!dev_close(cf->dev))


@ -239,13 +239,11 @@ config_source_t config_get_source_type(struct dm_config_tree *cft);
typedef uint32_t (*checksum_fn_t) (uint32_t initial, const uint8_t *buf, uint32_t size);
struct dm_config_tree *config_open(config_source_t source, const char *filename, int keep_open);
int config_file_read_fd(struct dm_pool *mem, struct dm_config_tree *cft, struct device *dev, dev_io_reason_t reason,
int config_file_read_fd(struct dm_config_tree *cft, struct device *dev, dev_io_reason_t reason,
off_t offset, size_t size, off_t offset2, size_t size2,
checksum_fn_t checksum_fn, uint32_t checksum,
int skip_parse, int no_dup_node_check, unsigned ioflags,
lvm_callback_fn_t config_file_read_fd_callback, void *config_file_read_fd_context);
int config_file_read(struct dm_pool *mem, struct dm_config_tree *cft);
int skip_parse, int no_dup_node_check);
int config_file_read(struct dm_config_tree *cft);
struct dm_config_tree *config_file_open_and_read(const char *config_file, config_source_t source,
struct cmd_context *cmd);
int config_write(struct dm_config_tree *cft, struct config_def_tree_spec *tree_spec,


@ -226,16 +226,6 @@ cfg(devices_dir_CFG, "dir", devices_CFG_SECTION, CFG_ADVANCED, CFG_TYPE_STRING,
cfg_array(devices_scan_CFG, "scan", devices_CFG_SECTION, CFG_ADVANCED, CFG_TYPE_STRING, "#S/dev", vsn(1, 0, 0), NULL, 0, NULL,
"Directories containing device nodes to use with LVM.\n")
cfg(devices_use_aio_CFG, "use_aio", devices_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_BOOL, DEFAULT_USE_AIO, vsn(2, 2, 178), NULL, 0, NULL,
"Use linux asynchronous I/O for parallel device access where possible.\n")
cfg(devices_aio_max_CFG, "aio_max", devices_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, DEFAULT_AIO_MAX, vsn(2, 2, 178), NULL, 0, NULL,
"Maximum number of asynchronous I/Os to issue concurrently.\n")
cfg(devices_aio_memory_CFG, "aio_memory", devices_CFG_SECTION, CFG_DEFAULT_COMMENTED, CFG_TYPE_INT, DEFAULT_AIO_MEMORY, vsn(2, 2, 178), NULL, 0, NULL,
"Approximate maximum total amount of memory (in MB) used\n"
"for asynchronous I/O buffers.\n")
cfg_array(devices_loopfiles_CFG, "loopfiles", devices_CFG_SECTION, CFG_DEFAULT_UNDEFINED | CFG_UNSUPPORTED, CFG_TYPE_STRING, NULL, vsn(1, 2, 0), NULL, 0, NULL, NULL)
cfg(devices_obtain_device_list_from_udev_CFG, "obtain_device_list_from_udev", devices_CFG_SECTION, 0, CFG_TYPE_BOOL, DEFAULT_OBTAIN_DEVICE_LIST_FROM_UDEV, vsn(2, 2, 85), NULL, 0, NULL,


@ -32,9 +32,6 @@
#define DEFAULT_SYSTEM_ID_SOURCE "none"
#define DEFAULT_OBTAIN_DEVICE_LIST_FROM_UDEV 1
#define DEFAULT_EXTERNAL_DEVICE_INFO_SOURCE "none"
#define DEFAULT_USE_AIO 1
#define DEFAULT_AIO_MAX 128
#define DEFAULT_AIO_MEMORY 10
#define DEFAULT_SYSFS_SCAN 1
#define DEFAULT_MD_COMPONENT_DETECTION 1
#define DEFAULT_FW_RAID_COMPONENT_DETECTION 0


@ -1245,24 +1245,12 @@ int dev_cache_check_for_open_devices(void)
int dev_cache_exit(void)
{
struct btree_iter *b;
int num_open = 0;
dev_async_exit();
if (_cache.names)
if ((num_open = _check_for_open_devices(1)) > 0)
log_error(INTERNAL_ERROR "%d device(s) were left open and have been closed.", num_open);
if (_cache.devices) {
/* FIXME Replace with structured devbuf cache */
b = btree_first(_cache.devices);
while (b) {
devbufs_release(btree_get_data(b));
b = btree_next(b);
}
}
if (_cache.mem)
dm_pool_destroy(_cache.mem);


@ -23,10 +23,10 @@
* predicate for devices.
*/
struct dev_filter {
int (*passes_filter) (struct dev_filter *f, struct device *dev);
void (*destroy) (struct dev_filter *f);
void (*wipe) (struct dev_filter *f);
int (*dump) (struct dev_filter *f, struct dm_pool *mem, int merge_existing);
int (*passes_filter) (struct dev_filter * f, struct device * dev);
void (*destroy) (struct dev_filter * f);
void (*wipe) (struct dev_filter * f);
int (*dump) (struct dev_filter * f, int merge_existing);
void *private;
unsigned use_count;
};


@ -53,12 +53,6 @@
# endif
#endif
/*
* Always read at least 8k from disk.
* This seems to be a good compromise for the existing LVM2 metadata layout.
*/
#define MIN_READ_SIZE (8 * 1024)
static DM_LIST_INIT(_open_devices);
static unsigned _dev_size_seqno = 1;
@ -80,319 +74,38 @@ static const char *_reason_text(dev_io_reason_t reason)
return _reasons[(unsigned) reason];
}
/*
* Release the memory holding the last data we read
*/
static void _release_devbuf(struct device_buffer *devbuf)
{
dm_free(devbuf->malloc_address);
devbuf->malloc_address = NULL;
}
void devbufs_release(struct device *dev)
{
if ((dev->flags & DEV_REGULAR))
return;
_release_devbuf(&dev->last_devbuf);
_release_devbuf(&dev->last_extra_devbuf);
}
#ifdef AIO_SUPPORT
# include <libaio.h>
static io_context_t _aio_ctx = 0;
static struct io_event *_aio_events = NULL;
static int _aio_max = 0;
static int64_t _aio_memory_max = 0;
static int _aio_must_queue = 0; /* Have we reached AIO capacity? */
static DM_LIST_INIT(_aio_queue);
#define DEFAULT_AIO_COLLECTION_EVENTS 32
int dev_async_setup(struct cmd_context *cmd)
{
int r;
_aio_max = find_config_tree_int(cmd, devices_aio_max_CFG, NULL);
_aio_memory_max = find_config_tree_int(cmd, devices_aio_memory_CFG, NULL) * INT64_C(1024 * 1024);
/* Threshold is zero? */
if (!_aio_max || !_aio_memory_max) {
if (_aio_ctx)
dev_async_exit();
return 1;
}
/* Already set up? */
if (_aio_ctx)
return 1;
log_debug_io("Setting up aio context for up to %" PRId64 " MB across %d events.", _aio_memory_max, _aio_max);
if (!_aio_events && !(_aio_events = dm_zalloc(sizeof(*_aio_events) * DEFAULT_AIO_COLLECTION_EVENTS))) {
log_error("Failed to allocate io_event array for asynchronous I/O.");
return 0;
}
if ((r = io_setup(_aio_max, &_aio_ctx)) < 0) {
/*
* Possible errors:
* ENOSYS - aio not available in current kernel
* EAGAIN - _aio_max is too big
* EFAULT - invalid pointer
* EINVAL - _aio_ctx != 0 or kernel aio limits exceeded
* ENOMEM
*/
log_warn("WARNING: Asynchronous I/O setup for %d events failed: %s", _aio_max, strerror(-r));
log_warn("WARNING: Using only synchronous I/O.");
dm_free(_aio_events);
_aio_events = NULL;
_aio_ctx = 0;
return 0;
}
return 1;
}
/* Reset aio context after fork */
int dev_async_reset(struct cmd_context *cmd)
{
log_debug_io("Resetting asynchronous I/O context.");
_aio_ctx = 0;
dm_free(_aio_events);
_aio_events = NULL;
return dev_async_setup(cmd);
}
/*
* Track the amount of in-flight async I/O.
* If it exceeds the defined threshold set _aio_must_queue.
*/
static void _update_aio_counters(int nr, ssize_t bytes)
{
static int64_t aio_bytes = 0;
static int aio_count = 0;
aio_bytes += bytes;
aio_count += nr;
if (aio_count >= _aio_max || aio_bytes > _aio_memory_max)
_aio_must_queue = 1;
else
_aio_must_queue = 0;
}
static int _io(struct device_buffer *devbuf, unsigned ioflags);
int dev_async_getevents(void)
{
struct device_buffer *devbuf, *tmp;
lvm_callback_fn_t dev_read_callback_fn;
void *dev_read_callback_context;
int r, event_nr;
if (!_aio_ctx)
return 1;
do {
/* FIXME Add timeout - currently NULL - waits for ever for at least 1 item */
r = io_getevents(_aio_ctx, 1, DEFAULT_AIO_COLLECTION_EVENTS, _aio_events, NULL);
if (r > 0)
break;
if (!r)
return 1; /* Timeout elapsed */
if (r == -EINTR)
continue;
if (r == -EAGAIN) {
usleep(100);
return 1; /* Give the caller the opportunity to do other work before repeating */
}
/*
* ENOSYS - not supported by kernel
* EFAULT - memory invalid
* EINVAL - _aio_ctx invalid or min_nr/nr/timeout out of range
*/
log_error("Asynchronous event collection failed: %s", strerror(-r));
return 0;
} while (1);
for (event_nr = 0; event_nr < r; event_nr++) {
devbuf = _aio_events[event_nr].obj->data;
dm_free(_aio_events[event_nr].obj);
_update_aio_counters(-1, -devbuf->where.size);
dev_read_callback_fn = devbuf->dev_read_callback_fn;
dev_read_callback_context = devbuf->dev_read_callback_context;
/* Clear the callbacks as a precaution */
devbuf->dev_read_callback_context = NULL;
devbuf->dev_read_callback_fn = NULL;
if (_aio_events[event_nr].res == devbuf->where.size) {
if (dev_read_callback_fn)
dev_read_callback_fn(0, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, (char *)devbuf->buf + devbuf->data_offset);
} else {
/* FIXME If partial read is possible, resubmit remainder */
log_error("%s: asynchronous read only I/O failed (" FMTd64 ") of " FMTu64 " bytes at " FMTu64 " (for %s): %s",
dev_name(devbuf->where.dev), _aio_events[event_nr].res,
(uint64_t) devbuf->where.size, (uint64_t) devbuf->where.start,
_reason_text(devbuf->reason),
(((int64_t)_aio_events[event_nr].res) < 0) ? strerror(-(int64_t)_aio_events[event_nr].res) : 0);
_release_devbuf(devbuf);
if (dev_read_callback_fn)
dev_read_callback_fn(1, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, NULL);
else
r = 0;
}
}
/* Submit further queued events if we can */
dm_list_iterate_items_gen_safe(devbuf, tmp, &_aio_queue, aio_queued) {
if (_aio_must_queue)
break;
dm_list_del(&devbuf->aio_queued);
_io(devbuf, 1);
}
return 1;
}
static int _io_async(struct device_buffer *devbuf)
{
struct device_area *where = &devbuf->where;
struct iocb *iocb;
int r;
_update_aio_counters(1, devbuf->where.size);
if (!(iocb = dm_malloc(sizeof(*iocb)))) {
log_error("Failed to allocate I/O control block array for asynchronous I/O.");
return 0;
}
io_prep_pread(iocb, dev_fd(where->dev), devbuf->buf, where->size, where->start);
iocb->data = devbuf;
do {
r = io_submit(_aio_ctx, 1L, &iocb);
if (r ==1)
break; /* Success */
if (r == -EAGAIN) {
/* Try to release some resources then retry */
usleep(100);
if (dev_async_getevents())
return_0;
/* FIXME Add counter/timeout so we can't get stuck here for ever */
continue;
}
/*
* Possible errors:
* EFAULT - invalid data
* ENOSYS - no aio support in kernel
* EBADF - bad file descriptor in iocb
* EINVAL - invalid _aio_ctx / iocb not initialised / invalid operation for this fd
*/
log_error("Asynchronous event submission failed: %s", strerror(-r));
return 0;
} while (1);
return 1;
}
void dev_async_exit(void)
{
struct device_buffer *devbuf, *tmp;
lvm_callback_fn_t dev_read_callback_fn;
void *dev_read_callback_context;
int r;
if (!_aio_ctx)
return;
/* Discard any queued requests */
dm_list_iterate_items_gen_safe(devbuf, tmp, &_aio_queue, aio_queued) {
dm_list_del(&devbuf->aio_queued);
_update_aio_counters(-1, -devbuf->where.size);
dev_read_callback_fn = devbuf->dev_read_callback_fn;
dev_read_callback_context = devbuf->dev_read_callback_context;
_release_devbuf(devbuf);
if (dev_read_callback_fn)
dev_read_callback_fn(1, AIO_SUPPORTED_CODE_PATH, dev_read_callback_context, NULL);
}
log_debug_io("Destroying aio context.");
if ((r = io_destroy(_aio_ctx)) < 0)
/* Returns -ENOSYS if aio not in kernel or -EINVAL if _aio_ctx invalid */
log_error("Failed to destroy asynchronous I/O context: %s", strerror(-r));
dm_free(_aio_events);
_aio_events = NULL;
_aio_ctx = 0;
}
static void _queue_aio(struct device_buffer *devbuf)
{
dm_list_add(&_aio_queue, &devbuf->aio_queued);
log_debug_io("Queueing aio.");
}
#else
static int _aio_ctx = 0;
static int _aio_must_queue = 0;
int dev_async_setup(struct cmd_context *cmd)
{
return 1;
}
int dev_async_reset(struct cmd_context *cmd)
{
return 1;
}
int dev_async_getevents(void)
{
return 1;
}
void dev_async_exit(void)
{
}
static int _io_async(struct device_buffer *devbuf)
{
return 0;
}
static void _queue_aio(struct device_buffer *devbuf)
{
}
#endif /* AIO_SUPPORT */
/*-----------------------------------------------------------------
* The standard io loop that keeps submitting an io until it's
* all gone.
*---------------------------------------------------------------*/
static int _io_sync(struct device_buffer *devbuf)
static int _io(struct device_area *where, char *buffer, int should_write, dev_io_reason_t reason)
{
struct device_area *where = &devbuf->where;
int fd = dev_fd(where->dev);
char *buffer = devbuf->buf;
ssize_t n = 0;
size_t total = 0;
if (fd < 0) {
log_error("Attempt to read an unopened device (%s).",
dev_name(where->dev));
return 0;
}
log_debug_io("%s %s:%8" PRIu64 " bytes (sync) at %" PRIu64 "%s (for %s)",
should_write ? "Write" : "Read ", dev_name(where->dev),
where->size, (uint64_t) where->start,
(should_write && test_mode()) ? " (test mode - suppressed)" : "", _reason_text(reason));
/*
* Skip all writes in test mode.
*/
if (should_write && test_mode())
return 1;
if (where->size > SSIZE_MAX) {
log_error("Read size too large: %" PRIu64, where->size);
return 0;
}
if (lseek(fd, (off_t) where->start, SEEK_SET) == (off_t) -1) {
log_error("%s: lseek %" PRIu64 " failed: %s",
dev_name(where->dev), (uint64_t) where->start,
@ -402,19 +115,18 @@ static int _io_sync(struct device_buffer *devbuf)
while (total < (size_t) where->size) {
do
n = devbuf->write ?
n = should_write ?
write(fd, buffer, (size_t) where->size - total) :
read(fd, buffer, (size_t) where->size - total);
while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
if (n < 0)
log_error("%s: synchronous %s failed after %" PRIu64 " of %" PRIu64
" at %" PRIu64 " (for %s): %s", dev_name(where->dev),
devbuf->write ? "write" : "read",
(uint64_t) total,
(uint64_t) where->size, (uint64_t) where->start,
_reason_text(devbuf->reason),
strerror(errno));
log_error_once("%s: %s failed after %" PRIu64 " of %" PRIu64
" at %" PRIu64 ": %s", dev_name(where->dev),
should_write ? "write" : "read",
(uint64_t) total,
(uint64_t) where->size,
(uint64_t) where->start, strerror(errno));
if (n <= 0)
break;
@ -426,42 +138,6 @@ static int _io_sync(struct device_buffer *devbuf)
return (total == (size_t) where->size);
}
static int _io(struct device_buffer *devbuf, unsigned ioflags)
{
struct device_area *where = &devbuf->where;
int fd = dev_fd(where->dev);
int async = (!devbuf->write && _aio_ctx && aio_supported_code_path(ioflags) && devbuf->dev_read_callback_fn) ? 1 : 0;
if (fd < 0) {
log_error("Attempt to read an unopened device (%s).",
dev_name(where->dev));
return 0;
}
if (!devbuf->buf && !(devbuf->malloc_address = devbuf->buf = dm_malloc_aligned((size_t) devbuf->where.size, 0))) {
log_error("I/O buffer malloc failed");
return 0;
}
log_debug_io("%s %s(fd %d):%8" PRIu64 " bytes (%ssync) at %" PRIu64 "%s (for %s)",
devbuf->write ? "Write" : "Read ", dev_name(where->dev), fd,
where->size, async ? "a" : "", (uint64_t) where->start,
(devbuf->write && test_mode()) ? " (test mode - suppressed)" : "", _reason_text(devbuf->reason));
/*
* Skip all writes in test mode.
*/
if (devbuf->write && test_mode())
return 1;
if (where->size > SSIZE_MAX) {
log_error("Read size too large: %" PRIu64, where->size);
return 0;
}
return async ? _io_async(devbuf) : _io_sync(devbuf);
}
/*-----------------------------------------------------------------
* LVM2 uses O_DIRECT when performing metadata io, which requires
* block size aligned accesses. If any io is not aligned we have
@ -551,16 +227,15 @@ static void _widen_region(unsigned int block_size, struct device_area *region,
result->size += block_size - delta;
}
static int _aligned_io(struct device_area *where, char *write_buffer,
int should_write, dev_io_reason_t reason,
unsigned ioflags, lvm_callback_fn_t dev_read_callback_fn, void *dev_read_callback_context)
static int _aligned_io(struct device_area *where, char *buffer,
int should_write, dev_io_reason_t reason)
{
char *bounce, *bounce_buf;
unsigned int physical_block_size = 0;
unsigned int block_size = 0;
unsigned buffer_was_widened = 0;
uintptr_t mask;
struct device_area widened;
struct device_buffer *devbuf;
int r = 0;
if (!(where->dev->flags & DEV_REGULAR) &&
@ -569,11 +244,6 @@ static int _aligned_io(struct device_area *where, char *write_buffer,
if (!block_size)
block_size = lvm_getpagesize();
/* Apply minimum read size */
if (!should_write && block_size < MIN_READ_SIZE)
block_size = MIN_READ_SIZE;
mask = block_size - 1;
_widen_region(block_size, where, &widened);
@ -583,75 +253,50 @@ static int _aligned_io(struct device_area *where, char *write_buffer,
buffer_was_widened = 1;
log_debug_io("Widening request for %" PRIu64 " bytes at %" PRIu64 " to %" PRIu64 " bytes at %" PRIu64 " on %s (for %s)",
where->size, (uint64_t) where->start, widened.size, (uint64_t) widened.start, dev_name(where->dev), _reason_text(reason));
}
devbuf = DEV_DEVBUF(where->dev, reason);
_release_devbuf(devbuf);
devbuf->where.dev = where->dev;
devbuf->where.start = widened.start;
devbuf->where.size = widened.size;
devbuf->write = should_write;
devbuf->reason = reason;
devbuf->dev_read_callback_fn = dev_read_callback_fn;
devbuf->dev_read_callback_context = dev_read_callback_context;
/* Store location of requested data relative to start of buf */
devbuf->data_offset = where->start - devbuf->where.start;
if (should_write && !buffer_was_widened && !((uintptr_t) write_buffer & mask))
} else if (!((uintptr_t) buffer & mask))
/* Perform the I/O directly. */
devbuf->buf = write_buffer;
else if (!should_write)
/* Postpone buffer allocation until we're about to issue the I/O */
devbuf->buf = NULL;
else {
/* Allocate a bounce buffer with an extra block */
if (!(devbuf->malloc_address = devbuf->buf = dm_malloc((size_t) devbuf->where.size + block_size))) {
log_error("Bounce buffer malloc failed");
return 0;
}
return _io(where, buffer, should_write, reason);
/*
* Realign start of bounce buffer (using the extra sector)
*/
if (((uintptr_t) devbuf->buf) & mask)
devbuf->buf = (char *) ((((uintptr_t) devbuf->buf) + mask) & ~mask);
/* Allocate a bounce buffer with an extra block */
if (!(bounce_buf = bounce = dm_malloc((size_t) widened.size + block_size))) {
log_error("Bounce buffer malloc failed");
return 0;
}
/* If we've reached our concurrent AIO limit, add this request to the queue */