linux/drivers/md/dm-core.h

342 lines
7.3 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Internal header file _only_ for device mapper core
*
* Copyright (C) 2016 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
#ifndef DM_CORE_INTERNAL_H
#define DM_CORE_INTERNAL_H
#include <linux/kthread.h>
#include <linux/ktime.h>
#include <linux/blk-mq.h>
#include <linux/blk-crypto-profile.h>
#include <linux/jump_label.h>
#include <trace/events/block.h>
#include "dm.h"
dm ima: measure data on table load DM configures a block device with various target specific attributes passed to it as a table. DM loads the table, and calls each target’s respective constructors with the attributes as input parameters. Some of these attributes are critical to ensure the device meets certain security bar. Thus, IMA should measure these attributes, to ensure they are not tampered with, during the lifetime of the device. So that the external services can have high confidence in the configuration of the block-devices on a given system. Some devices may have large tables. And a given device may change its state (table-load, suspend, resume, rename, remove, table-clear etc.) many times. Measuring these attributes each time when the device changes its state will significantly increase the size of the IMA logs. Further, once configured, these attributes are not expected to change unless a new table is loaded, or a device is removed and recreated. Therefore the clear-text of the attributes should only be measured during table load, and the hash of the active/inactive table should be measured for the remaining device state changes. Export IMA function ima_measure_critical_data() to allow measurement of DM device parameters, as well as target specific attributes, during table load. Compute the hash of the inactive table and store it for measurements during future state change. If a load is called multiple times, update the inactive table hash with the hash of the latest populated table. So that the correct inactive table hash is measured when the device transitions to different states like resume, remove, rename, etc. Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com> Signed-off-by: Colin Ian King <colin.king@canonical.com> # leak fix Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-07-13 03:48:58 +03:00
#include "dm-ima.h"
#define DM_RESERVED_MAX_IOS 1024
dm: add two stage requeue mechanism Commit 61b6e2e5321d ("dm: fix BLK_STS_DM_REQUEUE handling when dm_io represents split bio") reverted DM core's bio splitting back to using bio_split()+bio_chain() because it was found that otherwise DM's BLK_STS_DM_REQUEUE would trigger a live-lock waiting for bio completion that would never occur. Restore using bio_trim()+bio_inc_remaining(), like was done in commit 7dd76d1feec7 ("dm: improve bio splitting and associated IO accounting"), but this time with proper handling for the above scenario that is covered in more detail in the commit header for 61b6e2e5321d. Solve this issue by adding a two staged dm_io requeue mechanism that uses the new dm_bio_rewind() via dm_io_rewind(): 1) requeue the dm_io into the requeue_list added to struct mapped_device, and schedule it via new added requeue work. This workqueue just clones the dm_io->orig_bio (which DM saves and ensures its end sector isn't modified). dm_io_rewind() uses the sectors and sectors_offset members of the dm_io that are recorded relative to the end of orig_bio: dm_bio_rewind()+bio_trim() are then used to make that cloned bio reflect the subset of the original bio that is represented by the dm_io that is being requeued. 2) the 2nd stage requeue is same with original requeue, but io->orig_bio points to new cloned bio (which matches the requeued dm_io as described above). This allows DM core to shift the need for bio cloning from bio-split time (during IO submission) to the less likely BLK_STS_DM_REQUEUE handling (after IO completes with that error). Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
2022-06-24 17:12:55 +03:00
struct dm_io;
struct dm_kobject_holder {
struct kobject kobj;
struct completion completion;
};
/*
* DM core internal structures used directly by dm.c, dm-rq.c and dm-table.c.
* DM targets must _not_ deference a mapped_device or dm_table to directly
* access their members!
*/
/*
* For mempools pre-allocation at the table loading time.
*/
struct dm_md_mempools {
struct bio_set bs;
struct bio_set io_bs;
};
struct mapped_device {
struct mutex suspend_lock;
struct mutex table_devices_lock;
struct list_head table_devices;
/*
* The current mapping (struct dm_table *).
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
void __rcu *map;
unsigned long flags;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
enum dm_queue_mode type;
int numa_node_id;
struct request_queue *queue;
atomic_t holders;
atomic_t open_count;
struct dm_target *immutable_target;
struct target_type *immutable_target_type;
char name[16];
struct gendisk *disk;
struct dax_device *dax_dev;
wait_queue_head_t wait;
unsigned long __percpu *pending_io;
/* forced geometry settings */
struct hd_geometry geometry;
/*
* Processing queue (flush)
*/
struct workqueue_struct *wq;
/*
* A list of ios that arrived while we were suspended.
*/
struct work_struct work;
spinlock_t deferred_lock;
struct bio_list deferred;
dm: add two stage requeue mechanism Commit 61b6e2e5321d ("dm: fix BLK_STS_DM_REQUEUE handling when dm_io represents split bio") reverted DM core's bio splitting back to using bio_split()+bio_chain() because it was found that otherwise DM's BLK_STS_DM_REQUEUE would trigger a live-lock waiting for bio completion that would never occur. Restore using bio_trim()+bio_inc_remaining(), like was done in commit 7dd76d1feec7 ("dm: improve bio splitting and associated IO accounting"), but this time with proper handling for the above scenario that is covered in more detail in the commit header for 61b6e2e5321d. Solve this issue by adding a two staged dm_io requeue mechanism that uses the new dm_bio_rewind() via dm_io_rewind(): 1) requeue the dm_io into the requeue_list added to struct mapped_device, and schedule it via new added requeue work. This workqueue just clones the dm_io->orig_bio (which DM saves and ensures its end sector isn't modified). dm_io_rewind() uses the sectors and sectors_offset members of the dm_io that are recorded relative to the end of orig_bio: dm_bio_rewind()+bio_trim() are then used to make that cloned bio reflect the subset of the original bio that is represented by the dm_io that is being requeued. 2) the 2nd stage requeue is same with original requeue, but io->orig_bio points to new cloned bio (which matches the requeued dm_io as described above). This allows DM core to shift the need for bio cloning from bio-split time (during IO submission) to the less likely BLK_STS_DM_REQUEUE handling (after IO completes with that error). Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
2022-06-24 17:12:55 +03:00
/*
* requeue work context is needed for cloning one new bio
* to represent the dm_io to be requeued, since each
* dm_io may point to the original bio from FS.
*/
struct work_struct requeue_work;
struct dm_io *requeue_list;
void *interface_ptr;
/*
* Event handling.
*/
wait_queue_head_t eventq;
atomic_t event_nr;
atomic_t uevent_seq;
struct list_head uevent_list;
spinlock_t uevent_lock; /* Protect access to uevent_list */
/* for blk-mq request-based DM support */
bool init_tio_pdu:1;
struct blk_mq_tag_set *tag_set;
struct dm_stats stats;
/* the number of internal suspends */
unsigned int internal_suspend_count;
int swap_bios;
struct semaphore swap_bios_semaphore;
struct mutex swap_bios_lock;
/*
* io objects are allocated from here.
*/
struct dm_md_mempools *mempools;
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
struct srcu_struct io_barrier;
dm: introduce zone append emulation For zoned targets that cannot support zone append operations, implement an emulation using regular write operations. If the original BIO submitted by the user is a zone append operation, change its clone into a regular write operation directed at the target zone write pointer position. To do so, an array of write pointer offsets (write pointer position relative to the start of a zone) is added to struct mapped_device. All operations that modify a sequential zone write pointer (writes, zone reset, zone finish and zone append) are intersepted in __map_bio() and processed using the new functions dm_zone_map_bio(). Detection of the target ability to natively support zone append operations is done from dm_table_set_restrictions() by calling the function dm_set_zones_restrictions(). A target that does not support zone append operation, either by explicitly declaring it using the new struct dm_target field zone_append_not_supported, or because the device table contains a non-zoned device, has its mapped device marked with the new flag DMF_ZONE_APPEND_EMULATED. The helper function dm_emulate_zone_append() is introduced to test a mapped device for this new flag. Atomicity of the zones write pointer tracking and updates is done using a zone write locking mechanism based on a bitmap. This is similar to the block layer method but based on BIOs rather than struct request. A zone write lock is taken in dm_zone_map_bio() for any clone BIO with an operation type that changes the BIO target zone write pointer position. The zone write lock is released if the clone BIO is failed before submission or when dm_zone_endio() is called when the clone BIO completes. The zone write lock bitmap of the mapped device, together with a bitmap indicating zone types (conv_zones_bitmap) and the write pointer offset array (zwp_offset) are allocated and initialized with a full device zone report in dm_set_zones_restrictions() using the function dm_revalidate_zones(). For failed operations that may have modified a zone write pointer, the zone write pointer offset is marked as invalid in dm_zone_endio(). Zones with an invalid write pointer offset are checked and the write pointer updated using an internal report zone operation when the faulty zone is accessed again by the user. All functions added for this emulation have a minimal overhead for zoned targets natively supporting zone append operations. Regular device targets are also not affected. The added code also does not impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all dm zone related functions. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
#ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones;
unsigned int *zwp_offset;
#endif
dm ima: measure data on table load DM configures a block device with various target specific attributes passed to it as a table. DM loads the table, and calls each target’s respective constructors with the attributes as input parameters. Some of these attributes are critical to ensure the device meets certain security bar. Thus, IMA should measure these attributes, to ensure they are not tampered with, during the lifetime of the device. So that the external services can have high confidence in the configuration of the block-devices on a given system. Some devices may have large tables. And a given device may change its state (table-load, suspend, resume, rename, remove, table-clear etc.) many times. Measuring these attributes each time when the device changes its state will significantly increase the size of the IMA logs. Further, once configured, these attributes are not expected to change unless a new table is loaded, or a device is removed and recreated. Therefore the clear-text of the attributes should only be measured during table load, and the hash of the active/inactive table should be measured for the remaining device state changes. Export IMA function ima_measure_critical_data() to allow measurement of DM device parameters, as well as target specific attributes, during table load. Compute the hash of the inactive table and store it for measurements during future state change. If a load is called multiple times, update the inactive table hash with the hash of the latest populated table. So that the correct inactive table hash is measured when the device transitions to different states like resume, remove, rename, etc. Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com> Signed-off-by: Colin Ian King <colin.king@canonical.com> # leak fix Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-07-13 03:48:58 +03:00
#ifdef CONFIG_IMA
struct dm_ima_measurements ima;
#endif
};
/*
* Bits for the flags field of struct mapped_device.
*/
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7
#define DMF_POST_SUSPENDING 8
dm: introduce zone append emulation For zoned targets that cannot support zone append operations, implement an emulation using regular write operations. If the original BIO submitted by the user is a zone append operation, change its clone into a regular write operation directed at the target zone write pointer position. To do so, an array of write pointer offsets (write pointer position relative to the start of a zone) is added to struct mapped_device. All operations that modify a sequential zone write pointer (writes, zone reset, zone finish and zone append) are intersepted in __map_bio() and processed using the new functions dm_zone_map_bio(). Detection of the target ability to natively support zone append operations is done from dm_table_set_restrictions() by calling the function dm_set_zones_restrictions(). A target that does not support zone append operation, either by explicitly declaring it using the new struct dm_target field zone_append_not_supported, or because the device table contains a non-zoned device, has its mapped device marked with the new flag DMF_ZONE_APPEND_EMULATED. The helper function dm_emulate_zone_append() is introduced to test a mapped device for this new flag. Atomicity of the zones write pointer tracking and updates is done using a zone write locking mechanism based on a bitmap. This is similar to the block layer method but based on BIOs rather than struct request. A zone write lock is taken in dm_zone_map_bio() for any clone BIO with an operation type that changes the BIO target zone write pointer position. The zone write lock is released if the clone BIO is failed before submission or when dm_zone_endio() is called when the clone BIO completes. The zone write lock bitmap of the mapped device, together with a bitmap indicating zone types (conv_zones_bitmap) and the write pointer offset array (zwp_offset) are allocated and initialized with a full device zone report in dm_set_zones_restrictions() using the function dm_revalidate_zones(). For failed operations that may have modified a zone write pointer, the zone write pointer offset is marked as invalid in dm_zone_endio(). Zones with an invalid write pointer offset are checked and the write pointer updated using an internal report zone operation when the faulty zone is accessed again by the user. All functions added for this emulation have a minimal overhead for zoned targets natively supporting zone append operations. Regular device targets are also not affected. The added code also does not impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all dm zone related functions. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
#define DMF_EMULATE_ZONE_APPEND 9
dm: disable DISCARD if the underlying storage no longer supports it Storage devices which report supporting discard commands like WRITE_SAME_16 with unmap, but reject discard commands sent to the storage device. This is a clear storage firmware bug but it doesn't change the fact that should a program cause discards to be sent to a multipath device layered on this buggy storage, all paths can end up failed at the same time from the discards, causing possible I/O loss. The first discard to a path will fail with Illegal Request, Invalid field in cdb, e.g.: kernel: sd 8:0:8:19: [sdfn] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE kernel: sd 8:0:8:19: [sdfn] tag#0 Sense Key : Illegal Request [current] kernel: sd 8:0:8:19: [sdfn] tag#0 Add. Sense: Invalid field in cdb kernel: sd 8:0:8:19: [sdfn] tag#0 CDB: Write same(16) 93 08 00 00 00 00 00 a0 08 00 00 00 80 00 00 00 kernel: blk_update_request: critical target error, dev sdfn, sector 10487808 The SCSI layer converts this to the BLK_STS_TARGET error number, the sd device disables its support for discard on this path, and because of the BLK_STS_TARGET error multipath fails the discard without failing any path or retrying down a different path. But subsequent discards can cause path failures. Any discards sent to the path which already failed a discard ends up failing with EIO from blk_cloned_rq_check_limits with an "over max size limit" error since the discard limit was set to 0 by the sd driver for the path. As the error is EIO, this now fails the path and multipath tries to send the discard down the next path. This cycle continues as discards are sent until all paths fail. Fix this by training DM core to disable DISCARD if the underlying storage already did so. Also, fix branching in dm_done() and clone_endio() to reflect the mutually exclussive nature of the IO operations in question. Cc: stable@vger.kernel.org Reported-by: David Jeffery <djeffery@redhat.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-04-03 19:23:11 +03:00
void disable_discard(struct mapped_device *md);
void disable_write_zeroes(struct mapped_device *md);
static inline sector_t dm_get_size(struct mapped_device *md)
{
return get_capacity(md->disk);
}
static inline struct dm_stats *dm_get_stats(struct mapped_device *md)
{
return &md->stats;
}
DECLARE_STATIC_KEY_FALSE(stats_enabled);
DECLARE_STATIC_KEY_FALSE(swap_bios_enabled);
DECLARE_STATIC_KEY_FALSE(zoned_enabled);
dm: introduce zone append emulation For zoned targets that cannot support zone append operations, implement an emulation using regular write operations. If the original BIO submitted by the user is a zone append operation, change its clone into a regular write operation directed at the target zone write pointer position. To do so, an array of write pointer offsets (write pointer position relative to the start of a zone) is added to struct mapped_device. All operations that modify a sequential zone write pointer (writes, zone reset, zone finish and zone append) are intersepted in __map_bio() and processed using the new functions dm_zone_map_bio(). Detection of the target ability to natively support zone append operations is done from dm_table_set_restrictions() by calling the function dm_set_zones_restrictions(). A target that does not support zone append operation, either by explicitly declaring it using the new struct dm_target field zone_append_not_supported, or because the device table contains a non-zoned device, has its mapped device marked with the new flag DMF_ZONE_APPEND_EMULATED. The helper function dm_emulate_zone_append() is introduced to test a mapped device for this new flag. Atomicity of the zones write pointer tracking and updates is done using a zone write locking mechanism based on a bitmap. This is similar to the block layer method but based on BIOs rather than struct request. A zone write lock is taken in dm_zone_map_bio() for any clone BIO with an operation type that changes the BIO target zone write pointer position. The zone write lock is released if the clone BIO is failed before submission or when dm_zone_endio() is called when the clone BIO completes. The zone write lock bitmap of the mapped device, together with a bitmap indicating zone types (conv_zones_bitmap) and the write pointer offset array (zwp_offset) are allocated and initialized with a full device zone report in dm_set_zones_restrictions() using the function dm_revalidate_zones(). For failed operations that may have modified a zone write pointer, the zone write pointer offset is marked as invalid in dm_zone_endio(). Zones with an invalid write pointer offset are checked and the write pointer updated using an internal report zone operation when the faulty zone is accessed again by the user. All functions added for this emulation have a minimal overhead for zoned targets natively supporting zone append operations. Regular device targets are also not affected. The added code also does not impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all dm zone related functions. Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
static inline bool dm_emulate_zone_append(struct mapped_device *md)
{
if (blk_queue_is_zoned(md->queue))
return test_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
return false;
}
#define DM_TABLE_MAX_DEPTH 16
struct dm_table {
struct mapped_device *md;
enum dm_queue_mode type;
/* btree table */
unsigned int depth;
unsigned int counts[DM_TABLE_MAX_DEPTH]; /* in nodes */
sector_t *index[DM_TABLE_MAX_DEPTH];
unsigned int num_targets;
unsigned int num_allocated;
sector_t *highs;
struct dm_target *targets;
struct target_type *immutable_target_type;
bool integrity_supported:1;
bool singleton:1;
unsigned integrity_added:1;
/*
* Indicates the rw permissions for the new logical device. This
* should be a combination of BLK_OPEN_READ and BLK_OPEN_WRITE.
*/
blk_mode_t mode;
/* a list of devices used by this table */
struct list_head devices;
/* events get handed up using this callback */
void (*event_fn)(void *data);
void *event_context;
struct dm_md_mempools *mempools;
dm: add support for passing through inline crypto support Update the device-mapper core to support exposing the inline crypto support of the underlying device(s) through the device-mapper device. This works by creating a "passthrough keyslot manager" for the dm device, which declares support for encryption settings which all underlying devices support. When a supported setting is used, the bio cloning code handles cloning the crypto context to the bios for all the underlying devices. When an unsupported setting is used, the blk-crypto fallback is used as usual. Crypto support on each underlying device is ignored unless the corresponding dm target opts into exposing it. This is needed because for inline crypto to semantically operate on the original bio, the data must not be transformed by the dm target. Thus, targets like dm-linear can expose crypto support of the underlying device, but targets like dm-crypt can't. (dm-crypt could use inline crypto itself, though.) A DM device's table can only be changed if the "new" inline encryption capabilities are a (*not* necessarily strict) superset of the "old" inline encryption capabilities. Attempts to make changes to the table that result in some inline encryption capability becoming no longer supported will be rejected. For the sake of clarity, key eviction from underlying devices will be handled in a future patch. Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Satya Tangirala <satyat@google.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-02-01 08:10:17 +03:00
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
blk-crypto: rename blk_keyslot_manager to blk_crypto_profile blk_keyslot_manager is misnamed because it doesn't necessarily manage keyslots. It actually does several different things: - Contains the crypto capabilities of the device. - Provides functions to control the inline encryption hardware. Originally these were just for programming/evicting keyslots; however, new functionality (hardware-wrapped keys) will require new functions here which are unrelated to keyslots. Moreover, device-mapper devices already (ab)use "keyslot_evict" to pass key eviction requests to their underlying devices even though device-mapper devices don't have any keyslots themselves (so it really should be "evict_key", not "keyslot_evict"). - Sometimes (but not always!) it manages keyslots. Originally it always did, but device-mapper devices don't have keyslots themselves, so they use a "passthrough keyslot manager" which doesn't actually manage keyslots. This hack works, but the terminology is unnatural. Also, some hardware doesn't have keyslots and thus also uses a "passthrough keyslot manager" (support for such hardware is yet to be upstreamed, but it will happen eventually). Let's stop having keyslot managers which don't actually manage keyslots. Instead, rename blk_keyslot_manager to blk_crypto_profile. This is a fairly big change, since for consistency it also has to update keyslot manager-related function names, variable names, and comments -- not just the actual struct name. However it's still a fairly straightforward change, as it doesn't change any actual functionality. Acked-by: Ulf Hansson <ulf.hansson@linaro.org> # For MMC Reviewed-by: Mike Snitzer <snitzer@redhat.com> Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Link: https://lore.kernel.org/r/20211018180453.40441-4-ebiggers@kernel.org Signed-off-by: Jens Axboe <axboe@kernel.dk>
2021-10-18 21:04:52 +03:00
struct blk_crypto_profile *crypto_profile;
dm: add support for passing through inline crypto support Update the device-mapper core to support exposing the inline crypto support of the underlying device(s) through the device-mapper device. This works by creating a "passthrough keyslot manager" for the dm device, which declares support for encryption settings which all underlying devices support. When a supported setting is used, the bio cloning code handles cloning the crypto context to the bios for all the underlying devices. When an unsupported setting is used, the blk-crypto fallback is used as usual. Crypto support on each underlying device is ignored unless the corresponding dm target opts into exposing it. This is needed because for inline crypto to semantically operate on the original bio, the data must not be transformed by the dm target. Thus, targets like dm-linear can expose crypto support of the underlying device, but targets like dm-crypt can't. (dm-crypt could use inline crypto itself, though.) A DM device's table can only be changed if the "new" inline encryption capabilities are a (*not* necessarily strict) superset of the "old" inline encryption capabilities. Attempts to make changes to the table that result in some inline encryption capability becoming no longer supported will be rejected. For the sake of clarity, key eviction from underlying devices will be handled in a future patch. Co-developed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Satya Tangirala <satyat@google.com> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-02-01 08:10:17 +03:00
#endif
};
static inline struct dm_target *dm_table_get_target(struct dm_table *t,
unsigned int index)
{
BUG_ON(index >= t->num_targets);
return t->targets + index;
}
/*
* One of these is allocated per clone bio.
*/
#define DM_TIO_MAGIC 28714
struct dm_target_io {
unsigned short magic;
blk_short_t flags;
unsigned int target_bio_nr;
struct dm_io *io;
struct dm_target *ti;
unsigned int *len_ptr;
sector_t old_sector;
struct bio clone;
};
#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
/*
* dm_target_io flags
*/
enum {
DM_TIO_INSIDE_DM_IO,
DM_TIO_IS_DUPLICATE_BIO
};
static inline bool dm_tio_flagged(struct dm_target_io *tio, unsigned int bit)
{
return (tio->flags & (1U << bit)) != 0;
}
static inline void dm_tio_set_flag(struct dm_target_io *tio, unsigned int bit)
{
tio->flags |= (1U << bit);
}
static inline bool dm_tio_is_normal(struct dm_target_io *tio)
{
return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) &&
!dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
}
/*
* One of these is allocated per original bio.
* It contains the first clone used for that original.
*/
#define DM_IO_MAGIC 19577
struct dm_io {
unsigned short magic;
blk_short_t flags;
spinlock_t lock;
unsigned long start_time;
void *data;
struct dm_io *next;
struct dm_stats_aux stats_aux;
blk_status_t status;
atomic_t io_count;
struct mapped_device *md;
/* The three fields represent mapped part of original bio */
struct bio *orig_bio;
unsigned int sector_offset; /* offset to end of orig_bio */
unsigned int sectors;
/* last member of dm_target_io is 'struct bio' */
struct dm_target_io tio;
};
/*
* dm_io flags
*/
enum {
DM_IO_ACCOUNTED,
DM_IO_WAS_SPLIT,
DM_IO_BLK_STAT
};
static inline bool dm_io_flagged(struct dm_io *io, unsigned int bit)
{
return (io->flags & (1U << bit)) != 0;
}
static inline void dm_io_set_flag(struct dm_io *io, unsigned int bit)
{
io->flags |= (1U << bit);
}
dm: add two stage requeue mechanism Commit 61b6e2e5321d ("dm: fix BLK_STS_DM_REQUEUE handling when dm_io represents split bio") reverted DM core's bio splitting back to using bio_split()+bio_chain() because it was found that otherwise DM's BLK_STS_DM_REQUEUE would trigger a live-lock waiting for bio completion that would never occur. Restore using bio_trim()+bio_inc_remaining(), like was done in commit 7dd76d1feec7 ("dm: improve bio splitting and associated IO accounting"), but this time with proper handling for the above scenario that is covered in more detail in the commit header for 61b6e2e5321d. Solve this issue by adding a two staged dm_io requeue mechanism that uses the new dm_bio_rewind() via dm_io_rewind(): 1) requeue the dm_io into the requeue_list added to struct mapped_device, and schedule it via new added requeue work. This workqueue just clones the dm_io->orig_bio (which DM saves and ensures its end sector isn't modified). dm_io_rewind() uses the sectors and sectors_offset members of the dm_io that are recorded relative to the end of orig_bio: dm_bio_rewind()+bio_trim() are then used to make that cloned bio reflect the subset of the original bio that is represented by the dm_io that is being requeued. 2) the 2nd stage requeue is same with original requeue, but io->orig_bio points to new cloned bio (which matches the requeued dm_io as described above). This allows DM core to shift the need for bio cloning from bio-split time (during IO submission) to the less likely BLK_STS_DM_REQUEUE handling (after IO completes with that error). Signed-off-by: Ming Lei <ming.lei@redhat.com> Signed-off-by: Mike Snitzer <snitzer@kernel.org>
2022-06-24 17:12:55 +03:00
void dm_io_rewind(struct dm_io *io, struct bio_set *bs);
static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj)
{
return &container_of(kobj, struct dm_kobject_holder, kobj)->completion;
}
unsigned int __dm_get_module_param(unsigned int *module_param, unsigned int def, unsigned int max);
static inline bool dm_message_test_buffer_overflow(char *result, unsigned int maxlen)
{
return !maxlen || strlen(result) + 1 >= maxlen;
}
extern atomic_t dm_global_event_nr;
extern wait_queue_head_t dm_global_eventq;
void dm_issue_global_event(void);
#endif