2016-05-12 23:28:10 +03:00
/*
* Internal header file _only_ for device mapper core
*
* Copyright ( C ) 2016 Red Hat , Inc . All rights reserved .
*
* This file is released under the LGPL .
*/
# ifndef DM_CORE_INTERNAL_H
# define DM_CORE_INTERNAL_H
# include <linux/kthread.h>
# include <linux/ktime.h>
2020-09-19 20:09:11 +03:00
# include <linux/genhd.h>
2016-05-12 23:28:10 +03:00
# include <linux/blk-mq.h>
2021-10-18 21:04:51 +03:00
# include <linux/blk-crypto-profile.h>
2016-05-12 23:28:10 +03:00
# include <trace/events/block.h>
# include "dm.h"
dm ima: measure data on table load
DM configures a block device with various target specific attributes
passed to it as a table. DM loads the table, and calls each target’s
respective constructors with the attributes as input parameters.
Some of these attributes are critical to ensure the device meets
a certain security bar. Thus, IMA should measure these attributes, to
ensure they are not tampered with, during the lifetime of the device.
So that the external services can have high confidence in the
configuration of the block-devices on a given system.
Some devices may have large tables. And a given device may change its
state (table-load, suspend, resume, rename, remove, table-clear etc.)
many times. Measuring these attributes each time when the device
changes its state will significantly increase the size of the IMA logs.
Further, once configured, these attributes are not expected to change
unless a new table is loaded, or a device is removed and recreated.
Therefore the clear-text of the attributes should only be measured
during table load, and the hash of the active/inactive table should be
measured for the remaining device state changes.
Export IMA function ima_measure_critical_data() to allow measurement
of DM device parameters, as well as target specific attributes, during
table load. Compute the hash of the inactive table and store it for
measurements during future state change. If a load is called multiple
times, update the inactive table hash with the hash of the latest
populated table. So that the correct inactive table hash is measured
when the device transitions to different states like resume, remove,
rename, etc.
Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com> # leak fix
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-07-13 03:48:58 +03:00
# include "dm-ima.h"
2016-05-12 23:28:10 +03:00
/* NOTE(review): by name, an upper bound on reserved I/Os; users live outside this header. */
#define DM_RESERVED_MAX_IOS	1024
/*
 * Bundles a kobject with a completion so that the completion can be
 * recovered from a kobject pointer via container_of(), as done by
 * dm_get_completion_from_kobject().
 */
struct dm_kobject_holder {
	struct kobject kobj;
	struct completion completion;
};
/*
2020-09-19 20:09:11 +03:00
* DM core internal structures used directly by dm.c, dm-rq.c and dm-table.c.
* DM targets must _not_ dereference a mapped_device or dm_table to directly
* access their members!
2016-05-12 23:28:10 +03:00
*/
2020-09-19 20:09:11 +03:00
2016-05-12 23:28:10 +03:00
struct mapped_device {
struct mutex suspend_lock ;
2018-05-23 01:26:20 +03:00
struct mutex table_devices_lock ;
struct list_head table_devices ;
2016-05-12 23:28:10 +03:00
/*
* The current mapping ( struct dm_table * ) .
* Use dm_get_live_table { _fast } or take suspend_lock for
* dereference .
*/
void __rcu * map ;
unsigned long flags ;
/* Protect queue and type against concurrent access. */
struct mutex type_lock ;
2018-05-23 01:26:20 +03:00
enum dm_queue_mode type ;
int numa_node_id ;
struct request_queue * queue ;
2016-05-12 23:28:10 +03:00
atomic_t holders ;
atomic_t open_count ;
struct dm_target * immutable_target ;
struct target_type * immutable_target_type ;
2018-05-23 01:26:20 +03:00
char name [ 16 ] ;
2016-05-12 23:28:10 +03:00
struct gendisk * disk ;
2017-04-12 22:35:44 +03:00
struct dax_device * dax_dev ;
2016-05-12 23:28:10 +03:00
/*
* A list of ios that arrived while we were suspended .
*/
struct work_struct work ;
2018-05-23 01:26:20 +03:00
wait_queue_head_t wait ;
2016-05-12 23:28:10 +03:00
spinlock_t deferred_lock ;
struct bio_list deferred ;
2018-05-23 01:26:20 +03:00
void * interface_ptr ;
2016-05-12 23:28:10 +03:00
/*
* Event handling .
*/
wait_queue_head_t eventq ;
atomic_t event_nr ;
atomic_t uevent_seq ;
struct list_head uevent_list ;
spinlock_t uevent_lock ; /* Protect access to uevent_list */
/* the number of internal suspends */
unsigned internal_suspend_count ;
/*
* io objects are allocated from here .
*/
2018-05-21 01:25:53 +03:00
struct bio_set io_bs ;
struct bio_set bs ;
2016-05-12 23:28:10 +03:00
2018-05-23 01:26:20 +03:00
/*
* Processing queue ( flush )
*/
struct workqueue_struct * wq ;
2016-05-12 23:28:10 +03:00
/* forced geometry settings */
struct hd_geometry geometry ;
/* kobject and completion */
struct dm_kobject_holder kobj_holder ;
2021-02-10 23:26:23 +03:00
int swap_bios ;
struct semaphore swap_bios_semaphore ;
struct mutex swap_bios_lock ;
2016-05-12 23:28:10 +03:00
struct dm_stats stats ;
/* for blk-mq request-based DM support */
struct blk_mq_tag_set * tag_set ;
bool init_tio_pdu : 1 ;
2017-11-01 02:33:02 +03:00
struct srcu_struct io_barrier ;
dm: introduce zone append emulation
For zoned targets that cannot support zone append operations, implement
an emulation using regular write operations. If the original BIO
submitted by the user is a zone append operation, change its clone into
a regular write operation directed at the target zone write pointer
position.
To do so, an array of write pointer offsets (write pointer position
relative to the start of a zone) is added to struct mapped_device. All
operations that modify a sequential zone write pointer (writes, zone
reset, zone finish and zone append) are intersepted in __map_bio() and
processed using the new functions dm_zone_map_bio().
Detection of the target ability to natively support zone append
operations is done from dm_table_set_restrictions() by calling the
function dm_set_zones_restrictions(). A target that does not support
zone append operation, either by explicitly declaring it using the new
struct dm_target field zone_append_not_supported, or because the device
table contains a non-zoned device, has its mapped device marked with the
new flag DMF_ZONE_APPEND_EMULATED. The helper function
dm_emulate_zone_append() is introduced to test a mapped device for this
new flag.
Atomicity of the zones write pointer tracking and updates is done using
a zone write locking mechanism based on a bitmap. This is similar to
the block layer method but based on BIOs rather than struct request.
A zone write lock is taken in dm_zone_map_bio() for any clone BIO with
an operation type that changes the BIO target zone write pointer
position. The zone write lock is released if the clone BIO is failed
before submission or when dm_zone_endio() is called when the clone BIO
completes.
The zone write lock bitmap of the mapped device, together with a bitmap
indicating zone types (conv_zones_bitmap) and the write pointer offset
array (zwp_offset) are allocated and initialized with a full device zone
report in dm_set_zones_restrictions() using the function
dm_revalidate_zones().
For failed operations that may have modified a zone write pointer, the
zone write pointer offset is marked as invalid in dm_zone_endio().
Zones with an invalid write pointer offset are checked and the write
pointer updated using an internal report zone operation when the
faulty zone is accessed again by the user.
All functions added for this emulation have a minimal overhead for
zoned targets natively supporting zone append operations. Regular
device targets are also not affected. The added code also does not
impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all
dm zone related functions.
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
# ifdef CONFIG_BLK_DEV_ZONED
unsigned int nr_zones ;
unsigned int * zwp_offset ;
# endif
dm ima: measure data on table load
DM configures a block device with various target specific attributes
passed to it as a table. DM loads the table, and calls each target’s
respective constructors with the attributes as input parameters.
Some of these attributes are critical to ensure the device meets
certain security bar. Thus, IMA should measure these attributes, to
ensure they are not tampered with, during the lifetime of the device.
So that the external services can have high confidence in the
configuration of the block-devices on a given system.
Some devices may have large tables. And a given device may change its
state (table-load, suspend, resume, rename, remove, table-clear etc.)
many times. Measuring these attributes each time when the device
changes its state will significantly increase the size of the IMA logs.
Further, once configured, these attributes are not expected to change
unless a new table is loaded, or a device is removed and recreated.
Therefore the clear-text of the attributes should only be measured
during table load, and the hash of the active/inactive table should be
measured for the remaining device state changes.
Export IMA function ima_measure_critical_data() to allow measurement
of DM device parameters, as well as target specific attributes, during
table load. Compute the hash of the inactive table and store it for
measurements during future state change. If a load is called multiple
times, update the inactive table hash with the hash of the latest
populated table. So that the correct inactive table hash is measured
when the device transitions to different states like resume, remove,
rename, etc.
Signed-off-by: Tushar Sugandhi <tusharsu@linux.microsoft.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com> # leak fix
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-07-13 03:48:58 +03:00
# ifdef CONFIG_IMA
struct dm_ima_measurements ima ;
# endif
2016-05-12 23:28:10 +03:00
} ;
2021-05-26 00:24:59 +03:00
/*
 * Bit numbers used with the flags member of struct mapped_device.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND	0
#define DMF_SUSPENDED			1
#define DMF_FROZEN			2
#define DMF_FREEING			3
#define DMF_DELETING			4
#define DMF_NOFLUSH_SUSPENDING		5
#define DMF_DEFERRED_REMOVE		6
#define DMF_SUSPENDED_INTERNALLY	7
#define DMF_POST_SUSPENDING		8
dm: introduce zone append emulation
For zoned targets that cannot support zone append operations, implement
an emulation using regular write operations. If the original BIO
submitted by the user is a zone append operation, change its clone into
a regular write operation directed at the target zone write pointer
position.
To do so, an array of write pointer offsets (write pointer position
relative to the start of a zone) is added to struct mapped_device. All
operations that modify a sequential zone write pointer (writes, zone
reset, zone finish and zone append) are intercepted in __map_bio() and
processed using the new functions dm_zone_map_bio().
Detection of the target ability to natively support zone append
operations is done from dm_table_set_restrictions() by calling the
function dm_set_zones_restrictions(). A target that does not support
zone append operation, either by explicitly declaring it using the new
struct dm_target field zone_append_not_supported, or because the device
table contains a non-zoned device, has its mapped device marked with the
new flag DMF_ZONE_APPEND_EMULATED. The helper function
dm_emulate_zone_append() is introduced to test a mapped device for this
new flag.
Atomicity of the zones write pointer tracking and updates is done using
a zone write locking mechanism based on a bitmap. This is similar to
the block layer method but based on BIOs rather than struct request.
A zone write lock is taken in dm_zone_map_bio() for any clone BIO with
an operation type that changes the BIO target zone write pointer
position. The zone write lock is released if the clone BIO is failed
before submission or when dm_zone_endio() is called when the clone BIO
completes.
The zone write lock bitmap of the mapped device, together with a bitmap
indicating zone types (conv_zones_bitmap) and the write pointer offset
array (zwp_offset) are allocated and initialized with a full device zone
report in dm_set_zones_restrictions() using the function
dm_revalidate_zones().
For failed operations that may have modified a zone write pointer, the
zone write pointer offset is marked as invalid in dm_zone_endio().
Zones with an invalid write pointer offset are checked and the write
pointer updated using an internal report zone operation when the
faulty zone is accessed again by the user.
All functions added for this emulation have a minimal overhead for
zoned targets natively supporting zone append operations. Regular
device targets are also not affected. The added code also does not
impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all
dm zone related functions.
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
/* Flag bit tested by dm_emulate_zone_append() for zoned mapped devices. */
#define DMF_EMULATE_ZONE_APPEND		9
2021-05-26 00:24:59 +03:00
dm: disable DISCARD if the underlying storage no longer supports it
Storage devices which report supporting discard commands like
WRITE_SAME_16 with unmap, but reject discard commands sent to the
storage device. This is a clear storage firmware bug but it doesn't
change the fact that should a program cause discards to be sent to a
multipath device layered on this buggy storage, all paths can end up
failed at the same time from the discards, causing possible I/O loss.
The first discard to a path will fail with Illegal Request, Invalid
field in cdb, e.g.:
kernel: sd 8:0:8:19: [sdfn] tag#0 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE
kernel: sd 8:0:8:19: [sdfn] tag#0 Sense Key : Illegal Request [current]
kernel: sd 8:0:8:19: [sdfn] tag#0 Add. Sense: Invalid field in cdb
kernel: sd 8:0:8:19: [sdfn] tag#0 CDB: Write same(16) 93 08 00 00 00 00 00 a0 08 00 00 00 80 00 00 00
kernel: blk_update_request: critical target error, dev sdfn, sector 10487808
The SCSI layer converts this to the BLK_STS_TARGET error number, the sd
device disables its support for discard on this path, and because of the
BLK_STS_TARGET error multipath fails the discard without failing any
path or retrying down a different path. But subsequent discards can
cause path failures. Any discards sent to the path which already failed
a discard ends up failing with EIO from blk_cloned_rq_check_limits with
an "over max size limit" error since the discard limit was set to 0 by
the sd driver for the path. As the error is EIO, this now fails the
path and multipath tries to send the discard down the next path. This
cycle continues as discards are sent until all paths fail.
Fix this by training DM core to disable DISCARD if the underlying
storage already did so.
Also, fix branching in dm_done() and clone_endio() to reflect the
mutually exclusive nature of the IO operations in question.
Cc: stable@vger.kernel.org
Reported-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2019-04-03 19:23:11 +03:00
/*
 * Disable DISCARD for @md.  Declared here only; the definition is not
 * part of this header.
 */
void disable_discard(struct mapped_device *md);
2016-05-12 23:28:10 +03:00
/*
 * NOTE(review): by name, turns off WRITE SAME for @md; confirm against
 * the definition, which is not part of this header.
 */
void disable_write_same(struct mapped_device *md);
2017-04-05 20:21:05 +03:00
/*
 * NOTE(review): by name, turns off WRITE ZEROES for @md; confirm against
 * the definition, which is not part of this header.
 */
void disable_write_zeroes(struct mapped_device *md);
2016-05-12 23:28:10 +03:00
2020-09-19 20:09:11 +03:00
static inline sector_t dm_get_size ( struct mapped_device * md )
{
return get_capacity ( md - > disk ) ;
}
static inline struct dm_stats * dm_get_stats ( struct mapped_device * md )
{
return & md - > stats ;
}
dm: introduce zone append emulation
For zoned targets that cannot support zone append operations, implement
an emulation using regular write operations. If the original BIO
submitted by the user is a zone append operation, change its clone into
a regular write operation directed at the target zone write pointer
position.
To do so, an array of write pointer offsets (write pointer position
relative to the start of a zone) is added to struct mapped_device. All
operations that modify a sequential zone write pointer (writes, zone
reset, zone finish and zone append) are intercepted in __map_bio() and
processed using the new functions dm_zone_map_bio().
Detection of the target ability to natively support zone append
operations is done from dm_table_set_restrictions() by calling the
function dm_set_zones_restrictions(). A target that does not support
zone append operation, either by explicitly declaring it using the new
struct dm_target field zone_append_not_supported, or because the device
table contains a non-zoned device, has its mapped device marked with the
new flag DMF_ZONE_APPEND_EMULATED. The helper function
dm_emulate_zone_append() is introduced to test a mapped device for this
new flag.
Atomicity of the zones write pointer tracking and updates is done using
a zone write locking mechanism based on a bitmap. This is similar to
the block layer method but based on BIOs rather than struct request.
A zone write lock is taken in dm_zone_map_bio() for any clone BIO with
an operation type that changes the BIO target zone write pointer
position. The zone write lock is released if the clone BIO is failed
before submission or when dm_zone_endio() is called when the clone BIO
completes.
The zone write lock bitmap of the mapped device, together with a bitmap
indicating zone types (conv_zones_bitmap) and the write pointer offset
array (zwp_offset) are allocated and initialized with a full device zone
report in dm_set_zones_restrictions() using the function
dm_revalidate_zones().
For failed operations that may have modified a zone write pointer, the
zone write pointer offset is marked as invalid in dm_zone_endio().
Zones with an invalid write pointer offset are checked and the write
pointer updated using an internal report zone operation when the
faulty zone is accessed again by the user.
All functions added for this emulation have a minimal overhead for
zoned targets natively supporting zone append operations. Regular
device targets are also not affected. The added code also does not
impact builds with CONFIG_BLK_DEV_ZONED disabled by stubbing out all
dm zone related functions.
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Reviewed-by: Himanshu Madhani <himanshu.madhani@oracle.com>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
2021-05-26 00:25:00 +03:00
static inline bool dm_emulate_zone_append ( struct mapped_device * md )
{
if ( blk_queue_is_zoned ( md - > queue ) )
return test_bit ( DMF_EMULATE_ZONE_APPEND , & md - > flags ) ;
return false ;
}
2020-09-19 20:09:11 +03:00
#define DM_TABLE_MAX_DEPTH 16

/*
 * A device-mapper table as seen by DM core.  Targets must not access
 * these members directly.
 */
struct dm_table {
	struct mapped_device *md;
	enum dm_queue_mode type;

	/* btree table */
	unsigned int depth;
	unsigned int counts[DM_TABLE_MAX_DEPTH]; /* in nodes */
	sector_t *index[DM_TABLE_MAX_DEPTH];

	unsigned int num_targets;
	unsigned int num_allocated;
	sector_t *highs;
	struct dm_target *targets;

	struct target_type *immutable_target_type;

	bool integrity_supported:1;
	bool singleton:1;
	unsigned integrity_added:1;

	/*
	 * Indicates the rw permissions for the new logical
	 * device.  This should be a combination of FMODE_READ
	 * and FMODE_WRITE.
	 */
	fmode_t mode;

	/* a list of devices used by this table */
	struct list_head devices;

	/* events get handed up using this callback */
	void (*event_fn)(void *);
	void *event_context;

	struct dm_md_mempools *mempools;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
	/* blk-crypto profile; only built with inline encryption support. */
	struct blk_crypto_profile *crypto_profile;
#endif
};
2021-05-26 00:24:59 +03:00
/*
 * Per-clone-bio bookkeeping: one dm_target_io is allocated for every
 * clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned int magic;
	struct dm_io *io;
	struct dm_target *ti;
	unsigned int target_bio_nr;
	unsigned int *len_ptr;
	bool inside_dm_io;
	/* struct dm_io documents that the bio is the last member here. */
	struct bio clone;
};
/*
 * Per-original-bio bookkeeping: one dm_io is allocated for every original
 * bio, and it embeds the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned int magic;
	struct mapped_device *md;
	blk_status_t status;
	/* Incremented by dm_io_inc_pending(). */
	atomic_t io_count;
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};
static inline void dm_io_inc_pending ( struct dm_io * io )
{
atomic_inc ( & io - > io_count ) ;
}
void dm_io_dec_pending ( struct dm_io * io , blk_status_t error ) ;
2016-05-12 23:28:10 +03:00
static inline struct completion * dm_get_completion_from_kobject ( struct kobject * kobj )
{
return & container_of ( kobj , struct dm_kobject_holder , kobj ) - > completion ;
}
/*
 * NOTE(review): by the parameter names, reads *module_param with a default
 * (@def) and an upper bound (@max); confirm against the definition, which
 * is not part of this header.
 */
unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max);
/*
 * Report whether a message result string has filled its buffer.
 *
 * @result: NUL-terminated string to measure.
 * @maxlen: limit the string is checked against (presumably the size of
 *          the buffer holding @result; confirm against callers).
 *
 * Returns true when @maxlen is zero, or when the string together with its
 * terminating NUL occupies at least @maxlen bytes.
 */
static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
{
	/* A zero limit always counts as overflow; strlen() is not called. */
	if (!maxlen)
		return true;

	return strlen(result) + 1 >= maxlen;
}
2017-01-17 00:05:59 +03:00
/* Global event counter and wait queue; both are defined outside this header. */
extern atomic_t dm_global_event_nr;
extern wait_queue_head_t dm_global_eventq;
2017-09-20 14:29:49 +03:00
/*
 * NOTE(review): by name, signals a global DM event (see dm_global_event_nr
 * and dm_global_eventq); the definition is not part of this header.
 */
void dm_issue_global_event(void);
2017-01-17 00:05:59 +03:00
2016-05-12 23:28:10 +03:00
# endif