7dc66abb5a
Currently we abuse the extent_map structure for two purposes: 1) To actually represent extents for inodes; 2) To represent chunk mappings. This is odd and has several disadvantages: 1) To create a chunk map, we need to do two memory allocations: one for an extent_map structure and another one for a map_lookup structure, so more potential for an allocation failure and more complicated code to manage and link two structures; 2) For a chunk map we actually only use 3 fields (24 bytes) of the respective extent map structure: the 'start' field to have the logical start address of the chunk, the 'len' field to have the chunk's size, and the 'orig_block_len' field to contain the chunk's stripe size. Besides wasting memory, it's also odd and not intuitive at all to have the stripe size in a field named 'orig_block_len'. We are also using 'block_len' of the extent_map structure to contain the chunk size, so we have 2 fields for the same value, 'len' and 'block_len', which is pointless; 3) When an extent map is associated to a chunk mapping, we set the bit EXTENT_FLAG_FS_MAPPING on its flags and then make its member named 'map_lookup' point to the associated map_lookup structure. This means that for an extent map associated to an inode extent, we are not using this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform); 4) Extent maps associated to a chunk mapping are never merged or split so it's pointless to use the existing extent map infrastructure. So add a dedicated data structure named 'btrfs_chunk_map' to represent chunk mappings, this is basically the existing map_lookup structure with some extra fields: 1) 'start' to contain the chunk logical address; 2) 'chunk_len' to contain the chunk's length; 3) 'stripe_size' for the stripe size; 4) 'rb_node' for insertion into an rb tree; 5) 'refs' for reference counting. This way we do a single memory allocation for chunk mappings and we don't waste memory for them with unused/unnecessary fields from an extent_map. 
We also save 8 bytes from the extent_map structure by removing the 'map_lookup' pointer, so the size of struct extent_map is reduced from 144 bytes down to 136 bytes, and we can now have 30 extent maps per 4K page instead of 28. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
202 lines
5.2 KiB
C
202 lines
5.2 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* Copyright (C) 2012 Fusion-io All rights reserved.
|
|
* Copyright (C) 2012 Intel Corp. All rights reserved.
|
|
*/
|
|
|
|
#ifndef BTRFS_RAID56_H
|
|
#define BTRFS_RAID56_H
|
|
|
|
#include <linux/workqueue.h>
|
|
#include "volumes.h"
|
|
|
|
/* The kind of operation a raid bio (rbio) is performing. */
enum btrfs_rbio_ops {
	/* A (RMW) write of a full stripe. */
	BTRFS_RBIO_WRITE,
	/*
	 * Rebuild data for a read from higher up; handled differently from a
	 * parity rebuild that is part of RMW (see btrfs_raid_bio::operation).
	 */
	BTRFS_RBIO_READ_REBUILD,
	/* Scrub of the parity stripe(s), see raid56_parity_alloc_scrub_rbio(). */
	BTRFS_RBIO_PARITY_SCRUB,
};
|
|
|
|
/*
 * State for a single RAID5/6 full-stripe operation (write, read-rebuild or
 * parity scrub, see enum btrfs_rbio_ops) across all member devices.
 */
struct btrfs_raid_bio {
	/* The IO context describing the stripes this rbio operates on. */
	struct btrfs_io_context *bioc;

	/*
	 * While we're doing RMW on a stripe we put it into a hash table so we
	 * can lock the stripe and merge more rbios into it.
	 */
	struct list_head hash_list;

	/* LRU list for the stripe cache */
	struct list_head stripe_cache;

	/* For scheduling work in the helper threads */
	struct work_struct work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into the stripe
	 * in hopes of avoiding the full RMW
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * Also protected by the bio_list_lock, the plug list is used by the
	 * plugging code to collect partial bios while plugged. The stripe
	 * locking code also uses it to hand off the stripe lock to the next
	 * pending IO.
	 */
	struct list_head plug_list;

	/* Flags that tell us if it is safe to merge with this bio. */
	unsigned long flags;

	/*
	 * Set if we're doing a parity rebuild for a read from higher up, which
	 * is handled differently from a parity rebuild as part of RMW.
	 */
	enum btrfs_rbio_ops operation;

	/* How many pages there are for the full stripe including P/Q */
	u16 nr_pages;

	/* How many sectors there are for the full stripe including P/Q */
	u16 nr_sectors;

	/* Number of data stripes (no p/q) */
	u8 nr_data;

	/* Number of all stripes (including P/Q) */
	u8 real_stripes;

	/* How many pages there are for each stripe */
	u8 stripe_npages;

	/* How many sectors there are for each stripe */
	u8 stripe_nsectors;

	/* Stripe number that we're scrubbing */
	u8 scrubp;

	/*
	 * Size of all the bios in the bio_list. This helps us decide if the
	 * rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	/* Reference count on this rbio. */
	refcount_t refs;

	/* Number of stripe-level IOs still in flight for this rbio. */
	atomic_t stripes_pending;

	/* Used to wait for in-flight IO (stripes_pending) to complete. */
	wait_queue_head_t io_wait;

	/* Bitmap to record which horizontal stripe has data */
	unsigned long dbitmap;

	/* Allocated with stripe_nsectors-many bits for finish_*() calls */
	unsigned long finish_pbitmap;

	/*
	 * These are two arrays of pointers. We allocate the rbio big enough
	 * to hold them both and setup their locations when the rbio is
	 * allocated.
	 */

	/*
	 * Pointers to pages that we allocated for reading/writing stripes
	 * directly from the disk (including P/Q).
	 */
	struct page **stripe_pages;

	/* Pointers to the sectors in the bio_list, for faster lookup */
	struct sector_ptr *bio_sectors;

	/*
	 * For subpage support, we need to map each sector to above
	 * stripe_pages.
	 */
	struct sector_ptr *stripe_sectors;

	/* Allocated with real_stripes-many pointers for finish_*() calls */
	void **finish_pointers;

	/*
	 * The bitmap recording where IO errors happened.
	 * Each bit is corresponding to one sector in either bio_sectors[] or
	 * stripe_sectors[] array.
	 *
	 * The reason we don't use another bit in sector_ptr is, we have two
	 * arrays of sectors, and a lot of IO can use sectors in both arrays.
	 * Thus making it much harder to iterate.
	 */
	unsigned long *error_bitmap;

	/*
	 * Checksum buffer if the rbio is for data. The buffer should cover
	 * all data sectors (excluding P/Q sectors).
	 */
	u8 *csum_buf;

	/*
	 * Each bit represents if the corresponding sector has data csum found.
	 * Should only cover data sectors (excluding P/Q sectors).
	 */
	unsigned long *csum_bitmap;
};
|
|
|
|
/*
|
|
* For trace event usage only. Records useful debug info for each bio submitted
|
|
* by RAID56 to each physical device.
|
|
*
|
|
* No matter signed or not, (-1) is always the one indicating we can not grab
|
|
* the proper stripe number.
|
|
*/
|
|
/*
 * For trace event usage only. Records useful debug info for each bio submitted
 * by RAID56 to each physical device.
 *
 * No matter signed or not, (-1) is always the one indicating we can not grab
 * the proper stripe number.
 */
struct raid56_bio_trace_info {
	/* Device id the bio was submitted to. */
	u64 devid;

	/* The offset inside the stripe. (<= STRIPE_LEN) */
	u32 offset;

	/*
	 * Stripe number.
	 * 0 is the first data stripe, and nr_data for P stripe,
	 * nr_data + 1 for Q stripe.
	 * >= real_stripes means we could not grab the proper stripe number
	 * (matching the (-1) convention described above).
	 */
	u8 stripe_nr;
};
|
|
|
|
static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
|
|
{
|
|
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
|
|
}
|
|
|
|
static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc)
|
|
{
|
|
return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type);
|
|
}
|
|
|
|
/*
 * Sentinel "stripe" values used to identify the P and Q parity stripes
 * (they are not real physical stripe numbers).
 */
#define RAID5_P_STRIPE ((u64)-2)
#define RAID6_Q_STRIPE ((u64)-1)

/* True if @x is one of the P/Q parity stripe sentinels above. */
#define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \
			     ((x) == RAID6_Q_STRIPE))
|
|
|
|
struct btrfs_device;

/* Rebuild the data read by @bio from parity (read path repair). */
void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
			   int mirror_num);
/* Submit a RAID5/6 parity write for @bio. */
void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc);

/*
 * Allocate an rbio for scrubbing on @scrub_dev; @dbitmap marks the sectors
 * (out of @stripe_nsectors per stripe) to be scrubbed.
 */
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
				struct btrfs_io_context *bioc,
				struct btrfs_device *scrub_dev,
				unsigned long *dbitmap, int stripe_nsectors);
/* Submit a scrub rbio previously set up by raid56_parity_alloc_scrub_rbio(). */
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);

/* Cache @data_pages, starting at logical address @data_logical, in @rbio. */
void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
				    struct page **data_pages, u64 data_logical);

/*
 * Allocate/free the stripe hash table used to lock stripes and merge rbios
 * during RMW (see btrfs_raid_bio::hash_list).
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

#endif
|