956a17d9d0 btrfs: add a shrinker for extent maps
Extent maps are used either to represent existing file extent items, or to
represent new extents that are going to be written and the respective file
extent items are created when the ordered extent completes.

We currently don't have any limit for how many extent maps we can have,
neither per inode nor globally. Most of the time this is not too noticeable
because extent maps are removed in the following situations:

1) When evicting an inode;

2) When releasing folios (pages) through the btrfs_release_folio() address
   space operation callback.

   However we won't release extent maps in the folio range if the folio is
   either dirty or under writeback, or if the inode's i_size is less than
   or equal to 16M (see try_release_extent_mapping()). This 16M i_size
   constraint was added back in 2008 with commit 70dec8079d78 ("Btrfs:
   extent_io and extent_state optimizations"), but there's no explanation
   about why we have it or why the 16M value was chosen.

This means that for buffered IO we can reach an OOM situation due to too
many extent maps if either of the following happens:

1) There's a set of tasks constantly doing IO on many files with a size not
   larger than 16M, especially if they keep the files open for very long
   periods, therefore preventing inode eviction. This requires a really
   high number of such files, many non mergeable extent maps (due to
   random 4K writes for example) and a machine with very little memory;

2) There's a set of tasks constantly doing random write IO (therefore
   creating many non mergeable extent maps) on files and keeping them open
   for long periods of time, so inode eviction doesn't happen and there's
   always a lot of dirty pages or pages under writeback, preventing
   btrfs_release_folio() from releasing the respective extent maps.

   This second case was actually reported in the thread pointed to by the
   Link tag below. It requires a very large file under heavy IO and a
   machine with a very small amount of RAM, which is probably hard to hit
   in practice in a real world use case.

However, when using direct IO this is not so hard to hit, because the page
cache is not used and therefore btrfs_release_folio() is never called,
which means extent maps are dropped only when evicting the inode. So if we
have tasks that keep a file descriptor open and keep doing IO on a very
large file (or files), we can exhaust memory due to an unbounded amount of
extent maps. This is especially easy to happen if we have a huge file with
millions of small extents whose extent maps are not mergeable (non
contiguous offsets and disk locations). This was reported in that thread
with the following fio test:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdj
  MNT=/mnt/sdj
  MOUNT_OPTIONS="-o ssd"
  MKFS_OPTIONS=""

  cat <<EOF > /tmp/fio-job.ini
  [global]
  name=fio-rand-write
  filename=$MNT/fio-rand-write
  rw=randwrite
  bs=4K
  direct=1
  numjobs=16
  fallocate=none
  time_based
  runtime=90000

  [file1]
  size=300G
  ioengine=libaio
  iodepth=16
  EOF

  umount $MNT &> /dev/null
  mkfs.btrfs -f $MKFS_OPTIONS $DEV
  mount $MOUNT_OPTIONS $DEV $MNT

  fio /tmp/fio-job.ini
  umount $MNT

Monitoring the btrfs_extent_map slab while running the test with:

  $ watch -d -n 1 'cat /sys/kernel/slab/btrfs_extent_map/objects \
                       /sys/kernel/slab/btrfs_extent_map/total_objects'

shows the number of active and total extent maps skyrocketing to tens of
millions, and on systems with a small amount of memory it's easy and quick
to get into an OOM situation, as reported in that thread.
So to avoid this issue, add a shrinker that will remove extent maps, as
long as they are not pinned, and that takes proper care with any concurrent
fsync to avoid missing extents (it sets the full sync flag while in the
middle of a fast fsync). This shrinker is triggered through the callbacks
nr_cached_objects and free_cached_objects of struct super_operations, as
sketched below.

The shrinker will iterate over all roots and over all inodes of each root,
and it keeps track of the last scanned root and inode, so that the next
time it runs, it starts from that root and from the next inode. This is
similar to what xfs does for its inode reclaim (it implements those
callbacks and cycles through inodes by starting from where it ended last
time).

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
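
A minimal sketch of how such callbacks can be wired up through struct
super_operations (the two callback signatures exist in the VFS; the
function bodies here are illustrative assumptions, and the
evictable_extent_maps counter is a hypothetical stand-in for whatever
bookkeeping tracks reclaimable extent maps — only btrfs_free_extent_maps()
is the entry point actually declared in the header below):

  static long btrfs_nr_cached_objects(struct super_block *sb,
                                      struct shrink_control *sc)
  {
          struct btrfs_fs_info *fs_info = btrfs_sb(sb);

          /* Report how many extent maps are candidates for reclaim. */
          return percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
  }

  static long btrfs_free_cached_objects(struct super_block *sb,
                                        struct shrink_control *sc)
  {
          const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);

          /* Scan up to nr_to_scan extent maps and drop the unpinned ones. */
          return btrfs_free_extent_maps(btrfs_sb(sb), nr_to_scan);
  }

  static const struct super_operations btrfs_super_ops = {
          /* ... other callbacks ... */
          .nr_cached_objects   = btrfs_nr_cached_objects,
          .free_cached_objects = btrfs_free_cached_objects,
  };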
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_EXTENT_MAP_H
#define BTRFS_EXTENT_MAP_H

#include <linux/compiler_types.h>
#include <linux/rwlock_types.h>
#include <linux/rbtree.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include "misc.h"
#include "compression.h"

struct btrfs_inode;
struct btrfs_fs_info;

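/*
 * Special values stored in extent_map::block_start; any value below
 * EXTENT_MAP_LAST_BYTE refers to an actual location on disk.
 */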
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)

/* bits for the extent_map::flags field */
enum {
	/* this entry not yet on disk, don't free it */
	ENUM_BIT(EXTENT_FLAG_PINNED),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
	/* pre-allocated extent */
	ENUM_BIT(EXTENT_FLAG_PREALLOC),
	/* Logging this extent */
	ENUM_BIT(EXTENT_FLAG_LOGGING),
	/* This em is merged from two or more physically adjacent ems */
	ENUM_BIT(EXTENT_FLAG_MERGED),
};

/*
 * Keep this structure as compact as possible, as we can have really large
 * amounts of allocated extent maps at any time.
 */
struct extent_map {
	struct rb_node rb_node;

	/* all of these are in bytes */
	u64 start;
	u64 len;
	u64 orig_start;
	u64 orig_block_len;
	u64 ram_bytes;
	u64 block_start;
	u64 block_len;

	/*
	 * Generation of the extent map, for merged em it's the highest
	 * generation of all merged ems.
	 * For non-merged extents, it's from btrfs_file_extent_item::generation.
	 */
	u64 generation;
	u32 flags;
	refcount_t refs;
	struct list_head list;
};

struct extent_map_tree {
	struct rb_root_cached map;
	struct list_head modified_extents;
	rwlock_t lock;
};

static inline void extent_map_set_compression(struct extent_map *em,
					      enum btrfs_compression_type type)
{
	if (type == BTRFS_COMPRESS_ZLIB)
		em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
	else if (type == BTRFS_COMPRESS_LZO)
		em->flags |= EXTENT_FLAG_COMPRESS_LZO;
	else if (type == BTRFS_COMPRESS_ZSTD)
		em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
}

static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
{
	if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
		return BTRFS_COMPRESS_ZLIB;

	if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
		return BTRFS_COMPRESS_LZO;

	if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
		return BTRFS_COMPRESS_ZSTD;

	return BTRFS_COMPRESS_NONE;
}

/*
 * More efficient way to determine if extent is compressed, instead of using
 * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
 */
static inline bool extent_map_is_compressed(const struct extent_map *em)
{
	return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
			     EXTENT_FLAG_COMPRESS_LZO |
			     EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
}

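/*
 * Usage sketch (editor's illustration, not part of the original header):
 * after tagging an extent map, readers can branch on
 * extent_map_is_compressed(), e.g. to pick the on-disk (compressed)
 * length from block_len instead of the uncompressed ram_bytes:
 *
 *	extent_map_set_compression(em, BTRFS_COMPRESS_ZSTD);
 *	if (extent_map_is_compressed(em))
 *		nr_disk_bytes = em->block_len;
 */
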
static inline int extent_map_in_tree(const struct extent_map *em)
{
	return !RB_EMPTY_NODE(&em->rb_node);
}

static inline u64 extent_map_end(const struct extent_map *em)
{
	if (em->start + em->len < em->start)
		return (u64)-1;
	return em->start + em->len;
}

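/*
 * Illustrative check (editor's sketch): an extent map covers a file offset
 * when em->start <= offset < extent_map_end(em). The helper saturates at
 * (u64)-1, so the comparison stays safe even if start + len overflows:
 *
 *	if (offset >= em->start && offset < extent_map_end(em))
 *		... offset falls inside this extent map ...
 */
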
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len);
void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
		     u64 new_logical);

struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 len);
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
			     struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
				 u64 start, u64 end,
				 bool skip_pinned);
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
				   struct extent_map *new_em,
				   bool modified);
long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);

#endif
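
For reference, a minimal sketch of the lookup pattern these prototypes
imply (a hedged illustration, not code from the header; only the declared
functions are real): lookup_extent_mapping() returns a referenced extent
map or NULL, the tree's rwlock protects the search, and the caller drops
its reference with free_extent_map():

  struct extent_map *em;

  read_lock(&tree->lock);
  em = lookup_extent_mapping(tree, start, len);
  read_unlock(&tree->lock);
  if (em) {
          /* ... use em->start, em->len, em->block_start ... */
          free_extent_map(em);    /* drop the reference from the lookup */
  }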