2018-09-12 09:16:07 +08:00
// SPDX-License-Identifier: GPL-2.0
2012-11-29 13:28:09 +09:00
/*
2012-11-03 06:50:41 +09:00
* f2fs debugging statistics
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
*             http://www.samsung.com/
* Copyright (c) 2012 Linux Foundation
* Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
*/
# include <linux/fs.h>
# include <linux/backing-dev.h>
# include <linux/f2fs_fs.h>
# include <linux/blkdev.h>
# include <linux/debugfs.h>
# include <linux/seq_file.h>
# include "f2fs.h"
# include "node.h"
# include "segment.h"
# include "gc.h"
/*
 * List of struct f2fs_stat_info entries (linked through ->stat_list);
 * stat_show() walks it to print one report per entry.
 */
static LIST_HEAD ( f2fs_stat_list ) ;
2013-01-14 20:08:16 +08:00
/* Taken by stat_show() before it walks f2fs_stat_list. */
static DEFINE_MUTEX ( f2fs_stat_mutex ) ;
2020-01-22 10:51:16 -08:00
/*
 * Only declared when CONFIG_DEBUG_FS is enabled.
 * NOTE(review): presumably the debugfs directory that holds the f2fs status
 * file; its users are not visible in this part of the file -- confirm.
 */
# ifdef CONFIG_DEBUG_FS
static struct dentry * f2fs_debugfs_root ;
# endif
/*
* This function calculates BDF of every segments
*/
void f2fs_update_sit_info ( struct f2fs_sb_info * sbi )
{
struct f2fs_stat_info * si = F2FS_STAT ( sbi ) ;
unsigned long long blks_per_sec , hblks_per_sec , total_vblocks ;
unsigned long long bimodal , dist ;
unsigned int segno , vblocks ;
int ndirty = 0 ;
bimodal = 0 ;
total_vblocks = 0 ;
blks_per_sec = BLKS_PER_SEC ( sbi ) ;
hblks_per_sec = blks_per_sec / 2 ;
for ( segno = 0 ; segno < MAIN_SEGS ( sbi ) ; segno + = sbi - > segs_per_sec ) {
vblocks = get_valid_blocks ( sbi , segno , true ) ;
dist = abs ( vblocks - hblks_per_sec ) ;
bimodal + = dist * dist ;
if ( vblocks > 0 & & vblocks < blks_per_sec ) {
total_vblocks + = vblocks ;
ndirty + + ;
}
}
dist = div_u64 ( MAIN_SECS ( sbi ) * hblks_per_sec * hblks_per_sec , 100 ) ;
si - > bimodal = div64_u64 ( bimodal , dist ) ;
if ( si - > dirty_count )
si - > avg_vblocks = div_u64 ( total_vblocks , ndirty ) ;
else
si - > avg_vblocks = 0 ;
}
2012-11-03 06:50:41 +09:00
2020-01-22 10:51:16 -08:00
# ifdef CONFIG_DEBUG_FS
2012-11-28 16:12:41 +09:00
static void update_general_status ( struct f2fs_sb_info * sbi )
2012-11-03 06:50:41 +09:00
{
2013-07-12 14:47:11 +08:00
struct f2fs_stat_info * si = F2FS_STAT ( sbi ) ;
2019-06-05 11:33:25 +08:00
struct f2fs_super_block * raw_super = F2FS_RAW_SUPER ( sbi ) ;
2012-11-03 06:50:41 +09:00
int i ;
2019-06-05 11:33:25 +08:00
/* these will be changed if online resize is done */
si - > main_area_segs = le32_to_cpu ( raw_super - > segment_count_main ) ;
si - > main_area_sections = le32_to_cpu ( raw_super - > section_count ) ;
si - > main_area_zones = si - > main_area_sections /
le32_to_cpu ( raw_super - > secs_per_zone ) ;
2014-08-06 23:22:50 +09:00
/* validation check of the segment numbers */
2015-09-30 17:38:48 +08:00
si - > hit_largest = atomic64_read ( & sbi - > read_hit_largest ) ;
si - > hit_cached = atomic64_read ( & sbi - > read_hit_cached ) ;
si - > hit_rbtree = atomic64_read ( & sbi - > read_hit_rbtree ) ;
2015-08-19 19:13:25 +08:00
si - > hit_total = si - > hit_largest + si - > hit_cached + si - > hit_rbtree ;
2015-09-30 17:38:48 +08:00
si - > total_ext = atomic64_read ( & sbi - > total_hit_ext ) ;
2015-12-21 19:20:15 -08:00
si - > ext_tree = atomic_read ( & sbi - > total_ext_tree ) ;
2015-12-31 15:24:14 -08:00
si - > zombie_tree = atomic_read ( & sbi - > total_zombie_tree ) ;
2015-02-05 17:58:28 +08:00
si - > ext_node = atomic_read ( & sbi - > total_ext_node ) ;
2012-11-03 06:50:41 +09:00
si - > ndirty_node = get_pages ( sbi , F2FS_DIRTY_NODES ) ;
si - > ndirty_dent = get_pages ( sbi , F2FS_DIRTY_DENTS ) ;
si - > ndirty_meta = get_pages ( sbi , F2FS_DIRTY_META ) ;
2015-12-17 17:14:44 +08:00
si - > ndirty_data = get_pages ( sbi , F2FS_DIRTY_DATA ) ;
2017-11-13 17:46:38 -08:00
si - > ndirty_qdata = get_pages ( sbi , F2FS_DIRTY_QDATA ) ;
2016-09-18 23:30:08 +08:00
si - > ndirty_imeta = get_pages ( sbi , F2FS_DIRTY_IMETA ) ;
2015-12-17 17:14:44 +08:00
si - > ndirty_dirs = sbi - > ndirty_inode [ DIR_INODE ] ;
si - > ndirty_files = sbi - > ndirty_inode [ FILE_INODE ] ;
2017-11-16 16:59:14 +08:00
si - > nquota_files = sbi - > nquota_files ;
2016-05-20 11:10:10 -07:00
si - > ndirty_all = sbi - > ndirty_inode [ DIRTY_META ] ;
2014-12-05 17:18:15 -08:00
si - > inmem_pages = get_pages ( sbi , F2FS_INMEM_PAGES ) ;
2019-12-05 08:52:39 +05:30
si - > aw_cnt = sbi - > atomic_files ;
2017-03-22 17:23:45 +08:00
si - > vw_cnt = atomic_read ( & sbi - > vw_cnt ) ;
2016-12-28 13:55:09 -08:00
si - > max_aw_cnt = atomic_read ( & sbi - > max_aw_cnt ) ;
2017-03-22 17:23:45 +08:00
si - > max_vw_cnt = atomic_read ( & sbi - > max_vw_cnt ) ;
2018-11-12 00:46:46 +08:00
si - > nr_dio_read = get_pages ( sbi , F2FS_DIO_READ ) ;
si - > nr_dio_write = get_pages ( sbi , F2FS_DIO_WRITE ) ;
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
si - > nr_wb_cp_data = get_pages ( sbi , F2FS_WB_CP_DATA ) ;
si - > nr_wb_data = get_pages ( sbi , F2FS_WB_DATA ) ;
2018-10-16 10:20:53 -07:00
si - > nr_rd_data = get_pages ( sbi , F2FS_RD_DATA ) ;
si - > nr_rd_node = get_pages ( sbi , F2FS_RD_NODE ) ;
si - > nr_rd_meta = get_pages ( sbi , F2FS_RD_META ) ;
2019-07-26 11:45:12 +08:00
if ( SM_I ( sbi ) - > fcc_info ) {
2017-03-25 17:19:58 +08:00
si - > nr_flushed =
atomic_read ( & SM_I ( sbi ) - > fcc_info - > issued_flush ) ;
si - > nr_flushing =
2018-12-13 16:53:57 -08:00
atomic_read ( & SM_I ( sbi ) - > fcc_info - > queued_flush ) ;
2017-09-14 10:18:01 +08:00
si - > flush_list_empty =
llist_empty ( & SM_I ( sbi ) - > fcc_info - > issue_list ) ;
2017-03-25 17:19:58 +08:00
}
2019-07-26 11:45:12 +08:00
if ( SM_I ( sbi ) - > dcc_info ) {
2017-03-25 17:19:58 +08:00
si - > nr_discarded =
atomic_read ( & SM_I ( sbi ) - > dcc_info - > issued_discard ) ;
si - > nr_discarding =
2018-12-13 16:53:57 -08:00
atomic_read ( & SM_I ( sbi ) - > dcc_info - > queued_discard ) ;
2017-03-25 17:19:59 +08:00
si - > nr_discard_cmd =
atomic_read ( & SM_I ( sbi ) - > dcc_info - > discard_cmd_cnt ) ;
2017-04-18 19:27:39 +08:00
si - > undiscard_blks = SM_I ( sbi ) - > dcc_info - > undiscard_blks ;
2017-03-25 17:19:58 +08:00
}
2012-11-03 06:50:41 +09:00
si - > total_count = ( int ) sbi - > user_block_count / sbi - > blocks_per_seg ;
si - > rsvd_segs = reserved_segments ( sbi ) ;
si - > overp_segs = overprovision_segments ( sbi ) ;
si - > valid_count = valid_user_blocks ( sbi ) ;
2016-08-18 21:01:18 +08:00
si - > discard_blks = discard_blocks ( sbi ) ;
2012-11-03 06:50:41 +09:00
si - > valid_node_count = valid_node_count ( sbi ) ;
si - > valid_inode_count = valid_inode_count ( sbi ) ;
2015-07-15 17:28:53 +08:00
si - > inline_xattr = atomic_read ( & sbi - > inline_xattr ) ;
2014-12-08 19:08:20 +08:00
si - > inline_inode = atomic_read ( & sbi - > inline_inode ) ;
si - > inline_dir = atomic_read ( & sbi - > inline_dir ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
si - > compr_inode = atomic_read ( & sbi - > compr_inode ) ;
2020-08-31 11:09:49 +09:00
si - > compr_blocks = atomic64_read ( & sbi - > compr_blocks ) ;
2017-02-01 15:40:11 -08:00
si - > append = sbi - > im [ APPEND_INO ] . ino_num ;
si - > update = sbi - > im [ UPDATE_INO ] . ino_num ;
2016-05-10 19:13:50 -07:00
si - > orphans = sbi - > im [ ORPHAN_INO ] . ino_num ;
2012-11-03 06:50:41 +09:00
si - > utilization = utilization ( sbi ) ;
si - > free_segs = free_segments ( sbi ) ;
si - > free_secs = free_sections ( sbi ) ;
si - > prefree_count = prefree_segments ( sbi ) ;
si - > dirty_count = dirty_segments ( sbi ) ;
2019-01-01 00:11:30 -08:00
if ( sbi - > node_inode )
si - > node_pages = NODE_MAPPING ( sbi ) - > nrpages ;
if ( sbi - > meta_inode )
si - > meta_pages = META_MAPPING ( sbi ) - > nrpages ;
2020-11-06 13:22:05 -08:00
si - > nats = NM_I ( sbi ) - > nat_cnt [ TOTAL_NAT ] ;
si - > dirty_nats = NM_I ( sbi ) - > nat_cnt [ DIRTY_NAT ] ;
2015-01-07 11:09:37 -08:00
si - > sits = MAIN_SEGS ( sbi ) ;
si - > dirty_sits = SIT_I ( sbi ) - > dirty_sentries ;
2017-09-29 13:59:35 +08:00
si - > free_nids = NM_I ( sbi ) - > nid_cnt [ FREE_NID ] ;
2017-05-01 18:13:03 -07:00
si - > avail_nids = NM_I ( sbi ) - > available_nids ;
2017-09-29 13:59:35 +08:00
si - > alloc_nids = NM_I ( sbi ) - > nid_cnt [ PREALLOC_NID ] ;
2018-09-29 18:31:28 +08:00
si - > io_skip_bggc = sbi - > io_skip_bggc ;
si - > other_skip_bggc = sbi - > other_skip_bggc ;
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
si - > skipped_atomic_files [ BG_GC ] = sbi - > skipped_atomic_files [ BG_GC ] ;
si - > skipped_atomic_files [ FG_GC ] = sbi - > skipped_atomic_files [ FG_GC ] ;
2012-11-03 06:50:41 +09:00
si - > util_free = ( int ) ( free_user_blocks ( sbi ) > > sbi - > log_blocks_per_seg )
* 100 / ( int ) ( sbi - > user_block_count > > sbi - > log_blocks_per_seg )
/ 2 ;
si - > util_valid = ( int ) ( written_block_count ( sbi ) > >
sbi - > log_blocks_per_seg )
* 100 / ( int ) ( sbi - > user_block_count > > sbi - > log_blocks_per_seg )
/ 2 ;
si - > util_invalid = 50 - si - > util_free - si - > util_valid ;
f2fs: introduce inmem curseg
Previous implementation of aligned pinfile allocation will:
- allocate new segment on cold data log no matter whether last used
segment is partially used or not, it makes IOs more random;
- force concurrent cold data/GCed IO going into warm data area, it
can make a bad effect on hot/cold data separation;
In this patch, we introduce a new type of log named 'inmem curseg',
the differents from normal curseg is:
- it reuses existed segment type (CURSEG_XXX_NODE/DATA);
- it only exists in memory, its segno, blkofs, summary will not b
persisted into checkpoint area;
With this new feature, we can enhance scalability of log, special
allocators can be created for purposes:
- pure lfs allocator for aligned pinfile allocation or file
defragmentation
- pure ssr allocator for later feature
So that, let's update aligned pinfile allocation to use this new
inmem curseg fwk.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 21:14:45 +08:00
for ( i = CURSEG_HOT_DATA ; i < NO_CHECK_TYPE ; i + + ) {
2012-11-03 06:50:41 +09:00
struct curseg_info * curseg = CURSEG_I ( sbi , i ) ;
si - > curseg [ i ] = curseg - > segno ;
2017-04-07 15:08:17 -07:00
si - > cursec [ i ] = GET_SEC_FROM_SEG ( sbi , curseg - > segno ) ;
si - > curzone [ i ] = GET_ZONE_FROM_SEC ( sbi , si - > cursec [ i ] ) ;
2012-11-03 06:50:41 +09:00
}
2018-09-29 18:31:27 +08:00
for ( i = META_CP ; i < META_MAX ; i + + )
si - > meta_count [ i ] = atomic_read ( & sbi - > meta_count [ i ] ) ;
2020-06-28 10:58:44 +08:00
for ( i = 0 ; i < NO_CHECK_TYPE ; i + + ) {
si - > dirty_seg [ i ] = 0 ;
si - > full_seg [ i ] = 0 ;
si - > valid_blks [ i ] = 0 ;
}
for ( i = 0 ; i < MAIN_SEGS ( sbi ) ; i + + ) {
int blks = get_seg_entry ( sbi , i ) - > valid_blocks ;
int type = get_seg_entry ( sbi , i ) - > type ;
if ( ! blks )
continue ;
if ( blks = = sbi - > blocks_per_seg )
si - > full_seg [ type ] + + ;
else
si - > dirty_seg [ type ] + + ;
si - > valid_blks [ type ] + = blks ;
}
2012-11-03 06:50:41 +09:00
for ( i = 0 ; i < 2 ; i + + ) {
si - > segment_count [ i ] = sbi - > segment_count [ i ] ;
si - > block_count [ i ] = sbi - > block_count [ i ] ;
}
2014-12-24 02:16:54 +09:00
si - > inplace_count = atomic_read ( & sbi - > inplace_count ) ;
2012-11-03 06:50:41 +09:00
}
2012-11-29 13:28:09 +09:00
/*
2012-11-03 06:50:41 +09:00
* This function calculates memory footprint .
*/
static void update_mem_info ( struct f2fs_sb_info * sbi )
{
2013-07-12 14:47:11 +08:00
struct f2fs_stat_info * si = F2FS_STAT ( sbi ) ;
2014-11-06 15:16:04 -08:00
int i ;
2012-11-03 06:50:41 +09:00
if ( si - > base_mem )
goto get_cache ;
2017-03-18 09:25:05 +08:00
/* build stat */
si - > base_mem = sizeof ( struct f2fs_stat_info ) ;
/* build superblock */
si - > base_mem + = sizeof ( struct f2fs_sb_info ) + sbi - > sb - > s_blocksize ;
2012-11-03 06:50:41 +09:00
si - > base_mem + = 2 * sizeof ( struct f2fs_inode_info ) ;
si - > base_mem + = sizeof ( * sbi - > ckpt ) ;
/* build sm */
si - > base_mem + = sizeof ( struct f2fs_sm_info ) ;
/* build sit */
si - > base_mem + = sizeof ( struct sit_info ) ;
2014-09-23 11:23:01 -07:00
si - > base_mem + = MAIN_SEGS ( sbi ) * sizeof ( struct seg_entry ) ;
si - > base_mem + = f2fs_bitmap_size ( MAIN_SEGS ( sbi ) ) ;
2016-08-02 10:56:40 -07:00
si - > base_mem + = 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS ( sbi ) ;
f2fs: fix to avoid NULL pointer dereference on se->discard_map
https://bugzilla.kernel.org/show_bug.cgi?id=200951
These is a NULL pointer dereference issue reported in bugzilla:
Hi,
in the setup there is a SATA SSD connected to a SATA-to-USB bridge.
The disc is "Samsung SSD 850 PRO 256G" which supports TRIM.
There are four partitions:
sda1: FAT /boot
sda2: F2FS /
sda3: F2FS /home
sda4: F2FS
The bridge is ASMT1153e which uses the "uas" driver.
There is no TRIM pass-through, so, when mounting it reports:
mounting with "discard" option, but the device does not support discard
The USB host is USB3.0 and UASP capable. It is the one on RK3399.
Given this everything works fine, except there is no TRIM support.
In order to enable TRIM a new UDEV rule is added [1]:
/etc/udev/rules.d/10-sata-bridge-trim.rules:
ACTION=="add|change", ATTRS{idVendor}=="174c", ATTRS{idProduct}=="55aa", SUBSYSTEM=="scsi_disk", ATTR{provisioning_mode}="unmap"
After reboot any F2FS write hangs forever and dmesg reports:
Unable to handle kernel NULL pointer dereference
Also tested on a x86_64 system: works fine even with TRIM enabled.
same disc
same bridge
different usb host controller
different cpu architecture
not root filesystem
Regards,
Vicenç.
[1] Post #5 in https://bbs.archlinux.org/viewtopic.php?id=236280
Unable to handle kernel NULL pointer dereference at virtual address 000000000000003e
Mem abort info:
ESR = 0x96000004
Exception class = DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
Data abort info:
ISV = 0, ISS = 0x00000004
CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000626e3122
[000000000000003e] pgd=0000000000000000
Internal error: Oops: 96000004 [#1] SMP
Modules linked in: overlay snd_soc_hdmi_codec rc_cec dw_hdmi_i2s_audio dw_hdmi_cec snd_soc_simple_card snd_soc_simple_card_utils snd_soc_rockchip_i2s rockchip_rga snd_soc_rockchip_pcm rockchipdrm videobuf2_dma_sg v4l2_mem2mem rtc_rk808 videobuf2_memops analogix_dp videobuf2_v4l2 videobuf2_common dw_hdmi dw_wdt cec rc_core videodev drm_kms_helper media drm rockchip_thermal rockchip_saradc realtek drm_panel_orientation_quirks syscopyarea sysfillrect sysimgblt fb_sys_fops dwmac_rk stmmac_platform stmmac pwm_bl squashfs loop crypto_user gpio_keys hid_kensington
CPU: 5 PID: 957 Comm: nvim Not tainted 4.19.0-rc1-1-ARCH #1
Hardware name: Sapphire-RK3399 Board (DT)
pstate: 00000005 (nzcv daif -PAN -UAO)
pc : update_sit_entry+0x304/0x4b0
lr : update_sit_entry+0x108/0x4b0
sp : ffff00000ca13bd0
x29: ffff00000ca13bd0 x28: 000000000000003e
x27: 0000000000000020 x26: 0000000000080000
x25: 0000000000000048 x24: ffff8000ebb85cf8
x23: 0000000000000253 x22: 00000000ffffffff
x21: 00000000000535f2 x20: 00000000ffffffdf
x19: ffff8000eb9e6800 x18: ffff8000eb9e6be8
x17: 0000000007ce6926 x16: 000000001c83ffa8
x15: 0000000000000000 x14: ffff8000f602df90
x13: 0000000000000006 x12: 0000000000000040
x11: 0000000000000228 x10: 0000000000000000
x9 : 0000000000000000 x8 : 0000000000000000
x7 : 00000000000535f2 x6 : ffff8000ebff3440
x5 : ffff8000ebff3440 x4 : ffff8000ebe3a6c8
x3 : 00000000ffffffff x2 : 0000000000000020
x1 : 0000000000000000 x0 : ffff8000eb9e5800
Process nvim (pid: 957, stack limit = 0x0000000063a78320)
Call trace:
update_sit_entry+0x304/0x4b0
f2fs_invalidate_blocks+0x98/0x140
truncate_node+0x90/0x400
f2fs_remove_inode_page+0xe8/0x340
f2fs_evict_inode+0x2b0/0x408
evict+0xe0/0x1e0
iput+0x160/0x260
do_unlinkat+0x214/0x298
__arm64_sys_unlinkat+0x3c/0x68
el0_svc_handler+0x94/0x118
el0_svc+0x8/0xc
Code: f9400800 b9488400 36080140 f9400f01 (387c4820)
---[ end trace a0f21a307118c477 ]---
The reason is it is possible to enable discard flag on block queue via
UDEV, but during mount, f2fs will initialize se->discard_map only if
this flag is set, once the flag is set after mount, f2fs may dereference
NULL pointer on se->discard_map.
So this patch does below changes to fix this issue:
- initialize and update se->discard_map all the time.
- don't clear DISCARD option if device has no QUEUE_FLAG_DISCARD flag
during mount.
- don't issue small discard on zoned block device.
- introduce some functions to enhance the readability.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Tested-by: Vicente Bergas <vicencb@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-04 03:52:17 +08:00
si - > base_mem + = SIT_VBLOCK_MAP_SIZE * MAIN_SEGS ( sbi ) ;
2015-02-10 16:44:29 -08:00
si - > base_mem + = SIT_VBLOCK_MAP_SIZE ;
2018-10-24 18:37:26 +08:00
if ( __is_large_section ( sbi ) )
2014-09-23 11:23:01 -07:00
si - > base_mem + = MAIN_SECS ( sbi ) * sizeof ( struct sec_entry ) ;
2012-11-03 06:50:41 +09:00
si - > base_mem + = __bitmap_size ( sbi , SIT_BITMAP ) ;
/* build free segmap */
si - > base_mem + = sizeof ( struct free_segmap_info ) ;
2014-09-23 11:23:01 -07:00
si - > base_mem + = f2fs_bitmap_size ( MAIN_SEGS ( sbi ) ) ;
si - > base_mem + = f2fs_bitmap_size ( MAIN_SECS ( sbi ) ) ;
2012-11-03 06:50:41 +09:00
/* build curseg */
si - > base_mem + = sizeof ( struct curseg_info ) * NR_CURSEG_TYPE ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
si - > base_mem + = PAGE_SIZE * NR_CURSEG_TYPE ;
2012-11-03 06:50:41 +09:00
/* build dirty segmap */
si - > base_mem + = sizeof ( struct dirty_seglist_info ) ;
2014-09-23 11:23:01 -07:00
si - > base_mem + = NR_DIRTY_TYPE * f2fs_bitmap_size ( MAIN_SEGS ( sbi ) ) ;
si - > base_mem + = f2fs_bitmap_size ( MAIN_SECS ( sbi ) ) ;
2012-11-03 06:50:41 +09:00
2014-08-06 23:22:50 +09:00
/* build nm */
2012-11-03 06:50:41 +09:00
si - > base_mem + = sizeof ( struct f2fs_nm_info ) ;
si - > base_mem + = __bitmap_size ( sbi , NAT_BITMAP ) ;
2017-02-09 10:38:09 -08:00
si - > base_mem + = ( NM_I ( sbi ) - > nat_bits_blocks < < F2FS_BLKSIZE_BITS ) ;
2018-06-27 14:46:21 +08:00
si - > base_mem + = NM_I ( sbi ) - > nat_blocks *
f2fs_bitmap_size ( NAT_ENTRY_PER_BLOCK ) ;
f2fs: introduce free nid bitmap
In scenario of intensively node allocation, free nids will be ran out
soon, then it needs to stop to load free nids by traversing NAT blocks,
in worse case, if NAT blocks does not be cached in memory, it generates
IOs which slows down our foreground operations.
In order to speed up node allocation, in this patch we introduce a new
free_nid_bitmap array, so there is an bitmap table for each NAT block,
Once the NAT block is loaded, related bitmap cache will be switched on,
and bitmap will be set during traversing nat entries in NAT block, later
we can query and update nid usage status in memory completely.
With such implementation, I expect performance of node allocation can be
improved in the long-term after filesystem image is mounted.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-02-23 10:53:49 +08:00
si - > base_mem + = NM_I ( sbi ) - > nat_blocks / 8 ;
2017-03-01 17:09:07 +08:00
si - > base_mem + = NM_I ( sbi ) - > nat_blocks * sizeof ( unsigned short ) ;
2012-11-03 06:50:41 +09:00
2015-01-10 20:09:52 +08:00
get_cache :
si - > cache_mem = 0 ;
2012-11-03 06:50:41 +09:00
/* build gc */
2015-01-10 20:09:52 +08:00
if ( sbi - > gc_thread )
si - > cache_mem + = sizeof ( struct f2fs_gc_kthread ) ;
/* build merge flush thread */
2017-01-09 14:13:03 -08:00
if ( SM_I ( sbi ) - > fcc_info )
2015-01-10 20:09:52 +08:00
si - > cache_mem + = sizeof ( struct flush_cmd_control ) ;
2017-03-25 17:19:59 +08:00
if ( SM_I ( sbi ) - > dcc_info ) {
2017-01-11 14:40:24 -08:00
si - > cache_mem + = sizeof ( struct discard_cmd_control ) ;
2017-03-25 17:19:59 +08:00
si - > cache_mem + = sizeof ( struct discard_cmd ) *
atomic_read ( & SM_I ( sbi ) - > dcc_info - > discard_cmd_cnt ) ;
}
2012-11-03 06:50:41 +09:00
/* free nids */
2017-09-29 13:59:35 +08:00
si - > cache_mem + = ( NM_I ( sbi ) - > nid_cnt [ FREE_NID ] +
NM_I ( sbi ) - > nid_cnt [ PREALLOC_NID ] ) *
f2fs: split free nid list
During free nid allocation, in order to do preallocation, we will tag free
nid entry as allocated one and still leave it in free nid list, for other
allocators who want to grab free nids, it needs to traverse the free nid
list for lookup. It becomes overhead in scenario of allocating free nid
intensively by multithreads.
This patch splits free nid list to two list: {free,alloc}_nid_list, to
keep free nids and preallocated free nids separately, after that, traverse
latency will be gone, besides split nid_cnt for separate statistic.
Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for
cleanup.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-10-12 19:28:29 +08:00
sizeof ( struct free_nid ) ;
2020-11-06 13:22:05 -08:00
si - > cache_mem + = NM_I ( sbi ) - > nat_cnt [ TOTAL_NAT ] *
sizeof ( struct nat_entry ) ;
si - > cache_mem + = NM_I ( sbi ) - > nat_cnt [ DIRTY_NAT ] *
sizeof ( struct nat_entry_set ) ;
2015-01-10 21:37:36 -08:00
si - > cache_mem + = si - > inmem_pages * sizeof ( struct inmem_pages ) ;
2017-09-29 13:59:37 +08:00
for ( i = 0 ; i < MAX_INO_ENTRY ; i + + )
2014-11-18 11:18:36 +08:00
si - > cache_mem + = sbi - > im [ i ] . ino_num * sizeof ( struct ino_entry ) ;
2015-12-21 19:20:15 -08:00
si - > cache_mem + = atomic_read ( & sbi - > total_ext_tree ) *
sizeof ( struct extent_tree ) ;
2015-02-05 17:58:28 +08:00
si - > cache_mem + = atomic_read ( & sbi - > total_ext_node ) *
sizeof ( struct extent_node ) ;
2015-01-10 21:37:36 -08:00
si - > page_mem = 0 ;
2019-01-01 00:11:30 -08:00
if ( sbi - > node_inode ) {
unsigned npages = NODE_MAPPING ( sbi ) - > nrpages ;
si - > page_mem + = ( unsigned long long ) npages < < PAGE_SHIFT ;
}
if ( sbi - > meta_inode ) {
unsigned npages = META_MAPPING ( sbi ) - > nrpages ;
si - > page_mem + = ( unsigned long long ) npages < < PAGE_SHIFT ;
}
2012-11-03 06:50:41 +09:00
}
/*
 * stat_show - seq_file show callback that prints one statistics report per
 * entry on f2fs_stat_list.
 * @s: seq_file that receives the text
 * @v: not referenced in the body
 *
 * f2fs_stat_mutex is held for the whole list walk. For each entry,
 * update_general_status() is called before anything is printed,
 * f2fs_update_sit_info() is called right before the BDF line and
 * update_mem_info() right before the memory lines.
 *
 * Always returns 0.
 */
static int stat_show ( struct seq_file * s , void * v )
{
2013-05-14 20:06:46 +08:00
struct f2fs_stat_info * si ;
2012-11-03 06:50:41 +09:00
int i = 0 ;
int j ;
2013-01-14 20:08:16 +08:00
mutex_lock ( & f2fs_stat_mutex ) ;
2013-05-14 20:06:46 +08:00
/* one report per registered f2fs_stat_info; i numbers them in list order */
list_for_each_entry ( si , & f2fs_stat_list , stat_list ) {
2012-11-03 06:50:41 +09:00
update_general_status ( si - > sbi ) ;
2017-10-26 10:31:22 +02:00
seq_printf ( s , " \n =====[ partition info(%pg). #%d, %s, CP: %s]===== \n " ,
2016-03-18 09:46:10 -07:00
si - > sbi - > sb - > s_bdev , i + + ,
2017-10-26 10:31:22 +02:00
f2fs_readonly ( si - > sbi - > sb ) ? " RO " : " RW " ,
2018-08-20 19:21:43 -07:00
is_set_ckpt_flags ( si - > sbi , CP_DISABLED_FLAG ) ?
" Disabled " : ( f2fs_cp_error ( si - > sbi ) ? " Error " : " Good " ) ) ;
2012-12-31 13:59:09 +08:00
seq_printf ( s , " [SB: 1] [CP: 2] [SIT: %d] [NAT: %d] " ,
si - > sit_area_segs , si - > nat_area_segs ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " [SSA: %d] [MAIN: %d " ,
si - > ssa_area_segs , si - > main_area_segs ) ;
seq_printf ( s , " (OverProv:%d Resv:%d)] \n \n " ,
si - > overp_segs , si - > rsvd_segs ) ;
2020-02-25 19:08:16 -08:00
seq_printf ( s , " Current Time Sec: %llu / Mounted Time Sec: %llu \n \n " ,
ktime_get_boottime_seconds ( ) ,
SIT_I ( si - > sbi ) - > mounted_time ) ;
2016-08-18 21:01:18 +08:00
/* the discard block count is only shown when the DISCARD option tests true */
if ( test_opt ( si - > sbi , DISCARD ) )
seq_printf ( s , " Utilization: %u%% (%u valid blocks, %u discard blocks) \n " ,
si - > utilization , si - > valid_count , si - > discard_blks ) ;
else
seq_printf ( s , " Utilization: %u%% (%u valid blocks) \n " ,
si - > utilization , si - > valid_count ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " - Node: %u (Inode: %u, " ,
si - > valid_node_count , si - > valid_inode_count ) ;
seq_printf ( s , " Other: %u) \n - Data: %u \n " ,
si - > valid_node_count - si - > valid_inode_count ,
si - > valid_count - si - > valid_node_count ) ;
2015-07-15 17:28:53 +08:00
seq_printf ( s , " - Inline_xattr Inode: %u \n " ,
si - > inline_xattr ) ;
2013-11-26 11:08:57 +09:00
seq_printf ( s , " - Inline_data Inode: %u \n " ,
si - > inline_inode ) ;
2014-10-13 20:00:16 -07:00
seq_printf ( s , " - Inline_dentry Inode: %u \n " ,
si - > inline_dir ) ;
2020-08-31 11:09:49 +09:00
seq_printf ( s , " - Compressed Inode: %u, Blocks: %llu \n " ,
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
si - > compr_inode , si - > compr_blocks ) ;
2017-02-01 15:40:11 -08:00
seq_printf ( s , " - Orphan/Append/Update Inode: %u, %u, %u \n " ,
si - > orphans , si - > append , si - > update ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " \n Main area: %d segs, %d secs %d zones \n " ,
si - > main_area_segs , si - > main_area_sections ,
si - > main_area_zones ) ;
2020-06-28 10:58:44 +08:00
/* table header, then one row per CURSEG_* type indexed into the si arrays */
seq_printf ( s , " TYPE %8s %8s %8s %10s %10s %10s \n " ,
" segno " , " secno " , " zoneno " , " dirty_seg " , " full_seg " , " valid_blk " ) ;
seq_printf ( s , " - COLD data: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_COLD_DATA ] ,
si - > cursec [ CURSEG_COLD_DATA ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_COLD_DATA ] ,
si - > dirty_seg [ CURSEG_COLD_DATA ] ,
si - > full_seg [ CURSEG_COLD_DATA ] ,
si - > valid_blks [ CURSEG_COLD_DATA ] ) ;
seq_printf ( s , " - WARM data: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_WARM_DATA ] ,
si - > cursec [ CURSEG_WARM_DATA ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_WARM_DATA ] ,
si - > dirty_seg [ CURSEG_WARM_DATA ] ,
si - > full_seg [ CURSEG_WARM_DATA ] ,
si - > valid_blks [ CURSEG_WARM_DATA ] ) ;
seq_printf ( s , " - HOT data: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_HOT_DATA ] ,
si - > cursec [ CURSEG_HOT_DATA ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_HOT_DATA ] ,
si - > dirty_seg [ CURSEG_HOT_DATA ] ,
si - > full_seg [ CURSEG_HOT_DATA ] ,
si - > valid_blks [ CURSEG_HOT_DATA ] ) ;
seq_printf ( s , " - Dir dnode: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_HOT_NODE ] ,
si - > cursec [ CURSEG_HOT_NODE ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_HOT_NODE ] ,
si - > dirty_seg [ CURSEG_HOT_NODE ] ,
si - > full_seg [ CURSEG_HOT_NODE ] ,
si - > valid_blks [ CURSEG_HOT_NODE ] ) ;
seq_printf ( s , " - File dnode: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_WARM_NODE ] ,
si - > cursec [ CURSEG_WARM_NODE ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_WARM_NODE ] ,
si - > dirty_seg [ CURSEG_WARM_NODE ] ,
si - > full_seg [ CURSEG_WARM_NODE ] ,
si - > valid_blks [ CURSEG_WARM_NODE ] ) ;
seq_printf ( s , " - Indir nodes: %8d %8d %8d %10u %10u %10u \n " ,
2012-11-03 06:50:41 +09:00
si - > curseg [ CURSEG_COLD_NODE ] ,
si - > cursec [ CURSEG_COLD_NODE ] ,
2020-06-28 10:58:44 +08:00
si - > curzone [ CURSEG_COLD_NODE ] ,
si - > dirty_seg [ CURSEG_COLD_NODE ] ,
si - > full_seg [ CURSEG_COLD_NODE ] ,
si - > valid_blks [ CURSEG_COLD_NODE ] ) ;
f2fs: introduce inmem curseg
Previous implementation of aligned pinfile allocation will:
- allocate new segment on cold data log no matter whether last used
segment is partially used or not, it makes IOs more random;
- force concurrent cold data/GCed IO going into warm data area, it
can make a bad effect on hot/cold data separation;
In this patch, we introduce a new type of log named 'inmem curseg',
the differents from normal curseg is:
- it reuses existed segment type (CURSEG_XXX_NODE/DATA);
- it only exists in memory, its segno, blkofs, summary will not b
persisted into checkpoint area;
With this new feature, we can enhance scalability of log, special
allocators can be created for purposes:
- pure lfs allocator for aligned pinfile allocation or file
defragmentation
- pure ssr allocator for later feature
So that, let's update aligned pinfile allocation to use this new
inmem curseg fwk.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 21:14:45 +08:00
/* the two rows below print only segno/secno/zoneno, without the last three columns */
seq_printf ( s , " - Pinned file: %8d %8d %8d \n " ,
si - > curseg [ CURSEG_COLD_DATA_PINNED ] ,
si - > cursec [ CURSEG_COLD_DATA_PINNED ] ,
si - > curzone [ CURSEG_COLD_DATA_PINNED ] ) ;
f2fs: support age threshold based garbage collection
There are several issues in current background GC algorithm:
- valid blocks is one of key factors during cost overhead calculation,
so if segment has less valid block, however even its age is young or
it locates hot segment, CB algorithm will still choose the segment as
victim, it's not appropriate.
- GCed data/node will go to existing logs, no matter in-there datas'
update frequency is the same or not, it may mix hot and cold data
again.
- GC alloctor mainly use LFS type segment, it will cost free segment
more quickly.
This patch introduces a new algorithm named age threshold based
garbage collection to solve above issues, there are three steps
mainly:
1. select a source victim:
- set an age threshold, and select candidates beased threshold:
e.g.
0 means youngest, 100 means oldest, if we set age threshold to 80
then select dirty segments which has age in range of [80, 100] as
candiddates;
- set candidate_ratio threshold, and select candidates based the
ratio, so that we can shrink candidates to those oldest segments;
- select target segment with fewest valid blocks in order to
migrate blocks with minimum cost;
2. select a target victim:
- select candidates beased age threshold;
- set candidate_radius threshold, search candidates whose age is
around source victims, searching radius should less than the
radius threshold.
- select target segment with most valid blocks in order to avoid
migrating current target segment.
3. merge valid blocks from source victim into target victim with
SSR alloctor.
Test steps:
- create 160 dirty segments:
* half of them have 128 valid blocks per segment
* left of them have 384 valid blocks per segment
- run background GC
Benefit: GC count and block movement count both decrease obviously:
- Before:
- Valid: 86
- Dirty: 1
- Prefree: 11
- Free: 6001 (6001)
GC calls: 162 (BG: 220)
- data segments : 160 (160)
- node segments : 2 (2)
Try to move 41454 blocks (BG: 41454)
- data blocks : 40960 (40960)
- node blocks : 494 (494)
IPU: 0 blocks
SSR: 0 blocks in 0 segments
LFS: 41364 blocks in 81 segments
- After:
- Valid: 87
- Dirty: 0
- Prefree: 4
- Free: 6008 (6008)
GC calls: 75 (BG: 76)
- data segments : 74 (74)
- node segments : 1 (1)
Try to move 12813 blocks (BG: 12813)
- data blocks : 12544 (12544)
- node blocks : 269 (269)
IPU: 0 blocks
SSR: 12032 blocks in 77 segments
LFS: 855 blocks in 2 segments
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 21:14:49 +08:00
seq_printf ( s , " - ATGC data: %8d %8d %8d \n " ,
si - > curseg [ CURSEG_ALL_DATA_ATGC ] ,
si - > cursec [ CURSEG_ALL_DATA_ATGC ] ,
si - > curzone [ CURSEG_ALL_DATA_ATGC ] ) ;
2012-11-03 06:50:41 +09:00
/* "Valid" is derived: main area segments minus dirty, prefree and free ones */
seq_printf ( s , " \n - Valid: %d \n - Dirty: %d \n " ,
si - > main_area_segs - si - > dirty_count -
si - > prefree_count - si - > free_segs ,
si - > dirty_count ) ;
seq_printf ( s , " - Prefree: %d \n - Free: %d (%d) \n \n " ,
si - > prefree_count , si - > free_segs , si - > free_secs ) ;
2016-01-09 13:45:17 -08:00
seq_printf ( s , " CP calls: %d (BG: %d) \n " ,
si - > cp_count , si - > bg_cp_count ) ;
2018-09-29 18:31:27 +08:00
seq_printf ( s , " - cp blocks : %u \n " , si - > meta_count [ META_CP ] ) ;
seq_printf ( s , " - sit blocks : %u \n " ,
si - > meta_count [ META_SIT ] ) ;
seq_printf ( s , " - nat blocks : %u \n " ,
si - > meta_count [ META_NAT ] ) ;
seq_printf ( s , " - ssa blocks : %u \n " ,
si - > meta_count [ META_SSA ] ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " GC calls: %d (BG: %d) \n " ,
si - > call_count , si - > bg_gc ) ;
2014-12-23 08:37:39 +09:00
seq_printf ( s , " - data segments : %d (%d) \n " ,
si - > data_segs , si - > bg_data_segs ) ;
seq_printf ( s , " - node segments : %d (%d) \n " ,
si - > node_segs , si - > bg_node_segs ) ;
seq_printf ( s , " Try to move %d blocks (BG: %d) \n " , si - > tot_blks ,
si - > bg_data_blks + si - > bg_node_blks ) ;
seq_printf ( s , " - data blocks : %d (%d) \n " , si - > data_blks ,
si - > bg_data_blks ) ;
seq_printf ( s , " - node blocks : %d (%d) \n " , si - > node_blks ,
si - > bg_node_blks ) ;
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
/* first value sums the BG_GC and FG_GC counters, the parenthesised one is BG_GC only */
seq_printf ( s , " Skipped : atomic write %llu (%llu) \n " ,
si - > skipped_atomic_files [ BG_GC ] +
si - > skipped_atomic_files [ FG_GC ] ,
si - > skipped_atomic_files [ BG_GC ] ) ;
2018-09-29 18:31:28 +08:00
seq_printf ( s , " BG skip : IO: %u, Other: %u \n " ,
si - > io_skip_bggc , si - > other_skip_bggc ) ;
2015-08-19 19:13:25 +08:00
seq_puts ( s , " \n Extent Cache: \n " ) ;
2015-09-30 17:38:48 +08:00
seq_printf ( s , " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu \n " ,
2015-08-19 19:12:20 +08:00
si - > hit_largest , si - > hit_cached ,
2015-08-19 19:13:25 +08:00
si - > hit_rbtree ) ;
2015-09-30 17:38:48 +08:00
/* prints 0 instead of dividing when total_ext is zero */
seq_printf ( s , " - Hit Ratio: %llu%% (%llu / %llu) \n " ,
2015-08-19 19:13:25 +08:00
! si - > total_ext ? 0 :
2015-09-30 17:38:48 +08:00
div64_u64 ( si - > hit_total * 100 , si - > total_ext ) ,
2015-08-19 19:13:25 +08:00
si - > hit_total , si - > total_ext ) ;
2015-12-31 15:24:14 -08:00
seq_printf ( s , " - Inner Struct Count: tree: %d(%d), node: %d \n " ,
si - > ext_tree , si - > zombie_tree , si - > ext_node ) ;
2014-01-17 14:44:39 -06:00
seq_puts ( s , " \n Balancing F2FS Async: \n " ) ;
2018-11-12 00:46:46 +08:00
seq_printf ( s , " - DIO (R: %4d, W: %4d) \n " ,
si - > nr_dio_read , si - > nr_dio_write ) ;
2018-10-16 10:20:53 -07:00
/* NOTE(review): this format string opens "(" after IO_R and never closes it */
seq_printf ( s , " - IO_R (Data: %4d, Node: %4d, Meta: %4d \n " ,
si - > nr_rd_data , si - > nr_rd_node , si - > nr_rd_meta ) ;
seq_printf ( s , " - IO_W (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
2017-04-18 19:27:39 +08:00
" Discard: (%4d %4d)) cmd: %4d undiscard:%4u \n " ,
2017-01-11 10:20:04 -08:00
si - > nr_wb_cp_data , si - > nr_wb_data ,
2017-03-25 17:19:58 +08:00
si - > nr_flushing , si - > nr_flushed ,
2017-09-14 10:18:01 +08:00
si - > flush_list_empty ,
2017-03-25 17:19:59 +08:00
si - > nr_discarding , si - > nr_discarded ,
2017-04-18 19:27:39 +08:00
si - > nr_discard_cmd , si - > undiscard_blks ) ;
2017-03-22 17:23:45 +08:00
seq_printf ( s , " - inmem: %4d, atomic IO: %4d (Max. %4d), "
" volatile IO: %4d (Max. %4d) \n " ,
si - > inmem_pages , si - > aw_cnt , si - > max_aw_cnt ,
si - > vw_cnt , si - > max_vw_cnt ) ;
2016-10-20 19:09:57 -07:00
seq_printf ( s , " - nodes: %4d in %4d \n " ,
2012-11-03 06:50:41 +09:00
si - > ndirty_node , si - > node_pages ) ;
2016-10-20 19:09:57 -07:00
seq_printf ( s , " - dents: %4d in dirs:%4d (%4d) \n " ,
2016-05-20 11:10:10 -07:00
si - > ndirty_dent , si - > ndirty_dirs , si - > ndirty_all ) ;
2016-10-20 19:09:57 -07:00
seq_printf ( s , " - datas: %4d in files:%4d \n " ,
2015-12-17 17:14:44 +08:00
si - > ndirty_data , si - > ndirty_files ) ;
2017-11-13 17:46:38 -08:00
seq_printf ( s , " - quota datas: %4d in quota files:%4d \n " ,
si - > ndirty_qdata , si - > nquota_files ) ;
2016-10-20 19:09:57 -07:00
seq_printf ( s , " - meta: %4d in %4d \n " ,
2012-11-03 06:50:41 +09:00
si - > ndirty_meta , si - > meta_pages ) ;
2016-10-20 19:09:57 -07:00
seq_printf ( s , " - imeta: %4d \n " ,
2016-09-18 23:30:08 +08:00
si - > ndirty_imeta ) ;
2015-01-07 11:09:37 -08:00
seq_printf ( s , " - NATs: %9d/%9d \n - SITs: %9d/%9d \n " ,
si - > dirty_nats , si - > nats , si - > dirty_sits , si - > sits ) ;
2017-05-01 18:13:03 -07:00
seq_printf ( s , " - free_nids: %9d/%9d \n - alloc_nids: %9d \n " ,
si - > free_nids , si - > avail_nids , si - > alloc_nids ) ;
2013-07-22 16:33:32 +08:00
/*
 * Text bar: util_valid, util_invalid and util_free characters are
 * emitted in turn, with a separator character between the three runs.
 */
seq_puts ( s , " \n Distribution of User Blocks: " ) ;
seq_puts ( s , " [ valid | invalid | free ] \n " ) ;
seq_puts ( s , " [ " ) ;
2012-11-03 06:50:41 +09:00
for ( j = 0 ; j < si - > util_valid ; j + + )
2013-07-22 16:33:32 +08:00
seq_putc ( s , ' - ' ) ;
seq_putc ( s , ' | ' ) ;
2012-11-03 06:50:41 +09:00
for ( j = 0 ; j < si - > util_invalid ; j + + )
2013-07-22 16:33:32 +08:00
seq_putc ( s , ' - ' ) ;
seq_putc ( s , ' | ' ) ;
2012-11-03 06:50:41 +09:00
for ( j = 0 ; j < si - > util_free ; j + + )
2013-07-22 16:33:32 +08:00
seq_putc ( s , ' - ' ) ;
seq_puts ( s , " ] \n \n " ) ;
2014-12-24 02:16:54 +09:00
seq_printf ( s , " IPU: %u blocks \n " , si - > inplace_count ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " SSR: %u blocks in %u segments \n " ,
si - > block_count [ SSR ] , si - > segment_count [ SSR ] ) ;
seq_printf ( s , " LFS: %u blocks in %u segments \n " ,
si - > block_count [ LFS ] , si - > segment_count [ LFS ] ) ;
/* segment usage info */
2020-01-22 10:51:16 -08:00
f2fs_update_sit_info ( si - > sbi ) ;
2012-11-03 06:50:41 +09:00
seq_printf ( s , " \n BDF: %u, avg. vblocks: %u \n " ,
si - > bimodal , si - > avg_vblocks ) ;
/* memory footprint */
update_mem_info ( si - > sbi ) ;
2015-09-11 14:43:52 +08:00
/* the *_mem values are shifted right by 10 before being printed as KB */
seq_printf ( s , " \n Memory: %llu KB \n " ,
2015-01-10 21:37:36 -08:00
( si - > base_mem + si - > cache_mem + si - > page_mem ) > > 10 ) ;
2015-09-11 14:43:52 +08:00
seq_printf ( s , " - static: %llu KB \n " ,
2015-01-10 21:37:36 -08:00
si - > base_mem > > 10 ) ;
2015-09-11 14:43:52 +08:00
seq_printf ( s , " - cached: %llu KB \n " ,
2015-01-10 21:37:36 -08:00
si - > cache_mem > > 10 ) ;
2015-09-11 14:43:52 +08:00
seq_printf ( s , " - paged : %llu KB \n " ,
2015-01-10 21:37:36 -08:00
si - > page_mem > > 10 ) ;
2012-11-03 06:50:41 +09:00
}
2013-01-14 20:08:16 +08:00
mutex_unlock ( & f2fs_stat_mutex ) ;
2012-11-03 06:50:41 +09:00
return 0 ;
}
2018-11-05 09:41:48 -05:00
DEFINE_SHOW_ATTRIBUTE ( stat ) ;
2020-01-22 10:51:16 -08:00
# endif
2012-11-03 06:50:41 +09:00
2013-01-15 19:58:47 +09:00
int f2fs_build_stats ( struct f2fs_sb_info * sbi )
2012-11-03 06:50:41 +09:00
{
struct f2fs_super_block * raw_super = F2FS_RAW_SUPER ( sbi ) ;
struct f2fs_stat_info * si ;
2018-09-29 18:31:27 +08:00
int i ;
2012-11-03 06:50:41 +09:00
2017-11-30 19:28:17 +08:00
si = f2fs_kzalloc ( sbi , sizeof ( struct f2fs_stat_info ) , GFP_KERNEL ) ;
2013-07-12 14:47:11 +08:00
if ( ! si )
2012-11-03 06:50:41 +09:00
return - ENOMEM ;
si - > all_area_segs = le32_to_cpu ( raw_super - > segment_count ) ;
si - > sit_area_segs = le32_to_cpu ( raw_super - > segment_count_sit ) ;
si - > nat_area_segs = le32_to_cpu ( raw_super - > segment_count_nat ) ;
si - > ssa_area_segs = le32_to_cpu ( raw_super - > segment_count_ssa ) ;
si - > main_area_segs = le32_to_cpu ( raw_super - > segment_count_main ) ;
si - > main_area_sections = le32_to_cpu ( raw_super - > section_count ) ;
si - > main_area_zones = si - > main_area_sections /
le32_to_cpu ( raw_super - > secs_per_zone ) ;
si - > sbi = sbi ;
2013-07-12 14:47:11 +08:00
sbi - > stat_info = si ;
2013-01-14 20:08:16 +08:00
2015-09-30 17:38:48 +08:00
atomic64_set ( & sbi - > total_hit_ext , 0 ) ;
atomic64_set ( & sbi - > read_hit_rbtree , 0 ) ;
atomic64_set ( & sbi - > read_hit_largest , 0 ) ;
atomic64_set ( & sbi - > read_hit_cached , 0 ) ;
2015-07-15 17:29:49 +08:00
2015-07-15 17:28:53 +08:00
atomic_set ( & sbi - > inline_xattr , 0 ) ;
2014-12-08 19:08:20 +08:00
atomic_set ( & sbi - > inline_inode , 0 ) ;
atomic_set ( & sbi - > inline_dir , 0 ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
atomic_set ( & sbi - > compr_inode , 0 ) ;
2020-08-31 11:09:49 +09:00
atomic64_set ( & sbi - > compr_blocks , 0 ) ;
2014-12-24 02:16:54 +09:00
atomic_set ( & sbi - > inplace_count , 0 ) ;
2018-09-29 18:31:27 +08:00
for ( i = META_CP ; i < META_MAX ; i + + )
atomic_set ( & sbi - > meta_count [ i ] , 0 ) ;
2014-12-08 19:08:20 +08:00
2017-03-22 17:23:45 +08:00
atomic_set ( & sbi - > vw_cnt , 0 ) ;
2016-12-28 13:55:09 -08:00
atomic_set ( & sbi - > max_aw_cnt , 0 ) ;
2017-03-22 17:23:45 +08:00
atomic_set ( & sbi - > max_vw_cnt , 0 ) ;
2016-12-28 13:55:09 -08:00
2013-01-14 20:08:16 +08:00
mutex_lock ( & f2fs_stat_mutex ) ;
list_add_tail ( & si - > stat_list , & f2fs_stat_list ) ;
mutex_unlock ( & f2fs_stat_mutex ) ;
2012-11-03 06:50:41 +09:00
return 0 ;
}
void f2fs_destroy_stats ( struct f2fs_sb_info * sbi )
{
2013-07-12 14:47:11 +08:00
struct f2fs_stat_info * si = F2FS_STAT ( sbi ) ;
2012-11-03 06:50:41 +09:00
2013-01-14 20:08:16 +08:00
mutex_lock ( & f2fs_stat_mutex ) ;
2012-11-03 06:50:41 +09:00
list_del ( & si - > stat_list ) ;
2013-01-14 20:08:16 +08:00
mutex_unlock ( & f2fs_stat_mutex ) ;
2020-09-14 16:47:00 +08:00
kfree ( si ) ;
2012-11-03 06:50:41 +09:00
}
2019-01-04 14:26:18 +01:00
/*
 * f2fs_create_root_stats - create the debugfs directory and its status file.
 *
 * With CONFIG_DEBUG_FS defined, the dentry returned by debugfs_create_dir()
 * is stored in f2fs_debugfs_root and a file is created below it with mode
 * S_IRUGO, NULL private data and &stat_fops as its file operations. The
 * return value of debugfs_create_file() is not checked. Without
 * CONFIG_DEBUG_FS the body is empty.
 */
void __init f2fs_create_root_stats ( void )
2013-01-15 19:58:47 +09:00
{
2020-01-22 10:51:16 -08:00
# ifdef CONFIG_DEBUG_FS
2013-12-03 21:09:29 +08:00
f2fs_debugfs_root = debugfs_create_dir ( " f2fs " , NULL ) ;
2013-12-03 20:11:46 +08:00
2019-01-04 14:26:18 +01:00
debugfs_create_file ( " status " , S_IRUGO , f2fs_debugfs_root , NULL ,
& stat_fops ) ;
2020-01-22 10:51:16 -08:00
# endif
2013-01-15 19:58:47 +09:00
}
/*
 * f2fs_destroy_root_stats - tear down the f2fs debugfs directory
 *
 * With CONFIG_DEBUG_FS defined, the tree rooted at f2fs_debugfs_root is
 * removed with debugfs_remove_recursive() and the pointer is then cleared.
 * Without CONFIG_DEBUG_FS the body is empty.
 */
void f2fs_destroy_root_stats(void)
{
#if defined(CONFIG_DEBUG_FS)
	struct dentry *root = f2fs_debugfs_root;

	debugfs_remove_recursive(root);
	f2fs_debugfs_root = NULL;
#endif
}