linux/fs/f2fs/sysfs.c

1541 lines
42 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* f2fs sysfs interface
*
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
* http://www.samsung.com/
* Copyright (c) 2017 Chao Yu <chao@kernel.org>
*/
#include <linux/compiler.h>
#include <linux/proc_fs.h>
#include <linux/f2fs_fs.h>
#include <linux/seq_file.h>
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
#include <linux/unicode.h>
#include <linux/ioprio.h>
#include <linux/sysfs.h>
#include "f2fs.h"
#include "segment.h"
#include "gc.h"
#include "iostat.h"
#include <trace/events/f2fs.h>
static struct proc_dir_entry *f2fs_proc_root;
/* Sysfs support for f2fs */
enum {
GC_THREAD, /* struct f2fs_gc_thread */
SM_INFO, /* struct f2fs_sm_info */
DCC_INFO, /* struct discard_cmd_control */
NM_INFO, /* struct f2fs_nm_info */
F2FS_SBI, /* struct f2fs_sb_info */
#ifdef CONFIG_F2FS_STAT_FS
STAT_INFO, /* struct f2fs_stat_info */
#endif
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_RATE, /* struct f2fs_fault_info */
FAULT_INFO_TYPE, /* struct f2fs_fault_info */
#endif
RESERVED_BLOCKS, /* struct f2fs_sb_info */
CPRC_INFO, /* struct ckpt_req_control */
ATGC_INFO, /* struct atgc_management */
};
static const char *gc_mode_names[MAX_GC_MODE] = {
"GC_NORMAL",
"GC_IDLE_CB",
"GC_IDLE_GREEDY",
"GC_IDLE_AT",
"GC_URGENT_HIGH",
"GC_URGENT_LOW",
"GC_URGENT_MID"
};
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf);
ssize_t (*store)(struct f2fs_attr *a, struct f2fs_sb_info *sbi,
const char *buf, size_t len);
int struct_type;
int offset;
int id;
};
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf);
static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
{
if (struct_type == GC_THREAD)
return (unsigned char *)sbi->gc_thread;
else if (struct_type == SM_INFO)
return (unsigned char *)SM_I(sbi);
else if (struct_type == DCC_INFO)
return (unsigned char *)SM_I(sbi)->dcc_info;
else if (struct_type == NM_INFO)
return (unsigned char *)NM_I(sbi);
else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS)
return (unsigned char *)sbi;
#ifdef CONFIG_F2FS_FAULT_INJECTION
else if (struct_type == FAULT_INFO_RATE ||
struct_type == FAULT_INFO_TYPE)
return (unsigned char *)&F2FS_OPTION(sbi).fault_info;
#endif
#ifdef CONFIG_F2FS_STAT_FS
else if (struct_type == STAT_INFO)
return (unsigned char *)F2FS_STAT(sbi);
#endif
else if (struct_type == CPRC_INFO)
return (unsigned char *)&sbi->cprc_info;
else if (struct_type == ATGC_INFO)
return (unsigned char *)&sbi->am;
return NULL;
}
static ssize_t dirty_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(dirty_segments(sbi)));
}
static ssize_t free_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(free_segments(sbi)));
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
}
static ssize_t ovp_segments_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(overprovision_segments(sbi)));
}
static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(sbi->kbytes_written +
((f2fs_get_sectors_written(sbi) -
sbi->sectors_written_start) >> 1)));
}
static ssize_t sb_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%lx\n", sbi->s_flag);
}
static ssize_t cp_status_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%x\n", le32_to_cpu(F2FS_CKPT(sbi)->ckpt_flags));
}
static ssize_t pending_discard_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (!SM_I(sbi)->dcc_info)
return -EINVAL;
return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
static ssize_t gc_mode_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%s\n", gc_mode_names[sbi->gc_mode]);
}
static ssize_t features_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
int len = 0;
if (f2fs_sb_has_encrypt(sbi))
len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
if (f2fs_sb_has_blkzoned(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "extra_attr");
if (f2fs_sb_has_project_quota(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "projquota");
if (f2fs_sb_has_inode_chksum(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_checksum");
if (f2fs_sb_has_flexible_inline_xattr(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
f2fs: support flexible inline xattr size Now, in product, more and more features based on file encryption were introduced, their demand of xattr space is increasing, however, inline xattr has fixed-size of 200 bytes, once inline xattr space is full, new increased xattr data would occupy additional xattr block which may bring us more space usage and performance regression during persisting. In order to resolve above issue, it's better to expand inline xattr size flexibly according to user's requirement. So this patch introduces new filesystem feature 'flexible inline xattr', and new mount option 'inline_xattr_size=%u', once mkfs enables the feature, we can use the option to make f2fs supporting flexible inline xattr size. To support this feature, we add extra attribute i_inline_xattr_size in inode layout, indicating that how many space inline xattr borrows from block address mapping space in inode layout, by this, we can easily locate and store flexible-sized inline xattr data in inode. Inode disk layout: +----------------------+ | .i_mode | | ... | | .i_ext | +----------------------+ | .i_extra_isize | | .i_inline_xattr_size |-----------+ | ... | | +----------------------+ | | .i_addr | | | - block address or | | | - inline data | | +----------------------+<---+ v | inline xattr | +---inline xattr range +----------------------+<---+ | .i_nid | +----------------------+ | node_footer | | (nid, ino, offset) | +----------------------+ Note that, we have to cnosider backward compatibility which reserved inline_data space, 200 bytes, all the time, reported by Sheng Yong. Previous inline data or directory always reserved 200 bytes in inode layout, even if inline_xattr is disabled. In order to keep inline_dentry's structure for backward compatibility, we get the space back only from inline_data. Signed-off-by: Chao Yu <yuchao0@huawei.com> Reported-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-09-06 16:59:50 +03:00
len ? ", " : "", "flexible_inline_xattr");
if (f2fs_sb_has_quota_ino(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "quota_ino");
if (f2fs_sb_has_inode_crtime(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
if (f2fs_sb_has_lost_found(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
f2fs: add fs-verity support Add fs-verity support to f2fs. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. The f2fs support for fs-verity consists of: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->readpages() to verify data as it's read from verity files and to support reading verity metadata pages. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by f2fs with only some relatively small changes to f2fs. Extended attributes cannot be used because (a) f2fs limits the total size of an inode's xattr entries to 4096 bytes, which wouldn't be enough for even a single Merkle tree block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity metadata *must* be encrypted when the file is because it contains hashes of the plaintext data. Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Acked-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 19:26:24 +03:00
if (f2fs_sb_has_verity(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
f2fs: add fs-verity support Add fs-verity support to f2fs. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. The f2fs support for fs-verity consists of: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->readpages() to verify data as it's read from verity files and to support reading verity metadata pages. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by f2fs with only some relatively small changes to f2fs. Extended attributes cannot be used because (a) f2fs limits the total size of an inode's xattr entries to 4096 bytes, which wouldn't be enough for even a single Merkle tree block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity metadata *must* be encrypted when the file is because it contains hashes of the plaintext data. Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Acked-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 19:26:24 +03:00
len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
if (f2fs_sb_has_casefold(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
len ? ", " : "", "casefold");
if (f2fs_sb_has_readonly(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "readonly");
f2fs: support data compression This patch tries to support compression in f2fs. - New term named cluster is defined as basic unit of compression, file can be divided into multiple clusters logically. One cluster includes 4 << n (n >= 0) logical pages, compression size is also cluster size, each of cluster can be compressed or not. - In cluster metadata layout, one special flag is used to indicate cluster is compressed one or normal one, for compressed cluster, following metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores data including compress header and compressed data. - In order to eliminate write amplification during overwrite, F2FS only support compression on write-once file, data can be compressed only when all logical blocks in file are valid and cluster compress ratio is lower than specified threshold. - To enable compression on regular inode, there are three ways: * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext Compress metadata layout: [Dnode Structure] +-----------------------------------------------+ | cluster 1 | cluster 2 | ......... | cluster N | +-----------------------------------------------+ . . . . . . . . . Compressed Cluster . . Normal Cluster . +----------+---------+---------+---------+ +---------+---------+---------+---------+ |compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | +----------+---------+---------+---------+ +---------+---------+---------+---------+ . . . . . . +-------------+-------------+----------+----------------------------+ | data length | data chksum | reserved | compressed data | +-------------+-------------+----------+----------------------------+ Changelog: 20190326: - fix error handling of read_end_io(). - remove unneeded comments in f2fs_encrypt_one_page(). 20190327: - fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages(). - don't jump into loop directly to avoid uninitialized variables. - add TODO tag in error path of f2fs_write_cache_pages(). 20190328: - fix wrong merge condition in f2fs_read_multi_pages(). - check compressed file in f2fs_post_read_required(). 20190401 - allow overwrite on non-compressed cluster. - check cluster meta before writing compressed data. 20190402 - don't preallocate blocks for compressed file. - add lz4 compress algorithm - process multiple post read works in one workqueue Now f2fs supports processing post read work in multiple workqueue, it shows low performance due to schedule overhead of multiple workqueue executing orderly. 20190921 - compress: support buffered overwrite C: compress cluster flag V: valid block address N: NEW_ADDR One cluster contain 4 blocks before overwrite after overwrite - VVVV -> CVNN - CVNN -> VVVV - CVNN -> CVNN - CVNN -> CVVV - CVVV -> CVNN - CVVV -> CVVV 20191029 - add kconfig F2FS_FS_COMPRESSION to isolate compression related codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm. note that: will remove lzo backend if Jaegeuk agreed that too. - update codes according to Eric's comments. 20191101 - apply fixes from Jaegeuk 20191113 - apply fixes from Jaegeuk - split workqueue for fsverity 20191216 - apply fixes from Jaegeuk 20200117 - fix to avoid NULL pointer dereference [Jaegeuk Kim] - add tracepoint for f2fs_{,de}compress_pages() - fix many bugs and add some compression stats - fix overwrite/mmap bugs - address 32bit build error, reported by Geert. - bug fixes when handling errors and i_compressed_blocks Reported-by: <noreply@ellerman.id.au> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 13:07:14 +03:00
if (f2fs_sb_has_compression(sbi))
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
f2fs: support data compression This patch tries to support compression in f2fs. - New term named cluster is defined as basic unit of compression, file can be divided into multiple clusters logically. One cluster includes 4 << n (n >= 0) logical pages, compression size is also cluster size, each of cluster can be compressed or not. - In cluster metadata layout, one special flag is used to indicate cluster is compressed one or normal one, for compressed cluster, following metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores data including compress header and compressed data. - In order to eliminate write amplification during overwrite, F2FS only support compression on write-once file, data can be compressed only when all logical blocks in file are valid and cluster compress ratio is lower than specified threshold. - To enable compression on regular inode, there are three ways: * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext Compress metadata layout: [Dnode Structure] +-----------------------------------------------+ | cluster 1 | cluster 2 | ......... | cluster N | +-----------------------------------------------+ . . . . . . . . . Compressed Cluster . . Normal Cluster . +----------+---------+---------+---------+ +---------+---------+---------+---------+ |compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | +----------+---------+---------+---------+ +---------+---------+---------+---------+ . . . . . . +-------------+-------------+----------+----------------------------+ | data length | data chksum | reserved | compressed data | +-------------+-------------+----------+----------------------------+ Changelog: 20190326: - fix error handling of read_end_io(). - remove unneeded comments in f2fs_encrypt_one_page(). 20190327: - fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages(). - don't jump into loop directly to avoid uninitialized variables. - add TODO tag in error path of f2fs_write_cache_pages(). 20190328: - fix wrong merge condition in f2fs_read_multi_pages(). - check compressed file in f2fs_post_read_required(). 20190401 - allow overwrite on non-compressed cluster. - check cluster meta before writing compressed data. 20190402 - don't preallocate blocks for compressed file. - add lz4 compress algorithm - process multiple post read works in one workqueue Now f2fs supports processing post read work in multiple workqueue, it shows low performance due to schedule overhead of multiple workqueue executing orderly. 20190921 - compress: support buffered overwrite C: compress cluster flag V: valid block address N: NEW_ADDR One cluster contain 4 blocks before overwrite after overwrite - VVVV -> CVNN - CVNN -> VVVV - CVNN -> CVNN - CVNN -> CVVV - CVVV -> CVNN - CVVV -> CVVV 20191029 - add kconfig F2FS_FS_COMPRESSION to isolate compression related codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm. note that: will remove lzo backend if Jaegeuk agreed that too. - update codes according to Eric's comments. 20191101 - apply fixes from Jaegeuk 20191113 - apply fixes from Jaegeuk - split workqueue for fsverity 20191216 - apply fixes from Jaegeuk 20200117 - fix to avoid NULL pointer dereference [Jaegeuk Kim] - add tracepoint for f2fs_{,de}compress_pages() - fix many bugs and add some compression stats - fix overwrite/mmap bugs - address 32bit build error, reported by Geert. - bug fixes when handling errors and i_compressed_blocks Reported-by: <noreply@ellerman.id.au> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 13:07:14 +03:00
len ? ", " : "", "compression");
len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "pin_file");
len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%u\n", sbi->current_reserved_blocks);
}
static ssize_t unusable_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
block_t unusable;
if (test_opt(sbi, DISABLE_CHECKPOINT))
unusable = sbi->unusable_block_count;
else
unusable = f2fs_get_unusable_blocks(sbi);
return sysfs_emit(buf, "%llu\n", (unsigned long long)unusable);
}
static ssize_t encoding_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
#if IS_ENABLED(CONFIG_UNICODE)
struct super_block *sb = sbi->sb;
if (f2fs_sb_has_casefold(sbi))
unicode patches for 5.17 This includes patches from Christoph Hellwig to split the large data tables of the unicode subsystem into a loadable module, which allow users to not have them around if case-insensitive filesystems are not to be used. It also includes minor code fixes to unicode and its users, from the same author. There is a trivial conflict in the function encoding_show in fs/f2fs/sysfs.c reported by linux-next between commit 84eab2a899f2 ("f2fs: replace snprintf in show functions with sysfs_emit") and commit a440943e68cd ("unicode: remove the charset field from struct unicode_map"). from my tree. All the patches here have been on linux-next releases for the past months. Signed-off-by: Gabriel Krisman Bertazi <krisman@collabora.com> -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8jAUPq50yNjPBCi4QEuZqsMcppQFAmHeLp0ACgkQQEuZqsMc ppRWdhAAstuibIlhUj1Vae070P92oaxM/Azz3IgyVFWensJyQV1PvbtFQDhyKM4w M3tQ45eK49vVHn+JpLHbiAdZV66rD/sMSsruCVIf/8KNVDisOBQtFar5yxVr0Ion AOMoG6/Xrk8BZlZH62fhtJGtu/EFmeFoGVdC81NdTSroe9G+26we3IULwHSE1lNH XMJFCgU6otuLDOna16U7kL77Tu7GXRJcQe1+2nRJ+u6Agxy2xTo/s4FHuxzRK0/e GsgO1scY6unWM23O6z+qJYazng2Zt3EOZtSGqU4TsvZwjUi2UtAYW1/vAQGc/q3Y hGxPYGgKC1VrXLfIcuyng7j0vFPtADbdHMbsJPoyy+Nz4znDJ81IAKAHMO1in3C8 CHKjW+6InmXNye/uwdRt8Tx49jxUHmWUbQRT5FwMDpzC7MAL+DVdPpVVQgpLVM/H gW3YpBEk5qQvVdh8DWZVW3rT3SnMX/v0+u+76FsMHKYNJMNrCnP6vXpCPQl/Gyut ycgK7qVF3o/bgNBf072H3ZBZajTv7ePvacP4Wth7m9I2ykk+p4IjQLpTC5rJK0By VC1xS4im2VqiIWE9eE5y9cXU1oa/AfOcOF+7FZcxT13IL6hKTtd4+H4yKgdcNsyk 7RjpGgjp+SU51/EilhEqMFgEe07CURxwGwhApizBSiTIOgZS96U= =4q9x -----END PGP SIGNATURE----- Merge tag 'unicode-for-next-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode Pull unicode updates from Gabriel Krisman Bertazi: "This includes patches from Christoph Hellwig to split the large data tables of the unicode subsystem into a loadable module, which allow users to not have them around if case-insensitive filesystems are not to be used. It also includes minor code fixes to unicode and its users, from the same author. All the patches here have been on linux-next releases for the past months" * tag 'unicode-for-next-5.17' of git://git.kernel.org/pub/scm/linux/kernel/git/krisman/unicode: unicode: only export internal symbols for the selftests unicode: Add utf8-data module unicode: cache the normalization tables in struct unicode_map unicode: move utf8cursor to utf8-selftest.c unicode: simplify utf8len unicode: remove the unused utf8{,n}age{min,max} functions unicode: pass a UNICODE_AGE() tripple to utf8_load unicode: mark the version field in struct unicode_map unsigned unicode: remove the charset field from struct unicode_map f2fs: simplify f2fs_sb_read_encoding ext4: simplify ext4_sb_read_encoding
2022-01-17 06:40:02 +03:00
return sysfs_emit(buf, "UTF-8 (%d.%d.%d)\n",
(sb->s_encoding->version >> 16) & 0xff,
(sb->s_encoding->version >> 8) & 0xff,
sb->s_encoding->version & 0xff);
#endif
return sysfs_emit(buf, "(none)\n");
}
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n", SIT_I(sbi)->mounted_time);
}
#ifdef CONFIG_F2FS_STAT_FS
static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->tot_blks -
(si->bg_data_blks + si->bg_node_blks)));
}
static ssize_t moved_blocks_background_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
return sysfs_emit(buf, "%llu\n",
(unsigned long long)(si->bg_data_blks + si->bg_node_blks));
}
static ssize_t avg_vblocks_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
si->dirty_count = dirty_segments(sbi);
f2fs_update_sit_info(sbi);
return sysfs_emit(buf, "%llu\n", (unsigned long long)(si->avg_vblocks));
}
#endif
static ssize_t main_blkaddr_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "%llu\n",
(unsigned long long)MAIN_BLKADDR(sbi));
}
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
unsigned char *ptr = NULL;
unsigned int *ui;
ptr = __struct_ptr(sbi, a->struct_type);
if (!ptr)
return -EINVAL;
if (!strcmp(a->attr.name, "extension_list")) {
__u8 (*extlist)[F2FS_EXTENSION_LEN] =
sbi->raw_super->extension_list;
int cold_count = le32_to_cpu(sbi->raw_super->extension_count);
int hot_count = sbi->raw_super->hot_ext_count;
int len = 0, i;
len += scnprintf(buf + len, PAGE_SIZE - len,
"cold file extension:\n");
for (i = 0; i < cold_count; i++)
len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
len += scnprintf(buf + len, PAGE_SIZE - len,
"hot file extension:\n");
for (i = cold_count; i < cold_count + hot_count; i++)
len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
return len;
}
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
struct ckpt_req_control *cprc = &sbi->cprc_info;
int class = IOPRIO_PRIO_CLASS(cprc->ckpt_thread_ioprio);
int data = IOPRIO_PRIO_DATA(cprc->ckpt_thread_ioprio);
if (class != IOPRIO_CLASS_RT && class != IOPRIO_CLASS_BE)
return -EINVAL;
return sysfs_emit(buf, "%s,%d\n",
class == IOPRIO_CLASS_RT ? "rt" : "be", data);
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block"))
return sysfs_emit(buf, "%llu\n", sbi->compr_written_block);
if (!strcmp(a->attr.name, "compr_saved_block"))
return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block);
if (!strcmp(a->attr.name, "compr_new_inode"))
return sysfs_emit(buf, "%u\n", sbi->compr_new_inode);
#endif
if (!strcmp(a->attr.name, "gc_segment_mode"))
return sysfs_emit(buf, "%u\n", sbi->gc_segment_mode);
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
return sysfs_emit(buf, "%u\n",
sbi->gc_reclaimed_segs[sbi->gc_segment_mode]);
}
if (!strcmp(a->attr.name, "current_atomic_write")) {
s64 current_write = atomic64_read(&sbi->current_atomic_write);
return sysfs_emit(buf, "%lld\n", current_write);
}
if (!strcmp(a->attr.name, "peak_atomic_write"))
return sysfs_emit(buf, "%lld\n", sbi->peak_atomic_write);
if (!strcmp(a->attr.name, "committed_atomic_block"))
return sysfs_emit(buf, "%llu\n", sbi->committed_atomic_block);
if (!strcmp(a->attr.name, "revoked_atomic_block"))
return sysfs_emit(buf, "%llu\n", sbi->revoked_atomic_block);
ui = (unsigned int *)(ptr + a->offset);
return sysfs_emit(buf, "%u\n", *ui);
}
f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-29 19:20:41 +03:00
static ssize_t __sbi_store(struct f2fs_attr *a,
struct f2fs_sb_info *sbi,
const char *buf, size_t count)
{
unsigned char *ptr;
unsigned long t;
unsigned int *ui;
ssize_t ret;
ptr = __struct_ptr(sbi, a->struct_type);
if (!ptr)
return -EINVAL;
if (!strcmp(a->attr.name, "extension_list")) {
const char *name = strim((char *)buf);
bool set = true, hot;
if (!strncmp(name, "[h]", 3))
hot = true;
else if (!strncmp(name, "[c]", 3))
hot = false;
else
return -EINVAL;
name += 3;
if (*name == '!') {
name++;
set = false;
}
if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN)
return -EINVAL;
f2fs_down_write(&sbi->sb_lock);
f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-29 19:20:41 +03:00
ret = f2fs_update_extension_list(sbi, name, hot, set);
if (ret)
goto out;
ret = f2fs_commit_super(sbi, false);
if (ret)
f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-29 19:20:41 +03:00
f2fs_update_extension_list(sbi, name, hot, !set);
out:
f2fs_up_write(&sbi->sb_lock);
return ret ? ret : count;
}
if (!strcmp(a->attr.name, "ckpt_thread_ioprio")) {
const char *name = strim((char *)buf);
struct ckpt_req_control *cprc = &sbi->cprc_info;
int class;
long data;
int ret;
if (!strncmp(name, "rt,", 3))
class = IOPRIO_CLASS_RT;
else if (!strncmp(name, "be,", 3))
class = IOPRIO_CLASS_BE;
else
return -EINVAL;
name += 3;
ret = kstrtol(name, 10, &data);
if (ret)
return ret;
if (data >= IOPRIO_NR_LEVELS || data < 0)
return -EINVAL;
cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data);
if (test_opt(sbi, MERGE_CHECKPOINT)) {
ret = set_task_ioprio(cprc->f2fs_issue_ckpt,
cprc->ckpt_thread_ioprio);
if (ret)
return ret;
}
return count;
}
ui = (unsigned int *)(ptr + a->offset);
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret < 0)
return ret;
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX))
return -EINVAL;
if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
return -EINVAL;
#endif
if (a->struct_type == RESERVED_BLOCKS) {
spin_lock(&sbi->stat_lock);
if (t > (unsigned long)(sbi->user_block_count -
f2fs: fix to reserve space for IO align feature https://bugzilla.kernel.org/show_bug.cgi?id=204137 With below script, we will hit panic during new segment allocation: DISK=bingo.img MOUNT_DIR=/mnt/f2fs dd if=/dev/zero of=$DISK bs=1M count=105 mkfs.f2fe -a 1 -o 19 -t 1 -z 1 -f -q $DISK mount -t f2fs $DISK $MOUNT_DIR -o "noinline_dentry,flush_merge,noextent_cache,mode=lfs,io_bits=7,fsync_mode=strict" for (( i = 0; i < 4096; i++ )); do name=`head /dev/urandom | tr -dc A-Za-z0-9 | head -c 10` mkdir $MOUNT_DIR/$name done umount $MOUNT_DIR rm $DISK --- Core dump --- Call Trace: allocate_segment_by_default+0x9d/0x100 [f2fs] f2fs_allocate_data_block+0x3c0/0x5c0 [f2fs] do_write_page+0x62/0x110 [f2fs] f2fs_outplace_write_data+0x43/0xc0 [f2fs] f2fs_do_write_data_page+0x386/0x560 [f2fs] __write_data_page+0x706/0x850 [f2fs] f2fs_write_cache_pages+0x267/0x6a0 [f2fs] f2fs_write_data_pages+0x19c/0x2e0 [f2fs] do_writepages+0x1c/0x70 __filemap_fdatawrite_range+0xaa/0xe0 filemap_fdatawrite+0x1f/0x30 f2fs_sync_dirty_inodes+0x74/0x1f0 [f2fs] block_operations+0xdc/0x350 [f2fs] f2fs_write_checkpoint+0x104/0x1150 [f2fs] f2fs_sync_fs+0xa2/0x120 [f2fs] f2fs_balance_fs_bg+0x33c/0x390 [f2fs] f2fs_write_node_pages+0x4c/0x1f0 [f2fs] do_writepages+0x1c/0x70 __writeback_single_inode+0x45/0x320 writeback_sb_inodes+0x273/0x5c0 wb_writeback+0xff/0x2e0 wb_workfn+0xa1/0x370 process_one_work+0x138/0x350 worker_thread+0x4d/0x3d0 kthread+0x109/0x140 ret_from_fork+0x25/0x30 The root cause here is, with IO alignment feature enables, in worst case, we need F2FS_IO_SIZE() free blocks space for single one 4k write due to IO alignment feature will fill dummy pages to make IO being aligned. So we will easily run out of free segments during non-inline directory's data writeback, even in process of foreground GC. In order to fix this issue, I just propose to reserve additional free space for IO alignment feature to handle worst case of free space usage ratio during FGGC. Fixes: 0a595ebaaa6b ("f2fs: support IO alignment for DATA and NODE writes") Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-12-11 16:27:36 +03:00
F2FS_OPTION(sbi).root_reserved_blocks -
sbi->blocks_per_seg *
SM_I(sbi)->additional_reserved_segments)) {
spin_unlock(&sbi->stat_lock);
return -EINVAL;
}
*ui = t;
sbi->current_reserved_blocks = min(sbi->reserved_blocks,
sbi->user_block_count - valid_user_blocks(sbi));
spin_unlock(&sbi->stat_lock);
return count;
}
if (!strcmp(a->attr.name, "discard_io_aware_gran")) {
if (t > MAX_PLIST_NUM)
return -EINVAL;
if (!f2fs_block_unit_discard(sbi))
return -EINVAL;
if (t == *ui)
return count;
*ui = t;
return count;
}
if (!strcmp(a->attr.name, "discard_granularity")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
f2fs: introduce discard_unit mount option As James Z reported in bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=213877 [1.] One-line summary of the problem: Mount multiple SMR block devices exceed certain number cause system non-response [2.] Full description of the problem/report: Created some F2FS on SMR devices (mkfs.f2fs -m), then mounted in sequence. Each device is the same Model: HGST HSH721414AL (Size 14TB). Empirically, found that when the amount of SMR device * 1.5Gb > System RAM, the system ran out of memory and hung. No dmesg output. For example, 24 SMR Disk need 24*1.5GB = 36GB. A system with 32G RAM can only mount 21 devices, the 22nd device will be a reproducible cause of system hang. The number of SMR devices with other FS mounted on this system does not interfere with the result above. [3.] Keywords (i.e., modules, networking, kernel): F2FS, SMR, Memory [4.] Kernel information [4.1.] Kernel version (uname -a): Linux 5.13.4-200.fc34.x86_64 #1 SMP Tue Jul 20 20:27:29 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux [4.2.] Kernel .config file: Default Fedora 34 with f2fs-tools-1.14.0-2.fc34.x86_64 [5.] Most recent kernel version which did not have the bug: None [6.] Output of Oops.. message (if applicable) with symbolic information resolved (see Documentation/admin-guide/oops-tracing.rst) None [7.] A small shell script or example program which triggers the problem (if possible) mount /dev/sdX /mnt/0X [8.] Memory consumption With 24 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 46 36 0 0 10 10 Swap: 0 0 0 With 3 * 14T SMR Block device with F2FS free -g total used free shared buff/cache available Mem: 7 5 0 0 1 1 Swap: 7 0 7 The root cause is, there are three bitmaps: - cur_valid_map - ckpt_valid_map - discard_map and each of them will cost ~500MB memory, {cur, ckpt}_valid_map are necessary, but discard_map is optional, since this bitmap will only be useful in mountpoint that small discard is enabled. For a blkzoned device such as SMR or ZNS devices, f2fs will only issue discard for a section(zone) when all blocks of that section are invalid, so, for such device, we don't need small discard functionality at all. This patch introduces a new mountoption "discard_unit=block|segment| section" to support issuing discard with different basic unit which is aligned to block, segment or section, so that user can specify "discard_unit=segment" or "discard_unit=section" to disable small discard functionality. Note that this mount option can not be changed by remount() due to related metadata need to be initialized during mount(). In order to save memory, let's use "discard_unit=section" for blkzoned device by default. Signed-off-by: Chao Yu <chao@kernel.org> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-08-03 03:15:43 +03:00
if (!f2fs_block_unit_discard(sbi))
return -EINVAL;
if (t == *ui)
return count;
*ui = t;
return count;
}
if (!strcmp(a->attr.name, "max_ordered_discard")) {
if (t == 0 || t > MAX_PLIST_NUM)
return -EINVAL;
if (!f2fs_block_unit_discard(sbi))
return -EINVAL;
*ui = t;
return count;
}
if (!strcmp(a->attr.name, "discard_urgent_util")) {
if (t > 100)
return -EINVAL;
*ui = t;
return count;
}
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
}
if (!strcmp(a->attr.name, "gc_urgent")) {
if (t == 0) {
sbi->gc_mode = GC_NORMAL;
} else if (t == 1) {
sbi->gc_mode = GC_URGENT_HIGH;
if (sbi->gc_thread) {
sbi->gc_thread->gc_wake = true;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
wake_up_discard_thread(sbi, true);
}
} else if (t == 2) {
sbi->gc_mode = GC_URGENT_LOW;
} else if (t == 3) {
sbi->gc_mode = GC_URGENT_MID;
if (sbi->gc_thread) {
sbi->gc_thread->gc_wake = true;
wake_up_interruptible_all(
&sbi->gc_thread->gc_wait_queue_head);
}
} else {
return -EINVAL;
}
return count;
}
if (!strcmp(a->attr.name, "gc_idle")) {
f2fs: support age threshold based garbage collection There are several issues in current background GC algorithm: - valid blocks is one of key factors during cost overhead calculation, so if segment has less valid block, however even its age is young or it locates hot segment, CB algorithm will still choose the segment as victim, it's not appropriate. - GCed data/node will go to existing logs, no matter in-there datas' update frequency is the same or not, it may mix hot and cold data again. - GC alloctor mainly use LFS type segment, it will cost free segment more quickly. This patch introduces a new algorithm named age threshold based garbage collection to solve above issues, there are three steps mainly: 1. select a source victim: - set an age threshold, and select candidates beased threshold: e.g. 0 means youngest, 100 means oldest, if we set age threshold to 80 then select dirty segments which has age in range of [80, 100] as candiddates; - set candidate_ratio threshold, and select candidates based the ratio, so that we can shrink candidates to those oldest segments; - select target segment with fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates beased age threshold; - set candidate_radius threshold, search candidates whose age is around source victims, searching radius should less than the radius threshold. - select target segment with most valid blocks in order to avoid migrating current target segment. 3. merge valid blocks from source victim into target victim with SSR alloctor. Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu <yuchao0@huawei.com> [Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up] Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 16:14:49 +03:00
if (t == GC_IDLE_CB) {
sbi->gc_mode = GC_IDLE_CB;
f2fs: support age threshold based garbage collection There are several issues in current background GC algorithm: - valid blocks is one of key factors during cost overhead calculation, so if segment has less valid block, however even its age is young or it locates hot segment, CB algorithm will still choose the segment as victim, it's not appropriate. - GCed data/node will go to existing logs, no matter in-there datas' update frequency is the same or not, it may mix hot and cold data again. - GC alloctor mainly use LFS type segment, it will cost free segment more quickly. This patch introduces a new algorithm named age threshold based garbage collection to solve above issues, there are three steps mainly: 1. select a source victim: - set an age threshold, and select candidates beased threshold: e.g. 0 means youngest, 100 means oldest, if we set age threshold to 80 then select dirty segments which has age in range of [80, 100] as candiddates; - set candidate_ratio threshold, and select candidates based the ratio, so that we can shrink candidates to those oldest segments; - select target segment with fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates beased age threshold; - set candidate_radius threshold, search candidates whose age is around source victims, searching radius should less than the radius threshold. - select target segment with most valid blocks in order to avoid migrating current target segment. 3. merge valid blocks from source victim into target victim with SSR alloctor. Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu <yuchao0@huawei.com> [Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up] Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 16:14:49 +03:00
} else if (t == GC_IDLE_GREEDY) {
sbi->gc_mode = GC_IDLE_GREEDY;
f2fs: support age threshold based garbage collection There are several issues in current background GC algorithm: - valid blocks is one of key factors during cost overhead calculation, so if segment has less valid block, however even its age is young or it locates hot segment, CB algorithm will still choose the segment as victim, it's not appropriate. - GCed data/node will go to existing logs, no matter in-there datas' update frequency is the same or not, it may mix hot and cold data again. - GC alloctor mainly use LFS type segment, it will cost free segment more quickly. This patch introduces a new algorithm named age threshold based garbage collection to solve above issues, there are three steps mainly: 1. select a source victim: - set an age threshold, and select candidates beased threshold: e.g. 0 means youngest, 100 means oldest, if we set age threshold to 80 then select dirty segments which has age in range of [80, 100] as candiddates; - set candidate_ratio threshold, and select candidates based the ratio, so that we can shrink candidates to those oldest segments; - select target segment with fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates beased age threshold; - set candidate_radius threshold, search candidates whose age is around source victims, searching radius should less than the radius threshold. - select target segment with most valid blocks in order to avoid migrating current target segment. 3. merge valid blocks from source victim into target victim with SSR alloctor. Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu <yuchao0@huawei.com> [Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up] Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 16:14:49 +03:00
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
sbi->gc_mode = GC_IDLE_AT;
f2fs: support age threshold based garbage collection There are several issues in current background GC algorithm: - valid blocks is one of key factors during cost overhead calculation, so if segment has less valid block, however even its age is young or it locates hot segment, CB algorithm will still choose the segment as victim, it's not appropriate. - GCed data/node will go to existing logs, no matter in-there datas' update frequency is the same or not, it may mix hot and cold data again. - GC alloctor mainly use LFS type segment, it will cost free segment more quickly. This patch introduces a new algorithm named age threshold based garbage collection to solve above issues, there are three steps mainly: 1. select a source victim: - set an age threshold, and select candidates beased threshold: e.g. 0 means youngest, 100 means oldest, if we set age threshold to 80 then select dirty segments which has age in range of [80, 100] as candiddates; - set candidate_ratio threshold, and select candidates based the ratio, so that we can shrink candidates to those oldest segments; - select target segment with fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates beased age threshold; - set candidate_radius threshold, search candidates whose age is around source victims, searching radius should less than the radius threshold. - select target segment with most valid blocks in order to avoid migrating current target segment. 3. merge valid blocks from source victim into target victim with SSR alloctor. Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu <yuchao0@huawei.com> [Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up] Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 16:14:49 +03:00
} else {
sbi->gc_mode = GC_NORMAL;
f2fs: support age threshold based garbage collection There are several issues in current background GC algorithm: - valid blocks is one of key factors during cost overhead calculation, so if segment has less valid block, however even its age is young or it locates hot segment, CB algorithm will still choose the segment as victim, it's not appropriate. - GCed data/node will go to existing logs, no matter in-there datas' update frequency is the same or not, it may mix hot and cold data again. - GC alloctor mainly use LFS type segment, it will cost free segment more quickly. This patch introduces a new algorithm named age threshold based garbage collection to solve above issues, there are three steps mainly: 1. select a source victim: - set an age threshold, and select candidates beased threshold: e.g. 0 means youngest, 100 means oldest, if we set age threshold to 80 then select dirty segments which has age in range of [80, 100] as candiddates; - set candidate_ratio threshold, and select candidates based the ratio, so that we can shrink candidates to those oldest segments; - select target segment with fewest valid blocks in order to migrate blocks with minimum cost; 2. select a target victim: - select candidates beased age threshold; - set candidate_radius threshold, search candidates whose age is around source victims, searching radius should less than the radius threshold. - select target segment with most valid blocks in order to avoid migrating current target segment. 3. merge valid blocks from source victim into target victim with SSR alloctor. Test steps: - create 160 dirty segments: * half of them have 128 valid blocks per segment * left of them have 384 valid blocks per segment - run background GC Benefit: GC count and block movement count both decrease obviously: - Before: - Valid: 86 - Dirty: 1 - Prefree: 11 - Free: 6001 (6001) GC calls: 162 (BG: 220) - data segments : 160 (160) - node segments : 2 (2) Try to move 41454 blocks (BG: 41454) - data blocks : 40960 (40960) - node blocks : 494 (494) IPU: 0 blocks SSR: 0 blocks in 0 segments LFS: 41364 blocks in 81 segments - After: - Valid: 87 - Dirty: 0 - Prefree: 4 - Free: 6008 (6008) GC calls: 75 (BG: 76) - data segments : 74 (74) - node segments : 1 (1) Try to move 12813 blocks (BG: 12813) - data blocks : 12544 (12544) - node blocks : 269 (269) IPU: 0 blocks SSR: 12032 blocks in 77 segments LFS: 855 blocks in 2 segments Signed-off-by: Chao Yu <yuchao0@huawei.com> [Jaegeuk Kim: fix a bug along with pinfile in-mem segment & clean up] Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 16:14:49 +03:00
}
return count;
}
if (!strcmp(a->attr.name, "gc_remaining_trials")) {
spin_lock(&sbi->gc_remaining_trials_lock);
sbi->gc_remaining_trials = t;
spin_unlock(&sbi->gc_remaining_trials_lock);
return count;
}
#ifdef CONFIG_F2FS_IOSTAT
f2fs: UBSAN: set boolean value iostat_enable correctly When setting /sys/fs/f2fs/<DEV>/iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. Signed-off-by: Sheng Yong <shengyong1@huawei.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-01-15 23:02:15 +03:00
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
if (!sbi->iostat_enable)
f2fs_reset_iostat(sbi);
return count;
}
if (!strcmp(a->attr.name, "iostat_period_ms")) {
if (t < MIN_IOSTAT_PERIOD_MS || t > MAX_IOSTAT_PERIOD_MS)
return -EINVAL;
spin_lock_irq(&sbi->iostat_lock);
sbi->iostat_period_ms = (unsigned int)t;
spin_unlock_irq(&sbi->iostat_lock);
return count;
}
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (!strcmp(a->attr.name, "compr_written_block") ||
!strcmp(a->attr.name, "compr_saved_block")) {
if (t != 0)
return -EINVAL;
sbi->compr_written_block = 0;
sbi->compr_saved_block = 0;
return count;
}
if (!strcmp(a->attr.name, "compr_new_inode")) {
if (t != 0)
return -EINVAL;
sbi->compr_new_inode = 0;
return count;
}
if (!strcmp(a->attr.name, "compress_percent")) {
if (t == 0 || t > 100)
return -EINVAL;
*ui = t;
return count;
}
if (!strcmp(a->attr.name, "compress_watermark")) {
if (t == 0 || t > 100)
return -EINVAL;
*ui = t;
return count;
}
#endif
if (!strcmp(a->attr.name, "atgc_candidate_ratio")) {
if (t > 100)
return -EINVAL;
sbi->am.candidate_ratio = t;
return count;
}
if (!strcmp(a->attr.name, "atgc_age_weight")) {
if (t > 100)
return -EINVAL;
sbi->am.age_weight = t;
return count;
}
if (!strcmp(a->attr.name, "gc_segment_mode")) {
if (t < MAX_GC_MODE)
sbi->gc_segment_mode = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "gc_reclaimed_segments")) {
if (t != 0)
return -EINVAL;
sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0;
return count;
}
if (!strcmp(a->attr.name, "seq_file_ra_mul")) {
if (t >= MIN_RA_MUL && t <= MAX_RA_MUL)
sbi->seq_file_ra_mul = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "max_fragment_chunk")) {
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
sbi->max_fragment_chunk = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "max_fragment_hole")) {
if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE)
sbi->max_fragment_hole = t;
else
return -EINVAL;
return count;
}
if (!strcmp(a->attr.name, "peak_atomic_write")) {
if (t != 0)
return -EINVAL;
sbi->peak_atomic_write = 0;
return count;
}
if (!strcmp(a->attr.name, "committed_atomic_block")) {
if (t != 0)
return -EINVAL;
sbi->committed_atomic_block = 0;
return count;
}
if (!strcmp(a->attr.name, "revoked_atomic_block")) {
if (t != 0)
return -EINVAL;
sbi->revoked_atomic_block = 0;
return count;
}
if (!strcmp(a->attr.name, "readdir_ra")) {
sbi->readdir_ra = !!t;
return count;
}
f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 <qixiaoyu1@xiaomi.com> Signed-off-by: xiongping1 <xiongping1@xiaomi.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2022-12-02 04:37:15 +03:00
if (!strcmp(a->attr.name, "hot_data_age_threshold")) {
if (t == 0 || t >= sbi->warm_data_age_threshold)
return -EINVAL;
if (t == *ui)
return count;
*ui = (unsigned int)t;
return count;
}
if (!strcmp(a->attr.name, "warm_data_age_threshold")) {
if (t <= sbi->hot_data_age_threshold)
f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 <qixiaoyu1@xiaomi.com> Signed-off-by: xiongping1 <xiongping1@xiaomi.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2022-12-02 04:37:15 +03:00
return -EINVAL;
if (t == *ui)
return count;
*ui = (unsigned int)t;
return count;
}
if (!strcmp(a->attr.name, "last_age_weight")) {
if (t > 100)
return -EINVAL;
if (t == *ui)
return count;
*ui = (unsigned int)t;
return count;
}
if (!strcmp(a->attr.name, "ipu_policy")) {
if (t >= BIT(F2FS_IPU_MAX))
return -EINVAL;
if (t && f2fs_lfs_mode(sbi))
return -EINVAL;
SM_I(sbi)->ipu_policy = (unsigned int)t;
return count;
}
f2fs: UBSAN: set boolean value iostat_enable correctly When setting /sys/fs/f2fs/<DEV>/iostat_enable with non-bool value, UBSAN reports the following warning. [ 7562.295484] ================================================================================ [ 7562.296531] UBSAN: Undefined behaviour in fs/f2fs/f2fs.h:2776:10 [ 7562.297651] load of value 64 is not a valid value for type '_Bool' [ 7562.298642] CPU: 1 PID: 7487 Comm: dd Not tainted 4.20.0-rc4+ #79 [ 7562.298653] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 7562.298662] Call Trace: [ 7562.298760] dump_stack+0x46/0x5b [ 7562.298811] ubsan_epilogue+0x9/0x40 [ 7562.298830] __ubsan_handle_load_invalid_value+0x72/0x90 [ 7562.298863] f2fs_file_write_iter+0x29f/0x3f0 [ 7562.298905] __vfs_write+0x115/0x160 [ 7562.298922] vfs_write+0xa7/0x190 [ 7562.298934] ksys_write+0x50/0xc0 [ 7562.298973] do_syscall_64+0x4a/0xe0 [ 7562.298992] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 7562.299001] RIP: 0033:0x7fa45ec19c00 [ 7562.299004] Code: 73 01 c3 48 8b 0d 88 92 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d dd eb 2c 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 ce 8f 01 00 48 89 04 24 [ 7562.299044] RSP: 002b:00007ffca52b49e8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 7562.299052] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fa45ec19c00 [ 7562.299059] RDX: 0000000000000400 RSI: 000000000093f000 RDI: 0000000000000001 [ 7562.299065] RBP: 000000000093f000 R08: 0000000000000004 R09: 0000000000000000 [ 7562.299071] R10: 00007ffca52b47b0 R11: 0000000000000246 R12: 0000000000000400 [ 7562.299077] R13: 000000000093f000 R14: 000000000093f400 R15: 0000000000000000 [ 7562.299091] ================================================================================ So, if iostat_enable is enabled, set its value as true. Signed-off-by: Sheng Yong <shengyong1@huawei.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-01-15 23:02:15 +03:00
*ui = (unsigned int)t;
return count;
}
static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
struct f2fs_sb_info *sbi,
const char *buf, size_t count)
{
ssize_t ret;
bool gc_entry = (!strcmp(a->attr.name, "gc_urgent") ||
a->struct_type == GC_THREAD);
f2fs: avoid potential deadlock in f2fs_sbi_store [ 155.018460] ====================================================== [ 155.021431] WARNING: possible circular locking dependency detected [ 155.024339] 4.18.0-rc3+ #5 Tainted: G OE [ 155.026879] ------------------------------------------------------ [ 155.029783] umount/2901 is trying to acquire lock: [ 155.032187] 00000000c4282f1f (kn->count#130){++++}, at: kernfs_remove+0x1f/0x30 [ 155.035439] [ 155.035439] but task is already holding lock: [ 155.038892] 0000000056e4307b (&type->s_umount_key#41){++++}, at: deactivate_super+0x33/0x50 [ 155.042602] [ 155.042602] which lock already depends on the new lock. [ 155.042602] [ 155.047465] [ 155.047465] the existing dependency chain (in reverse order) is: [ 155.051354] [ 155.051354] -> #1 (&type->s_umount_key#41){++++}: [ 155.054768] f2fs_sbi_store+0x61/0x460 [f2fs] [ 155.057083] kernfs_fop_write+0x113/0x1a0 [ 155.059277] __vfs_write+0x36/0x180 [ 155.061250] vfs_write+0xbe/0x1b0 [ 155.063179] ksys_write+0x55/0xc0 [ 155.065068] do_syscall_64+0x60/0x1b0 [ 155.067071] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.069529] [ 155.069529] -> #0 (kn->count#130){++++}: [ 155.072421] __kernfs_remove+0x26f/0x2e0 [ 155.074452] kernfs_remove+0x1f/0x30 [ 155.076342] kobject_del.part.5+0xe/0x40 [ 155.078354] f2fs_put_super+0x12d/0x290 [f2fs] [ 155.080500] generic_shutdown_super+0x6c/0x110 [ 155.082655] kill_block_super+0x21/0x50 [ 155.084634] kill_f2fs_super+0x9c/0xc0 [f2fs] [ 155.086726] deactivate_locked_super+0x3f/0x70 [ 155.088826] cleanup_mnt+0x3b/0x70 [ 155.090584] task_work_run+0x93/0xc0 [ 155.092367] exit_to_usermode_loop+0xf0/0x100 [ 155.094466] do_syscall_64+0x162/0x1b0 [ 155.096312] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 155.098603] [ 155.098603] other info that might help us debug this: [ 155.098603] [ 155.102418] Possible unsafe locking scenario: [ 155.102418] [ 155.105134] CPU0 CPU1 [ 155.107037] ---- ---- [ 155.108910] lock(&type->s_umount_key#41); [ 155.110674] lock(kn->count#130); [ 155.113010] lock(&type->s_umount_key#41); [ 155.115608] lock(kn->count#130); Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-07-15 03:58:08 +03:00
if (gc_entry) {
if (!down_read_trylock(&sbi->sb->s_umount))
return -EAGAIN;
}
f2fs: clean up symbol namespace As Ted reported: "Hi, I was looking at f2fs's sources recently, and I noticed that there is a very large number of non-static symbols which don't have a f2fs prefix. There's well over a hundred (see attached below). As one example, in fs/f2fs/dir.c there is: unsigned char get_de_type(struct f2fs_dir_entry *de) This function is clearly only useful for f2fs, but it has a generic name. This means that if any other file system tries to have the same symbol name, there will be a symbol conflict and the kernel would not successfully build. It also means that when someone is looking f2fs sources, it's not at all obvious whether a function such as read_data_page(), invalidate_blocks(), is a generic kernel function found in the fs, mm, or block layers, or a f2fs specific function. You might want to fix this at some point. Hopefully Kent's bcachefs isn't similarly using genericly named functions, since that might cause conflicts with f2fs's functions --- but just as this would be a problem that we would rightly insist that Kent fix, this is something that we should have rightly insisted that f2fs should have fixed before it was integrated into the mainline kernel. acquire_orphan_inode add_ino_entry add_orphan_inode allocate_data_block allocate_new_segments alloc_nid alloc_nid_done alloc_nid_failed available_free_memory ...." This patch adds "f2fs_" prefix for all non-static symbols in order to: a) avoid conflict with other kernel generic symbols; b) to indicate the function is f2fs specific one instead of generic one; Reported-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-29 19:20:41 +03:00
ret = __sbi_store(a, sbi, buf, count);
if (gc_entry)
up_read(&sbi->sb->s_umount);
return ret;
}
static ssize_t f2fs_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->show ? a->show(a, sbi, buf) : 0;
}
static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t len)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->store ? a->store(a, sbi, buf, len) : 0;
}
static void f2fs_sb_release(struct kobject *kobj)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_kobj);
complete(&sbi->s_kobj_unregister);
}
/*
* Note that there are three feature list entries:
* 1) /sys/fs/f2fs/features
* : shows runtime features supported by in-kernel f2fs along with Kconfig.
* - ref. F2FS_FEATURE_RO_ATTR()
*
* 2) /sys/fs/f2fs/$s_id/features <deprecated>
* : shows on-disk features enabled by mkfs.f2fs, used for old kernels. This
* won't add new feature anymore, and thus, users should check entries in 3)
* instead of this 2).
*
* 3) /sys/fs/f2fs/$s_id/feature_list
* : shows on-disk features enabled by mkfs.f2fs per instance, which follows
* sysfs entry rule where each entry should expose single value.
* This list covers old feature list provided by 2) and beyond. Therefore,
* please add new on-disk feature in this list only.
* - ref. F2FS_SB_FEATURE_RO_ATTR()
*/
static ssize_t f2fs_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_feature_show, \
}
static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
if (F2FS_HAS_FEATURE(sbi, a->id))
return sysfs_emit(buf, "supported\n");
return sysfs_emit(buf, "unsupported\n");
}
#define F2FS_SB_FEATURE_RO_ATTR(_name, _feat) \
static struct f2fs_attr f2fs_attr_sb_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_sb_feature_show, \
.id = F2FS_FEATURE_##_feat, \
}
#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
.struct_type = _struct_type, \
.offset = _offset \
}
#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0444, \
f2fs_sbi_show, NULL, \
offsetof(struct struct_name, elname))
#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
F2FS_ATTR_OFFSET(struct_type, name, 0644, \
f2fs_sbi_show, f2fs_sbi_store, \
offsetof(struct struct_name, elname))
#define F2FS_GENERAL_RO_ATTR(name) \
static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL)
#ifdef CONFIG_F2FS_STAT_FS
#define STAT_INFO_RO_ATTR(name, elname) \
F2FS_RO_ATTR(STAT_INFO, f2fs_stat_info, name, elname)
#endif
#define GC_THREAD_RW_ATTR(name, elname) \
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, name, elname)
#define SM_INFO_RW_ATTR(name, elname) \
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, name, elname)
#define SM_INFO_GENERAL_RW_ATTR(elname) \
SM_INFO_RW_ATTR(elname, elname)
#define DCC_INFO_RW_ATTR(name, elname) \
F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, name, elname)
#define DCC_INFO_GENERAL_RW_ATTR(elname) \
DCC_INFO_RW_ATTR(elname, elname)
#define NM_INFO_RW_ATTR(name, elname) \
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, name, elname)
#define NM_INFO_GENERAL_RW_ATTR(elname) \
NM_INFO_RW_ATTR(elname, elname)
#define F2FS_SBI_RW_ATTR(name, elname) \
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, name, elname)
#define F2FS_SBI_GENERAL_RW_ATTR(elname) \
F2FS_SBI_RW_ATTR(elname, elname)
#define F2FS_SBI_GENERAL_RO_ATTR(elname) \
F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, elname, elname)
#ifdef CONFIG_F2FS_FAULT_INJECTION
#define FAULT_INFO_GENERAL_RW_ATTR(type, elname) \
F2FS_RW_ATTR(type, f2fs_fault_info, elname, elname)
#endif
#define RESERVED_BLOCKS_GENERAL_RW_ATTR(elname) \
F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, elname, elname)
#define CPRC_INFO_GENERAL_RW_ATTR(elname) \
F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, elname, elname)
#define ATGC_INFO_RW_ATTR(name, elname) \
F2FS_RW_ATTR(ATGC_INFO, atgc_management, name, elname)
/* GC_THREAD ATTR */
GC_THREAD_RW_ATTR(gc_urgent_sleep_time, urgent_sleep_time);
GC_THREAD_RW_ATTR(gc_min_sleep_time, min_sleep_time);
GC_THREAD_RW_ATTR(gc_max_sleep_time, max_sleep_time);
GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
/* SM_INFO ATTR */
SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments);
SM_INFO_GENERAL_RW_ATTR(ipu_policy);
SM_INFO_GENERAL_RW_ATTR(min_ipu_util);
SM_INFO_GENERAL_RW_ATTR(min_fsync_blocks);
SM_INFO_GENERAL_RW_ATTR(min_seq_blocks);
SM_INFO_GENERAL_RW_ATTR(min_hot_blocks);
SM_INFO_GENERAL_RW_ATTR(min_ssr_sections);
/* DCC_INFO ATTR */
DCC_INFO_RW_ATTR(max_small_discards, max_discards);
DCC_INFO_GENERAL_RW_ATTR(max_discard_request);
DCC_INFO_GENERAL_RW_ATTR(min_discard_issue_time);
DCC_INFO_GENERAL_RW_ATTR(mid_discard_issue_time);
DCC_INFO_GENERAL_RW_ATTR(max_discard_issue_time);
DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran);
DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util);
DCC_INFO_GENERAL_RW_ATTR(discard_granularity);
DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard);
/* NM_INFO ATTR */
NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks);
NM_INFO_GENERAL_RW_ATTR(ram_thresh);
NM_INFO_GENERAL_RW_ATTR(ra_nid_pages);
NM_INFO_GENERAL_RW_ATTR(dirty_nats_ratio);
/* F2FS_SBI ATTR */
F2FS_RW_ATTR(F2FS_SBI, f2fs_super_block, extension_list, extension_list);
F2FS_SBI_RW_ATTR(gc_idle, gc_mode);
F2FS_SBI_RW_ATTR(gc_urgent, gc_mode);
F2FS_SBI_RW_ATTR(cp_interval, interval_time[CP_TIME]);
F2FS_SBI_RW_ATTR(idle_interval, interval_time[REQ_TIME]);
F2FS_SBI_RW_ATTR(discard_idle_interval, interval_time[DISCARD_TIME]);
F2FS_SBI_RW_ATTR(gc_idle_interval, interval_time[GC_TIME]);
F2FS_SBI_RW_ATTR(umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]);
F2FS_SBI_RW_ATTR(gc_pin_file_thresh, gc_pin_file_threshold);
F2FS_SBI_RW_ATTR(gc_reclaimed_segments, gc_reclaimed_segs);
F2FS_SBI_GENERAL_RW_ATTR(max_victim_search);
F2FS_SBI_GENERAL_RW_ATTR(migration_granularity);
F2FS_SBI_GENERAL_RW_ATTR(dir_level);
#ifdef CONFIG_F2FS_IOSTAT
F2FS_SBI_GENERAL_RW_ATTR(iostat_enable);
F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms);
#endif
F2FS_SBI_GENERAL_RW_ATTR(readdir_ra);
F2FS_SBI_GENERAL_RW_ATTR(max_io_bytes);
F2FS_SBI_GENERAL_RW_ATTR(data_io_flag);
F2FS_SBI_GENERAL_RW_ATTR(node_io_flag);
F2FS_SBI_GENERAL_RW_ATTR(gc_remaining_trials);
F2FS_SBI_GENERAL_RW_ATTR(seq_file_ra_mul);
F2FS_SBI_GENERAL_RW_ATTR(gc_segment_mode);
F2FS_SBI_GENERAL_RW_ATTR(max_fragment_chunk);
F2FS_SBI_GENERAL_RW_ATTR(max_fragment_hole);
#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_SBI_GENERAL_RW_ATTR(compr_written_block);
F2FS_SBI_GENERAL_RW_ATTR(compr_saved_block);
F2FS_SBI_GENERAL_RW_ATTR(compr_new_inode);
F2FS_SBI_GENERAL_RW_ATTR(compress_percent);
F2FS_SBI_GENERAL_RW_ATTR(compress_watermark);
#endif
/* atomic write */
F2FS_SBI_GENERAL_RO_ATTR(current_atomic_write);
F2FS_SBI_GENERAL_RW_ATTR(peak_atomic_write);
F2FS_SBI_GENERAL_RW_ATTR(committed_atomic_block);
F2FS_SBI_GENERAL_RW_ATTR(revoked_atomic_block);
/* block age extent cache */
F2FS_SBI_GENERAL_RW_ATTR(hot_data_age_threshold);
F2FS_SBI_GENERAL_RW_ATTR(warm_data_age_threshold);
F2FS_SBI_GENERAL_RW_ATTR(last_age_weight);
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
#endif
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
STAT_INFO_RO_ATTR(cp_foreground_calls, cp_count);
STAT_INFO_RO_ATTR(cp_background_calls, bg_cp_count);
STAT_INFO_RO_ATTR(gc_foreground_calls, call_count);
STAT_INFO_RO_ATTR(gc_background_calls, bg_gc);
#endif
/* FAULT_INFO ATTR */
#ifdef CONFIG_F2FS_FAULT_INJECTION
FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_RATE, inject_rate);
FAULT_INFO_GENERAL_RW_ATTR(FAULT_INFO_TYPE, inject_type);
#endif
/* RESERVED_BLOCKS ATTR */
RESERVED_BLOCKS_GENERAL_RW_ATTR(reserved_blocks);
/* CPRC_INFO ATTR */
CPRC_INFO_GENERAL_RW_ATTR(ckpt_thread_ioprio);
/* ATGC_INFO ATTR */
ATGC_INFO_RW_ATTR(atgc_candidate_ratio, candidate_ratio);
ATGC_INFO_RW_ATTR(atgc_candidate_count, max_candidate_count);
ATGC_INFO_RW_ATTR(atgc_age_weight, age_weight);
ATGC_INFO_RW_ATTR(atgc_age_threshold, age_threshold);
F2FS_GENERAL_RO_ATTR(dirty_segments);
F2FS_GENERAL_RO_ATTR(free_segments);
F2FS_GENERAL_RO_ATTR(ovp_segments);
F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
F2FS_GENERAL_RO_ATTR(encoding);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
F2FS_GENERAL_RO_ATTR(gc_mode);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_GENERAL_RO_ATTR(moved_blocks_background);
F2FS_GENERAL_RO_ATTR(moved_blocks_foreground);
F2FS_GENERAL_RO_ATTR(avg_vblocks);
#endif
#ifdef CONFIG_FS_ENCRYPTION
F2FS_FEATURE_RO_ATTR(encryption);
F2FS_FEATURE_RO_ATTR(test_dummy_encryption_v2);
#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(encrypted_casefold);
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
F2FS_FEATURE_RO_ATTR(block_zoned);
#endif
F2FS_FEATURE_RO_ATTR(atomic_write);
F2FS_FEATURE_RO_ATTR(extra_attr);
F2FS_FEATURE_RO_ATTR(project_quota);
F2FS_FEATURE_RO_ATTR(inode_checksum);
F2FS_FEATURE_RO_ATTR(flexible_inline_xattr);
F2FS_FEATURE_RO_ATTR(quota_ino);
F2FS_FEATURE_RO_ATTR(inode_crtime);
F2FS_FEATURE_RO_ATTR(lost_found);
f2fs: add fs-verity support Add fs-verity support to f2fs. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. The f2fs support for fs-verity consists of: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->readpages() to verify data as it's read from verity files and to support reading verity metadata pages. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by f2fs with only some relatively small changes to f2fs. Extended attributes cannot be used because (a) f2fs limits the total size of an inode's xattr entries to 4096 bytes, which wouldn't be enough for even a single Merkle tree block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity metadata *must* be encrypted when the file is because it contains hashes of the plaintext data. Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Acked-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 19:26:24 +03:00
#ifdef CONFIG_FS_VERITY
F2FS_FEATURE_RO_ATTR(verity);
f2fs: add fs-verity support Add fs-verity support to f2fs. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. The f2fs support for fs-verity consists of: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->readpages() to verify data as it's read from verity files and to support reading verity metadata pages. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by f2fs with only some relatively small changes to f2fs. Extended attributes cannot be used because (a) f2fs limits the total size of an inode's xattr entries to 4096 bytes, which wouldn't be enough for even a single Merkle tree block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity metadata *must* be encrypted when the file is because it contains hashes of the plaintext data. Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Acked-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 19:26:24 +03:00
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum);
#if IS_ENABLED(CONFIG_UNICODE)
F2FS_FEATURE_RO_ATTR(casefold);
#endif
F2FS_FEATURE_RO_ATTR(readonly);
#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression);
#endif
F2FS_FEATURE_RO_ATTR(pin_file);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_urgent_sleep_time),
ATTR_LIST(gc_min_sleep_time),
ATTR_LIST(gc_max_sleep_time),
ATTR_LIST(gc_no_gc_sleep_time),
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
ATTR_LIST(main_blkaddr),
ATTR_LIST(max_small_discards),
ATTR_LIST(max_discard_request),
ATTR_LIST(min_discard_issue_time),
ATTR_LIST(mid_discard_issue_time),
ATTR_LIST(max_discard_issue_time),
ATTR_LIST(discard_io_aware_gran),
ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
ATTR_LIST(max_ordered_discard),
ATTR_LIST(pending_discard),
ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
ATTR_LIST(min_seq_blocks),
ATTR_LIST(min_hot_blocks),
ATTR_LIST(min_ssr_sections),
ATTR_LIST(max_victim_search),
ATTR_LIST(migration_granularity),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
ATTR_LIST(ra_nid_pages),
ATTR_LIST(dirty_nats_ratio),
ATTR_LIST(max_roll_forward_node_blocks),
ATTR_LIST(cp_interval),
ATTR_LIST(idle_interval),
ATTR_LIST(discard_idle_interval),
ATTR_LIST(gc_idle_interval),
ATTR_LIST(umount_discard_timeout),
#ifdef CONFIG_F2FS_IOSTAT
ATTR_LIST(iostat_enable),
ATTR_LIST(iostat_period_ms),
#endif
ATTR_LIST(readdir_ra),
ATTR_LIST(max_io_bytes),
ATTR_LIST(gc_pin_file_thresh),
ATTR_LIST(extension_list),
#ifdef CONFIG_F2FS_FAULT_INJECTION
ATTR_LIST(inject_rate),
ATTR_LIST(inject_type),
#endif
ATTR_LIST(data_io_flag),
ATTR_LIST(node_io_flag),
ATTR_LIST(gc_remaining_trials),
ATTR_LIST(ckpt_thread_ioprio),
ATTR_LIST(dirty_segments),
ATTR_LIST(free_segments),
ATTR_LIST(ovp_segments),
ATTR_LIST(unusable),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(features),
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
ATTR_LIST(encoding),
ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
ATTR_LIST(cp_background_calls),
ATTR_LIST(gc_foreground_calls),
ATTR_LIST(gc_background_calls),
ATTR_LIST(moved_blocks_foreground),
ATTR_LIST(moved_blocks_background),
ATTR_LIST(avg_vblocks),
#endif
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(unusable_blocks_per_sec),
#endif
#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compr_written_block),
ATTR_LIST(compr_saved_block),
ATTR_LIST(compr_new_inode),
ATTR_LIST(compress_percent),
ATTR_LIST(compress_watermark),
#endif
/* For ATGC */
ATTR_LIST(atgc_candidate_ratio),
ATTR_LIST(atgc_candidate_count),
ATTR_LIST(atgc_age_weight),
ATTR_LIST(atgc_age_threshold),
ATTR_LIST(seq_file_ra_mul),
ATTR_LIST(gc_segment_mode),
ATTR_LIST(gc_reclaimed_segments),
ATTR_LIST(max_fragment_chunk),
ATTR_LIST(max_fragment_hole),
ATTR_LIST(current_atomic_write),
ATTR_LIST(peak_atomic_write),
ATTR_LIST(committed_atomic_block),
ATTR_LIST(revoked_atomic_block),
f2fs: add block_age-based extent cache This patch introduces a runtime hot/cold data separation method for f2fs, in order to improve the accuracy for data temperature classification, reduce the garbage collection overhead after long-term data updates. Enhanced hot/cold data separation can record data block update frequency as "age" of the extent per inode, and take use of the age info to indicate better temperature type for data block allocation: - It records total data blocks allocated since mount; - When file extent has been updated, it calculate the count of data blocks allocated since last update as the age of the extent; - Before the data block allocated, it searches for the age info and chooses the suitable segment for allocation. Test and result: - Prepare: create about 30000 files * 3% for cold files (with cold file extension like .apk, from 3M to 10M) * 50% for warm files (with random file extension like .FcDxq, from 1K to 4M) * 47% for hot files (with hot file extension like .db, from 1K to 256K) - create(5%)/random update(90%)/delete(5%) the files * total write amount is about 70G * fsync will be called for .db files, and buffered write will be used for other files The storage of test device is large enough(128G) so that it will not switch to SSR mode during the test. Benefit: dirty segment count increment reduce about 14% - before: Dirty +21110 - after: Dirty +18286 Signed-off-by: qixiaoyu1 <qixiaoyu1@xiaomi.com> Signed-off-by: xiongping1 <xiongping1@xiaomi.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2022-12-02 04:37:15 +03:00
ATTR_LIST(hot_data_age_threshold),
ATTR_LIST(warm_data_age_threshold),
ATTR_LIST(last_age_weight),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
ATTR_LIST(encryption),
fscrypt: support test_dummy_encryption=v2 v1 encryption policies are deprecated in favor of v2, and some new features (e.g. encryption+casefolding) are only being added for v2. Therefore, the "test_dummy_encryption" mount option (which is used for encryption I/O testing with xfstests) needs to support v2 policies. To do this, extend its syntax to be "test_dummy_encryption=v1" or "test_dummy_encryption=v2". The existing "test_dummy_encryption" (no argument) also continues to be accepted, to specify the default setting -- currently v1, but the next patch changes it to v2. To cleanly support both v1 and v2 while also making it easy to support specifying other encryption settings in the future (say, accepting "$contents_mode:$filenames_mode:v2"), make ext4 and f2fs maintain a pointer to the dummy fscrypt_context rather than using mount flags. To avoid concurrency issues, don't allow test_dummy_encryption to be set or changed during a remount. (The former restriction is new, but xfstests doesn't run into it, so no one should notice.) Tested with 'gce-xfstests -c {ext4,f2fs}/encrypt -g auto'. On ext4, there are two regressions, both of which are test bugs: ext4/023 and ext4/028 fail because they set an xattr and expect it to be stored inline, but the increase in size of the fscrypt_context from 24 to 40 bytes causes this xattr to be spilled into an external block. Link: https://lore.kernel.org/r/20200512233251.118314-4-ebiggers@kernel.org Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Reviewed-by: Theodore Ts'o <tytso@mit.edu> Signed-off-by: Eric Biggers <ebiggers@google.com>
2020-05-13 02:32:50 +03:00
ATTR_LIST(test_dummy_encryption_v2),
#if IS_ENABLED(CONFIG_UNICODE)
ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
ATTR_LIST(block_zoned),
#endif
ATTR_LIST(atomic_write),
ATTR_LIST(extra_attr),
ATTR_LIST(project_quota),
ATTR_LIST(inode_checksum),
f2fs: support flexible inline xattr size Now, in product, more and more features based on file encryption were introduced, their demand of xattr space is increasing, however, inline xattr has fixed-size of 200 bytes, once inline xattr space is full, new increased xattr data would occupy additional xattr block which may bring us more space usage and performance regression during persisting. In order to resolve above issue, it's better to expand inline xattr size flexibly according to user's requirement. So this patch introduces new filesystem feature 'flexible inline xattr', and new mount option 'inline_xattr_size=%u', once mkfs enables the feature, we can use the option to make f2fs supporting flexible inline xattr size. To support this feature, we add extra attribute i_inline_xattr_size in inode layout, indicating that how many space inline xattr borrows from block address mapping space in inode layout, by this, we can easily locate and store flexible-sized inline xattr data in inode. Inode disk layout: +----------------------+ | .i_mode | | ... | | .i_ext | +----------------------+ | .i_extra_isize | | .i_inline_xattr_size |-----------+ | ... | | +----------------------+ | | .i_addr | | | - block address or | | | - inline data | | +----------------------+<---+ v | inline xattr | +---inline xattr range +----------------------+<---+ | .i_nid | +----------------------+ | node_footer | | (nid, ino, offset) | +----------------------+ Note that, we have to cnosider backward compatibility which reserved inline_data space, 200 bytes, all the time, reported by Sheng Yong. Previous inline data or directory always reserved 200 bytes in inode layout, even if inline_xattr is disabled. In order to keep inline_dentry's structure for backward compatibility, we get the space back only from inline_data. Signed-off-by: Chao Yu <yuchao0@huawei.com> Reported-by: Sheng Yong <shengyong1@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2017-09-06 16:59:50 +03:00
ATTR_LIST(flexible_inline_xattr),
ATTR_LIST(quota_ino),
ATTR_LIST(inode_crtime),
ATTR_LIST(lost_found),
f2fs: add fs-verity support Add fs-verity support to f2fs. fs-verity is a filesystem feature that enables transparent integrity protection and authentication of read-only files. It uses a dm-verity like mechanism at the file level: a Merkle tree is used to verify any block in the file in log(filesize) time. It is implemented mainly by helper functions in fs/verity/. See Documentation/filesystems/fsverity.rst for the full documentation. The f2fs support for fs-verity consists of: - Adding a filesystem feature flag and an inode flag for fs-verity. - Implementing the fsverity_operations to support enabling verity on an inode and reading/writing the verity metadata. - Updating ->readpages() to verify data as it's read from verity files and to support reading verity metadata pages. - Updating ->write_begin(), ->write_end(), and ->writepages() to support writing verity metadata pages. - Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl(). Like ext4, f2fs stores the verity metadata (Merkle tree and fsverity_descriptor) past the end of the file, starting at the first 64K boundary beyond i_size. This approach works because (a) verity files are readonly, and (b) pages fully beyond i_size aren't visible to userspace but can be read/written internally by f2fs with only some relatively small changes to f2fs. Extended attributes cannot be used because (a) f2fs limits the total size of an inode's xattr entries to 4096 bytes, which wouldn't be enough for even a single Merkle tree block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity metadata *must* be encrypted when the file is because it contains hashes of the plaintext data. Acked-by: Jaegeuk Kim <jaegeuk@kernel.org> Acked-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 19:26:24 +03:00
#ifdef CONFIG_FS_VERITY
ATTR_LIST(verity),
#endif
ATTR_LIST(sb_checksum),
#if IS_ENABLED(CONFIG_UNICODE)
f2fs: include charset encoding information in the superblock Add charset encoding to f2fs to support casefolding. It is modeled after the same feature introduced in commit c83ad55eaa91 ("ext4: include charset encoding information in the superblock") Currently this is not compatible with encryption, similar to the current ext4 imlpementation. This will change in the future. >From the ext4 patch: """ The s_encoding field stores a magic number indicating the encoding format and version used globally by file and directory names in the filesystem. The s_encoding_flags defines policies for using the charset encoding, like how to handle invalid sequences. The magic number is mapped to the exact charset table, but the mapping is specific to ext4. Since we don't have any commitment to support old encodings, the only encoding I am supporting right now is utf8-12.1.0. The current implementation prevents the user from enabling encoding and per-directory encryption on the same filesystem at the same time. The incompatibility between these features lies in how we do efficient directory searches when we cannot be sure the encryption of the user provided fname will match the actual hash stored in the disk without decrypting every directory entry, because of normalization cases. My quickest solution is to simply block the concurrent use of these features for now, and enable it later, once we have a better solution. """ Signed-off-by: Daniel Rosenberg <drosen@google.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-24 02:05:28 +03:00
ATTR_LIST(casefold),
#endif
ATTR_LIST(readonly),
#ifdef CONFIG_F2FS_FS_COMPRESSION
f2fs: support data compression This patch tries to support compression in f2fs. - New term named cluster is defined as basic unit of compression, file can be divided into multiple clusters logically. One cluster includes 4 << n (n >= 0) logical pages, compression size is also cluster size, each of cluster can be compressed or not. - In cluster metadata layout, one special flag is used to indicate cluster is compressed one or normal one, for compressed cluster, following metadata maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores data including compress header and compressed data. - In order to eliminate write amplification during overwrite, F2FS only support compression on write-once file, data can be compressed only when all logical blocks in file are valid and cluster compress ratio is lower than specified threshold. - To enable compression on regular inode, there are three ways: * chattr +c file * chattr +c dir; touch dir/file * mount w/ -o compress_extension=ext; touch file.ext Compress metadata layout: [Dnode Structure] +-----------------------------------------------+ | cluster 1 | cluster 2 | ......... | cluster N | +-----------------------------------------------+ . . . . . . . . . Compressed Cluster . . Normal Cluster . +----------+---------+---------+---------+ +---------+---------+---------+---------+ |compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 | +----------+---------+---------+---------+ +---------+---------+---------+---------+ . . . . . . +-------------+-------------+----------+----------------------------+ | data length | data chksum | reserved | compressed data | +-------------+-------------+----------+----------------------------+ Changelog: 20190326: - fix error handling of read_end_io(). - remove unneeded comments in f2fs_encrypt_one_page(). 20190327: - fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages(). - don't jump into loop directly to avoid uninitialized variables. - add TODO tag in error path of f2fs_write_cache_pages(). 20190328: - fix wrong merge condition in f2fs_read_multi_pages(). - check compressed file in f2fs_post_read_required(). 20190401 - allow overwrite on non-compressed cluster. - check cluster meta before writing compressed data. 20190402 - don't preallocate blocks for compressed file. - add lz4 compress algorithm - process multiple post read works in one workqueue Now f2fs supports processing post read work in multiple workqueue, it shows low performance due to schedule overhead of multiple workqueue executing orderly. 20190921 - compress: support buffered overwrite C: compress cluster flag V: valid block address N: NEW_ADDR One cluster contain 4 blocks before overwrite after overwrite - VVVV -> CVNN - CVNN -> VVVV - CVNN -> CVNN - CVNN -> CVVV - CVVV -> CVNN - CVVV -> CVVV 20191029 - add kconfig F2FS_FS_COMPRESSION to isolate compression related codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm. note that: will remove lzo backend if Jaegeuk agreed that too. - update codes according to Eric's comments. 20191101 - apply fixes from Jaegeuk 20191113 - apply fixes from Jaegeuk - split workqueue for fsverity 20191216 - apply fixes from Jaegeuk 20200117 - fix to avoid NULL pointer dereference [Jaegeuk Kim] - add tracepoint for f2fs_{,de}compress_pages() - fix many bugs and add some compression stats - fix overwrite/mmap bugs - address 32bit build error, reported by Geert. - bug fixes when handling errors and i_compressed_blocks Reported-by: <noreply@ellerman.id.au> Signed-off-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 13:07:14 +03:00
ATTR_LIST(compression),
#endif
ATTR_LIST(pin_file),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
F2FS_GENERAL_RO_ATTR(sb_status);
F2FS_GENERAL_RO_ATTR(cp_status);
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
ATTR_LIST(cp_status),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_stat);
F2FS_SB_FEATURE_RO_ATTR(encryption, ENCRYPT);
F2FS_SB_FEATURE_RO_ATTR(block_zoned, BLKZONED);
F2FS_SB_FEATURE_RO_ATTR(extra_attr, EXTRA_ATTR);
F2FS_SB_FEATURE_RO_ATTR(project_quota, PRJQUOTA);
F2FS_SB_FEATURE_RO_ATTR(inode_checksum, INODE_CHKSUM);
F2FS_SB_FEATURE_RO_ATTR(flexible_inline_xattr, FLEXIBLE_INLINE_XATTR);
F2FS_SB_FEATURE_RO_ATTR(quota_ino, QUOTA_INO);
F2FS_SB_FEATURE_RO_ATTR(inode_crtime, INODE_CRTIME);
F2FS_SB_FEATURE_RO_ATTR(lost_found, LOST_FOUND);
F2FS_SB_FEATURE_RO_ATTR(verity, VERITY);
F2FS_SB_FEATURE_RO_ATTR(sb_checksum, SB_CHKSUM);
F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD);
F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION);
F2FS_SB_FEATURE_RO_ATTR(readonly, RO);
static struct attribute *f2fs_sb_feat_attrs[] = {
ATTR_LIST(sb_encryption),
ATTR_LIST(sb_block_zoned),
ATTR_LIST(sb_extra_attr),
ATTR_LIST(sb_project_quota),
ATTR_LIST(sb_inode_checksum),
ATTR_LIST(sb_flexible_inline_xattr),
ATTR_LIST(sb_quota_ino),
ATTR_LIST(sb_inode_crtime),
ATTR_LIST(sb_lost_found),
ATTR_LIST(sb_verity),
ATTR_LIST(sb_sb_checksum),
ATTR_LIST(sb_casefold),
ATTR_LIST(sb_compression),
ATTR_LIST(sb_readonly),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_sb_feat);
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
};
static const struct kobj_type f2fs_sb_ktype = {
.default_groups = f2fs_groups,
.sysfs_ops = &f2fs_attr_ops,
.release = f2fs_sb_release,
};
static const struct kobj_type f2fs_ktype = {
.sysfs_ops = &f2fs_attr_ops,
};
static struct kset f2fs_kset = {
.kobj = {.ktype = &f2fs_ktype},
};
static const struct kobj_type f2fs_feat_ktype = {
.default_groups = f2fs_feat_groups,
.sysfs_ops = &f2fs_attr_ops,
};
static struct kobject f2fs_feat = {
.kset = &f2fs_kset,
};
static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->show ? a->show(a, sbi, buf) : 0;
}
static ssize_t f2fs_stat_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t len)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->store ? a->store(a, sbi, buf, len) : 0;
}
static void f2fs_stat_kobj_release(struct kobject *kobj)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_stat_kobj);
complete(&sbi->s_stat_kobj_unregister);
}
static const struct sysfs_ops f2fs_stat_attr_ops = {
.show = f2fs_stat_attr_show,
.store = f2fs_stat_attr_store,
};
static const struct kobj_type f2fs_stat_ktype = {
.default_groups = f2fs_stat_groups,
.sysfs_ops = &f2fs_stat_attr_ops,
.release = f2fs_stat_kobj_release,
};
static ssize_t f2fs_sb_feat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_feature_list_kobj);
struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr);
return a->show ? a->show(a, sbi, buf) : 0;
}
static void f2fs_feature_list_kobj_release(struct kobject *kobj)
{
struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info,
s_feature_list_kobj);
complete(&sbi->s_feature_list_kobj_unregister);
}
static const struct sysfs_ops f2fs_feature_list_attr_ops = {
.show = f2fs_sb_feat_attr_show,
};
static const struct kobj_type f2fs_feature_list_ktype = {
.default_groups = f2fs_sb_feat_groups,
.sysfs_ops = &f2fs_feature_list_attr_ops,
.release = f2fs_feature_list_kobj_release,
};
static int __maybe_unused segment_info_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
unsigned int total_segs =
le32_to_cpu(sbi->raw_super->segment_count_main);
int i;
seq_puts(seq, "format: segment_type|valid_blocks\n"
"segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
for (i = 0; i < total_segs; i++) {
struct seg_entry *se = get_seg_entry(sbi, i);
if ((i % 10) == 0)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u", se->type, se->valid_blocks);
if ((i % 10) == 9 || i == (total_segs - 1))
seq_putc(seq, '\n');
else
seq_putc(seq, ' ');
}
return 0;
}
static int __maybe_unused segment_bits_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
unsigned int total_segs =
le32_to_cpu(sbi->raw_super->segment_count_main);
int i, j;
seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n"
"segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
for (i = 0; i < total_segs; i++) {
struct seg_entry *se = get_seg_entry(sbi, i);
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d|%-3u|", se->type, se->valid_blocks);
for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++)
seq_printf(seq, " %.2x", se->cur_valid_map[j]);
seq_putc(seq, '\n');
}
return 0;
}
static int __maybe_unused victim_bits_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
int i;
seq_puts(seq, "format: victim_secmap bitmaps\n");
for (i = 0; i < MAIN_SECS(sbi); i++) {
if ((i % 10) == 0)
seq_printf(seq, "%-10d", i);
seq_printf(seq, "%d", test_bit(i, dirty_i->victim_secmap) ? 1 : 0);
if ((i % 10) == 9 || i == (MAIN_SECS(sbi) - 1))
seq_putc(seq, '\n');
else
seq_putc(seq, ' ');
}
return 0;
}
static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
int i, count;
seq_puts(seq, "Discard pend list(Show diacrd_cmd count on each entry, .:not exist):\n");
if (!f2fs_realtime_discard_enable(sbi))
return 0;
if (dcc) {
mutex_lock(&dcc->cmd_lock);
for (i = 0; i < MAX_PLIST_NUM; i++) {
struct list_head *pend_list;
struct discard_cmd *dc, *tmp;
if (i % 8 == 0)
seq_printf(seq, " %-3d", i);
count = 0;
pend_list = &dcc->pend_list[i];
list_for_each_entry_safe(dc, tmp, pend_list, list)
count++;
if (count)
seq_printf(seq, " %7d", count);
else
seq_puts(seq, " .");
if (i % 8 == 7)
seq_putc(seq, '\n');
}
seq_putc(seq, '\n');
mutex_unlock(&dcc->cmd_lock);
}
return 0;
}
int __init f2fs_init_sysfs(void)
{
int ret;
kobject_set_name(&f2fs_kset.kobj, "f2fs");
f2fs_kset.kobj.parent = fs_kobj;
ret = kset_register(&f2fs_kset);
if (ret)
return ret;
ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
NULL, "features");
if (ret)
goto put_kobject;
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
if (!f2fs_proc_root) {
ret = -ENOMEM;
goto put_kobject;
}
return 0;
put_kobject:
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
return ret;
}
void f2fs_exit_sysfs(void)
{
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
remove_proc_entry("fs/f2fs", NULL);
f2fs_proc_root = NULL;
}
int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
{
struct super_block *sb = sbi->sb;
int err;
sbi->s_kobj.kset = &f2fs_kset;
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
"%s", sb->s_id);
if (err)
goto put_sb_kobj;
sbi->s_stat_kobj.kset = &f2fs_kset;
init_completion(&sbi->s_stat_kobj_unregister);
err = kobject_init_and_add(&sbi->s_stat_kobj, &f2fs_stat_ktype,
&sbi->s_kobj, "stat");
if (err)
goto put_stat_kobj;
sbi->s_feature_list_kobj.kset = &f2fs_kset;
init_completion(&sbi->s_feature_list_kobj_unregister);
err = kobject_init_and_add(&sbi->s_feature_list_kobj,
&f2fs_feature_list_ktype,
&sbi->s_kobj, "feature_list");
if (err)
goto put_feature_list_kobj;
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
if (!sbi->s_proc) {
err = -ENOMEM;
goto put_feature_list_kobj;
}
proc_create_single_data("segment_info", 0444, sbi->s_proc,
segment_info_seq_show, sb);
proc_create_single_data("segment_bits", 0444, sbi->s_proc,
segment_bits_seq_show, sb);
#ifdef CONFIG_F2FS_IOSTAT
proc_create_single_data("iostat_info", 0444, sbi->s_proc,
iostat_info_seq_show, sb);
#endif
proc_create_single_data("victim_bits", 0444, sbi->s_proc,
victim_bits_seq_show, sb);
proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
discard_plist_seq_show, sb);
return 0;
put_feature_list_kobj:
kobject_put(&sbi->s_feature_list_kobj);
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
put_stat_kobj:
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
put_sb_kobj:
kobject_put(&sbi->s_kobj);
wait_for_completion(&sbi->s_kobj_unregister);
return err;
}
void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
{
remove_proc_subtree(sbi->sb->s_id, f2fs_proc_root);
kobject_put(&sbi->s_stat_kobj);
wait_for_completion(&sbi->s_stat_kobj_unregister);
kobject_put(&sbi->s_feature_list_kobj);
wait_for_completion(&sbi->s_feature_list_kobj_unregister);
kobject_put(&sbi->s_kobj);
f2fs: wait for sysfs kobject removal before freeing f2fs_sb_info syzkaller found that with CONFIG_DEBUG_KOBJECT_RELEASE=y, unmounting an f2fs filesystem could result in the following splat: kobject: 'loop5' ((____ptrval____)): kobject_release, parent 0000000000000000 (delayed 250) kobject: 'f2fs_xattr_entry-7:5' ((____ptrval____)): kobject_release, parent 0000000000000000 (delayed 750) ------------[ cut here ]------------ ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x98 WARNING: CPU: 0 PID: 699 at lib/debugobjects.c:485 debug_print_object+0x180/0x240 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 699 Comm: syz-executor.5 Tainted: G S 5.9.0-rc8+ #101 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x4d8 show_stack+0x34/0x48 dump_stack+0x174/0x1f8 panic+0x360/0x7a0 __warn+0x244/0x2ec report_bug+0x240/0x398 bug_handler+0x50/0xc0 call_break_hook+0x160/0x1d8 brk_handler+0x30/0xc0 do_debug_exception+0x184/0x340 el1_dbg+0x48/0xb0 el1_sync_handler+0x170/0x1c8 el1_sync+0x80/0x100 debug_print_object+0x180/0x240 debug_check_no_obj_freed+0x200/0x430 slab_free_freelist_hook+0x190/0x210 kfree+0x13c/0x460 f2fs_put_super+0x624/0xa58 generic_shutdown_super+0x120/0x300 kill_block_super+0x94/0xf8 kill_f2fs_super+0x244/0x308 deactivate_locked_super+0x104/0x150 deactivate_super+0x118/0x148 cleanup_mnt+0x27c/0x3c0 __cleanup_mnt+0x28/0x38 task_work_run+0x10c/0x248 do_notify_resume+0x9d4/0x1188 work_pending+0x8/0x34c Like the error handling for f2fs_register_sysfs(), we need to wait for the kobject to be destroyed before returning to prevent a potential use-after-free. Fixes: bf9e697ecd42 ("f2fs: expose features to sysfs entry") Cc: Jaegeuk Kim <jaegeuk@kernel.org> Cc: Chao Yu <chao@kernel.org> Signed-off-by: Jamie Iles <jamie@nuviainc.com> Reviewed-by: Chao Yu <yuchao0@huawei.com> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-10-12 16:09:48 +03:00
wait_for_completion(&sbi->s_kobj_unregister);
}