2018-09-12 09:16:07 +08:00
// SPDX-License-Identifier: GPL-2.0
2012-11-29 13:28:09 +09:00
/*
2012-11-02 17:09:44 +09:00
* fs / f2fs / file . c
*
* Copyright ( c ) 2012 Samsung Electronics Co . , Ltd .
* http : //www.samsung.com/
*/
# include <linux/fs.h>
# include <linux/f2fs_fs.h>
# include <linux/stat.h>
# include <linux/buffer_head.h>
# include <linux/writeback.h>
2013-03-16 11:13:04 +09:00
# include <linux/blkdev.h>
2012-11-02 17:09:44 +09:00
# include <linux/falloc.h>
# include <linux/types.h>
2013-02-04 23:41:41 +09:00
# include <linux/compat.h>
2012-11-02 17:09:44 +09:00
# include <linux/uaccess.h>
# include <linux/mount.h>
2014-04-28 18:12:36 +09:00
# include <linux/pagevec.h>
2017-01-13 13:12:29 -08:00
# include <linux/uio.h>
2016-05-20 17:01:00 -07:00
# include <linux/uuid.h>
2016-07-08 15:16:47 -07:00
# include <linux/file.h>
2019-07-17 17:06:11 +08:00
# include <linux/nls.h>
2020-07-21 12:21:11 +09:00
# include <linux/sched/signal.h>
2021-04-07 14:36:43 +02:00
# include <linux/fileattr.h>
2021-08-02 21:22:45 -07:00
# include <linux/fadvise.h>
2021-07-23 00:59:21 -07:00
# include <linux/iomap.h>
2012-11-02 17:09:44 +09:00
# include "f2fs.h"
# include "node.h"
# include "segment.h"
# include "xattr.h"
# include "acl.h"
2015-07-10 18:08:10 +08:00
# include "gc.h"
2021-08-19 20:52:28 -07:00
# include "iostat.h"
2013-04-20 01:28:40 +09:00
# include <trace/events/f2fs.h>
2020-11-02 14:21:31 +08:00
# include <uapi/linux/f2fs.h>
2012-11-02 17:09:44 +09:00
2018-04-15 01:40:02 +05:30
static vm_fault_t f2fs_filemap_fault ( struct vm_fault * vmf )
2017-05-18 11:06:45 +08:00
{
struct inode * inode = file_inode ( vmf - > vma - > vm_file ) ;
2018-04-15 01:40:02 +05:30
vm_fault_t ret ;
2017-05-18 11:06:45 +08:00
2018-04-15 01:40:02 +05:30
ret = filemap_fault ( vmf ) ;
2020-04-16 18:16:56 +08:00
if ( ! ret )
f2fs_update_iostat ( F2FS_I_SB ( inode ) , APP_MAPPED_READ_IO ,
F2FS_BLKSIZE ) ;
2019-04-15 15:22:19 +08:00
trace_f2fs_filemap_fault ( inode , vmf - > pgoff , ( unsigned long ) ret ) ;
2018-04-15 01:40:02 +05:30
return ret ;
2017-05-18 11:06:45 +08:00
}
2018-04-15 01:40:02 +05:30
static vm_fault_t f2fs_vm_page_mkwrite ( struct vm_fault * vmf )
2012-11-02 17:09:44 +09:00
{
struct page * page = vmf - > page ;
2017-02-24 14:56:41 -08:00
struct inode * inode = file_inode ( vmf - > vma - > vm_file ) ;
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2019-12-03 15:53:16 -08:00
struct dnode_of_data dn ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
bool need_alloc = true ;
int err = 0 ;
2012-11-02 17:09:44 +09:00
2020-12-26 18:07:01 +08:00
if ( unlikely ( IS_IMMUTABLE ( inode ) ) )
return VM_FAULT_SIGBUS ;
2021-05-25 11:39:35 -07:00
if ( is_inode_flag_set ( inode , FI_COMPRESS_RELEASED ) )
return VM_FAULT_SIGBUS ;
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( sbi ) ) ) {
err = - EIO ;
goto err ;
}
2019-08-23 17:58:36 +08:00
if ( ! f2fs_is_checkpoint_ready ( sbi ) ) {
err = - ENOSPC ;
2019-07-22 17:57:05 +08:00
goto err ;
2019-08-23 17:58:36 +08:00
}
2017-10-23 23:48:49 +02:00
2021-02-02 16:03:11 +08:00
err = f2fs_convert_inline_inode ( inode ) ;
if ( err )
goto err ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
# ifdef CONFIG_F2FS_FS_COMPRESSION
if ( f2fs_compressed_file ( inode ) ) {
int ret = f2fs_is_compressed_cluster ( inode , page - > index ) ;
if ( ret < 0 ) {
err = ret ;
goto err ;
} else if ( ret ) {
need_alloc = false ;
}
}
# endif
2019-12-03 15:53:16 -08:00
/* should do out of any locked page */
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( need_alloc )
f2fs_balance_fs ( sbi , true ) ;
2019-12-03 15:53:16 -08:00
2012-11-02 17:09:44 +09:00
sb_start_pagefault ( inode - > i_sb ) ;
2014-10-23 19:48:09 -07:00
f2fs_bug_on ( sbi , f2fs_has_inline_data ( inode ) ) ;
2014-08-07 16:32:25 -07:00
2017-02-24 14:56:41 -08:00
file_update_time ( vmf - > vma - > vm_file ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock_shared ( inode - > i_mapping ) ;
2012-11-02 17:09:44 +09:00
lock_page ( page ) ;
2013-12-06 15:00:58 +09:00
if ( unlikely ( page - > mapping ! = inode - > i_mapping | |
2013-04-28 09:04:18 +09:00
page_offset ( page ) > i_size_read ( inode ) | |
2013-12-06 15:00:58 +09:00
! PageUptodate ( page ) ) ) {
2012-11-02 17:09:44 +09:00
unlock_page ( page ) ;
err = - EFAULT ;
2017-05-18 11:06:45 +08:00
goto out_sem ;
2012-11-02 17:09:44 +09:00
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( need_alloc ) {
/* block allocation */
2020-06-18 14:36:22 +08:00
f2fs_do_map_lock ( sbi , F2FS_GET_BLOCK_PRE_AIO , true ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
err = f2fs_get_block ( & dn , page - > index ) ;
2020-06-18 14:36:22 +08:00
f2fs_do_map_lock ( sbi , F2FS_GET_BLOCK_PRE_AIO , false ) ;
2018-09-27 18:33:18 +08:00
}
2020-03-02 17:34:27 +08:00
# ifdef CONFIG_F2FS_FS_COMPRESSION
if ( ! need_alloc ) {
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
err = f2fs_get_dnode_of_data ( & dn , page - > index , LOOKUP_NODE ) ;
f2fs_put_dnode ( & dn ) ;
}
# endif
if ( err ) {
unlock_page ( page ) ;
goto out_sem ;
2018-09-27 18:33:18 +08:00
}
2018-12-25 17:43:42 +08:00
f2fs_wait_on_page_writeback ( page , DATA , false , true ) ;
2018-09-27 18:33:18 +08:00
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback ( inode , dn . data_blkaddr ) ;
2012-11-02 17:09:44 +09:00
/*
* check to see if the page is mapped already ( no holes )
*/
if ( PageMappedToDisk ( page ) )
2018-09-27 18:33:18 +08:00
goto out_sem ;
2012-11-02 17:09:44 +09:00
/* page is wholly or partially inside EOF */
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
if ( ( ( loff_t ) ( page - > index + 1 ) < < PAGE_SHIFT ) >
2015-09-11 14:43:52 +08:00
i_size_read ( inode ) ) {
2018-05-30 04:21:14 +09:00
loff_t offset ;
2018-05-30 04:33:07 +09:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
offset = i_size_read ( inode ) & ~ PAGE_MASK ;
zero_user_segment ( page , offset , PAGE_SIZE ) ;
2012-11-02 17:09:44 +09:00
}
set_page_dirty ( page ) ;
2016-06-30 18:49:15 -07:00
if ( ! PageUptodate ( page ) )
SetPageUptodate ( page ) ;
2012-11-02 17:09:44 +09:00
2017-08-02 23:21:48 +08:00
f2fs_update_iostat ( sbi , APP_MAPPED_IO , F2FS_BLKSIZE ) ;
2018-10-05 10:47:39 +05:30
f2fs_update_time ( sbi , REQ_TIME ) ;
2017-08-02 23:21:48 +08:00
2013-10-25 14:26:31 +09:00
trace_f2fs_vm_page_mkwrite ( page , DATA ) ;
2017-05-18 11:06:45 +08:00
out_sem :
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock_shared ( inode - > i_mapping ) ;
2018-09-27 18:33:18 +08:00
2012-11-02 17:09:44 +09:00
sb_end_pagefault ( inode - > i_sb ) ;
2017-10-23 23:48:49 +02:00
err :
2012-11-02 17:09:44 +09:00
return block_page_mkwrite_return ( err ) ;
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
2017-05-18 11:06:45 +08:00
. fault = f2fs_filemap_fault ,
2014-04-07 15:37:19 -07:00
. map_pages = filemap_map_pages ,
2013-01-17 18:37:41 +09:00
. page_mkwrite = f2fs_vm_page_mkwrite ,
2012-11-02 17:09:44 +09:00
} ;
2013-06-14 08:52:35 +09:00
static int get_parent_ino ( struct inode * inode , nid_t * pino )
{
struct dentry * dentry ;
2020-05-05 11:41:11 -07:00
/*
* Make sure to get the non - deleted alias . The alias associated with
* the open file descriptor being fsync ( ) ' ed may be deleted already .
*/
dentry = d_find_alias ( inode ) ;
2013-06-14 08:52:35 +09:00
if ( ! dentry )
return 0 ;
2013-07-22 22:12:56 +09:00
* pino = parent_ino ( dentry ) ;
dput ( dentry ) ;
2013-06-14 08:52:35 +09:00
return 1 ;
}
2017-11-06 22:51:45 +08:00
static inline enum cp_reason_type need_do_checkpoint ( struct inode * inode )
2014-08-20 18:37:35 +08:00
{
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2017-11-06 22:51:45 +08:00
enum cp_reason_type cp_reason = CP_NO_NEEDED ;
2014-08-20 18:37:35 +08:00
2017-11-06 22:51:45 +08:00
if ( ! S_ISREG ( inode - > i_mode ) )
cp_reason = CP_NON_REGULAR ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
else if ( f2fs_compressed_file ( inode ) )
cp_reason = CP_COMPRESSED ;
2017-11-06 22:51:45 +08:00
else if ( inode - > i_nlink ! = 1 )
cp_reason = CP_HARDLINK ;
2016-08-29 18:23:45 -07:00
else if ( is_sbi_flag_set ( sbi , SBI_NEED_CP ) )
2017-11-06 22:51:45 +08:00
cp_reason = CP_SB_NEED_CP ;
2014-08-20 18:37:35 +08:00
else if ( file_wrong_pino ( inode ) )
2017-11-06 22:51:45 +08:00
cp_reason = CP_WRONG_PINO ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
else if ( ! f2fs_space_for_roll_forward ( sbi ) )
2017-11-06 22:51:45 +08:00
cp_reason = CP_NO_SPC_ROLL ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
else if ( ! f2fs_is_checkpointed_node ( sbi , F2FS_I ( inode ) - > i_pino ) )
2017-11-06 22:51:45 +08:00
cp_reason = CP_NODE_NEED_CP ;
2014-10-30 22:47:03 -07:00
else if ( test_opt ( sbi , FASTBOOT ) )
2017-11-06 22:51:45 +08:00
cp_reason = CP_FASTBOOT_MODE ;
2018-03-08 14:22:56 +08:00
else if ( F2FS_OPTION ( sbi ) . active_logs = = 2 )
2017-11-06 22:51:45 +08:00
cp_reason = CP_SPEC_LOG_NUM ;
2018-03-08 14:22:56 +08:00
else if ( F2FS_OPTION ( sbi ) . fsync_mode = = FSYNC_MODE_STRICT & &
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_need_dentry_mark ( sbi , inode - > i_ino ) & &
f2fs_exist_written_data ( sbi , F2FS_I ( inode ) - > i_pino ,
TRANS_DIR_INO ) )
2017-12-28 08:09:44 -08:00
cp_reason = CP_RECOVER_DIR ;
2014-08-20 18:37:35 +08:00
2017-11-06 22:51:45 +08:00
return cp_reason ;
2014-08-20 18:37:35 +08:00
}
2014-12-08 15:29:40 +09:00
static bool need_inode_page_update ( struct f2fs_sb_info * sbi , nid_t ino )
{
struct page * i = find_get_page ( NODE_MAPPING ( sbi ) , ino ) ;
bool ret = false ;
/* But we need to avoid that there are some inode updates */
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
if ( ( i & & PageDirty ( i ) ) | | f2fs_need_inode_block_update ( sbi , ino ) )
2014-12-08 15:29:40 +09:00
ret = true ;
f2fs_put_page ( i , 0 ) ;
return ret ;
}
2014-12-08 15:29:41 +09:00
static void try_to_fix_pino ( struct inode * inode )
{
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
nid_t pino ;
down_write ( & fi - > i_sem ) ;
if ( file_wrong_pino ( inode ) & & inode - > i_nlink = = 1 & &
get_parent_ino ( inode , & pino ) ) {
2016-05-20 09:52:20 -07:00
f2fs_i_pino_write ( inode , pino ) ;
2014-12-08 15:29:41 +09:00
file_got_pino ( inode ) ;
}
2016-05-20 16:32:49 -07:00
up_write ( & fi - > i_sem ) ;
2014-12-08 15:29:41 +09:00
}
2016-04-15 09:43:17 -07:00
static int f2fs_do_sync_file ( struct file * file , loff_t start , loff_t end ,
int datasync , bool atomic )
2012-11-02 17:09:44 +09:00
{
struct inode * inode = file - > f_mapping - > host ;
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2014-09-10 14:58:18 -07:00
nid_t ino = inode - > i_ino ;
2012-11-02 17:09:44 +09:00
int ret = 0 ;
2017-11-06 22:51:45 +08:00
enum cp_reason_type cp_reason = 0 ;
2012-11-02 17:09:44 +09:00
struct writeback_control wbc = {
2014-03-03 11:28:40 +09:00
. sync_mode = WB_SYNC_ALL ,
2012-11-02 17:09:44 +09:00
. nr_to_write = LONG_MAX ,
. for_reclaim = 0 ,
} ;
f2fs: fix to avoid broken of dnode block list
f2fs recovery flow is relying on dnode block link list, it means fsynced
file recovery depends on previous dnode's persistence in the list, so
during fsync() we should wait on all regular inode's dnode writebacked
before issuing flush.
By this way, we can avoid dnode block list being broken by out-of-order
IO submission due to IO scheduler or driver.
Sheng Yong helps to do the test with this patch:
Target:/data (f2fs, -)
64MB / 32768KB / 4KB / 8
1 / PERSIST / Index
Base:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08
2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7
3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48
Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333
After:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 798.81 202.5 41143 40613.87 602.71 838.08 913.83
2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27
3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91
Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333
Patched/Original:
0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189
It looks like atomic write will suffer performance regression.
I suspect that the criminal is that we forcing to wait all dnode being in
storage cache before we issue PREFLUSH+FUA.
BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
cause the problem: we will lose data of last transaction after SPO, even if
atomic write return no error:
- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
- writeback data: P1, P2, P3;
- writeback node: N1, N2, N3; <--- If N1, N2 is not writebacked, N3 with fsync_mark is
writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
last transaction.
- preflush + fua;
- power-cut
If we don't wait dnode writeback for atomic_write:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85
2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77
3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92
Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18
Patched/Original:
0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294
SQLite's performance recovers.
Jaegeuk:
"Practically, I don't see db corruption becase of this. We can excuse to lose
the last transaction."
Finally, we decide to keep original implementation of atomic write interface
sematics that we don't wait all dnode writeback before preflush+fua submission.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-02 23:03:19 +08:00
unsigned int seq_id = 0 ;
2012-11-02 17:09:44 +09:00
2021-08-19 14:00:57 -07:00
if ( unlikely ( f2fs_readonly ( inode - > i_sb ) ) )
2012-12-01 10:56:01 +09:00
return 0 ;
2013-04-20 01:28:40 +09:00
trace_f2fs_sync_file_enter ( inode ) ;
2014-07-24 19:11:43 -07:00
2018-11-06 10:25:29 +08:00
if ( S_ISDIR ( inode - > i_mode ) )
goto go_write ;
2014-07-24 19:11:43 -07:00
/* if fdatasync is triggered, let's do in-place-update */
2015-12-31 13:49:17 -08:00
if ( datasync | | get_dirty_pages ( inode ) < = SM_I ( sbi ) - > min_fsync_blocks )
2016-05-20 10:13:22 -07:00
set_inode_flag ( inode , FI_NEED_IPU ) ;
2017-07-07 15:20:52 -04:00
ret = file_write_and_wait_range ( file , start , end ) ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_NEED_IPU ) ;
2014-09-10 16:53:02 -07:00
2021-08-19 14:00:57 -07:00
if ( ret | | is_sbi_flag_set ( sbi , SBI_CP_DISABLED ) ) {
2017-11-06 22:51:45 +08:00
trace_f2fs_sync_file_exit ( inode , cp_reason , datasync , ret ) ;
2012-11-02 17:09:44 +09:00
return ret ;
2013-04-20 01:28:40 +09:00
}
2012-11-02 17:09:44 +09:00
2014-12-08 15:29:40 +09:00
/* if the inode is dirty, let's recover all the time */
2016-11-17 20:53:31 +08:00
if ( ! f2fs_skip_inode_update ( inode , datasync ) ) {
2015-08-15 21:51:05 -07:00
f2fs_write_inode ( inode , NULL ) ;
2014-12-08 15:29:40 +09:00
goto go_write ;
}
2014-07-24 19:08:02 -07:00
/*
* if there is no written data , don ' t waste time to write recovery info .
*/
2016-05-20 10:13:22 -07:00
if ( ! is_inode_flag_set ( inode , FI_APPEND_WRITE ) & &
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
! f2fs_exist_written_data ( sbi , ino , APPEND_INO ) ) {
2014-09-10 15:04:03 -07:00
2014-12-08 15:29:40 +09:00
/* it may call write_inode just prior to fsync */
if ( need_inode_page_update ( sbi , ino ) )
2014-09-10 15:04:03 -07:00
goto go_write ;
2016-05-20 10:13:22 -07:00
if ( is_inode_flag_set ( inode , FI_UPDATE_WRITE ) | |
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_exist_written_data ( sbi , ino , UPDATE_INO ) )
2014-07-24 19:08:02 -07:00
goto flush_out ;
goto out ;
f2fs: fix to force keeping write barrier for strict fsync mode
[1] https://www.mail-archive.com/linux-f2fs-devel@lists.sourceforge.net/msg15126.html
As [1] reported, if lower device doesn't support write barrier, in below
case:
- write page #0; persist
- overwrite page #0
- fsync
- write data page #0 OPU into device's cache
- write inode page into device's cache
- issue flush
If SPO is triggered during flush command, inode page can be persisted
before data page #0, so that after recovery, inode page can be recovered
with new physical block address of data page #0, however there may
contains dummy data in new physical block address.
Then what user will see is: after overwrite & fsync + SPO, old data in
file was corrupted, if any user do care about such case, we can suggest
user to use STRICT fsync mode, in this mode, we will force to use atomic
write sematics to keep write order in between data/node and last node,
so that it avoids potential data corruption during fsync().
Signed-off-by: Chao Yu <chao@kernel.org>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-07-20 09:03:29 +08:00
} else {
/*
* for OPU case , during fsync ( ) , node can be persisted before
* data when lower device doesn ' t support write barrier , result
* in data corruption after SPO .
* So for strict fsync mode , force to use atomic write sematics
* to keep write order in between data / node and last node to
* avoid potential data corruption .
*/
if ( F2FS_OPTION ( sbi ) . fsync_mode = =
FSYNC_MODE_STRICT & & ! atomic )
atomic = true ;
2014-07-24 19:08:02 -07:00
}
2014-09-10 15:04:03 -07:00
go_write :
2013-07-03 10:55:52 +09:00
/*
* Both of fdatasync ( ) and fsync ( ) are able to be recovered from
* sudden - power - off .
*/
2016-05-20 10:13:22 -07:00
down_read ( & F2FS_I ( inode ) - > i_sem ) ;
2017-11-06 22:51:45 +08:00
cp_reason = need_do_checkpoint ( inode ) ;
2016-05-20 10:13:22 -07:00
up_read ( & F2FS_I ( inode ) - > i_sem ) ;
2014-03-20 19:10:08 +09:00
2017-11-06 22:51:45 +08:00
if ( cp_reason ) {
2012-11-02 17:09:44 +09:00
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs ( inode - > i_sb , 1 ) ;
2014-03-20 19:10:08 +09:00
2014-12-08 15:29:41 +09:00
/*
* We ' ve secured consistency through sync_fs . Following pino
* will be used only for fsynced inodes after checkpoint .
*/
try_to_fix_pino ( inode ) ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_APPEND_WRITE ) ;
clear_inode_flag ( inode , FI_UPDATE_WRITE ) ;
2014-12-08 15:29:41 +09:00
goto out ;
}
f2fs: fix conditions to remain recovery information in f2fs_sync_file
This patch revisited whole the recovery information during the f2fs_sync_file.
In this patch, there are three information to make a decision.
a) IS_CHECKPOINTED, /* is it checkpointed before? */
b) HAS_FSYNCED_INODE, /* is the inode fsynced before? */
c) HAS_LAST_FSYNC, /* has the latest node fsync mark? */
And, the scenarios for our rule are based on:
[Term] F: fsync_mark, D: dentry_mark
1. inode(x) | CP | inode(x) | dnode(F)
2. inode(x) | CP | inode(F) | dnode(F)
3. inode(x) | CP | dnode(F) | inode(x) | inode(F)
4. inode(x) | CP | dnode(F) | inode(F)
5. CP | inode(x) | dnode(F) | inode(DF)
6. CP | inode(DF) | dnode(F)
7. CP | dnode(F) | inode(DF)
8. CP | dnode(F) | inode(x) | inode(DF)
For example, #3, the three conditions should be changed as follows.
inode(x) | CP | dnode(F) | inode(x) | inode(F)
a) x o o o o
b) x x x x o
c) x o o x o
If f2fs_sync_file stops ------^,
it should write inode(F) --------------^
So, the need_inode_block_update should return true, since
c) get_nat_flag(e, HAS_LAST_FSYNC), is false.
For example, #8,
CP | alloc | dnode(F) | inode(x) | inode(DF)
a) o x x x x
b) x x x o
c) o o x o
If f2fs_sync_file stops -------^,
it should write inode(DF) --------------^
Note that, the roll-forward policy should follow this rule, which means,
if there are any missing blocks, we doesn't need to recover that inode.
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-15 14:50:48 -07:00
sync_nodes :
2018-06-04 23:20:36 +08:00
atomic_inc ( & sbi - > wb_sync_req [ NODE ] ) ;
f2fs: fix to avoid broken of dnode block list
f2fs recovery flow is relying on dnode block link list, it means fsynced
file recovery depends on previous dnode's persistence in the list, so
during fsync() we should wait on all regular inode's dnode writebacked
before issuing flush.
By this way, we can avoid dnode block list being broken by out-of-order
IO submission due to IO scheduler or driver.
Sheng Yong helps to do the test with this patch:
Target:/data (f2fs, -)
64MB / 32768KB / 4KB / 8
1 / PERSIST / Index
Base:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08
2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7
3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48
Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333
After:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 798.81 202.5 41143 40613.87 602.71 838.08 913.83
2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27
3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91
Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333
Patched/Original:
0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189
It looks like atomic write will suffer performance regression.
I suspect that the criminal is that we forcing to wait all dnode being in
storage cache before we issue PREFLUSH+FUA.
BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
cause the problem: we will lose data of last transaction after SPO, even if
atomic write return no error:
- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
- writeback data: P1, P2, P3;
- writeback node: N1, N2, N3; <--- If N1, N2 is not writebacked, N3 with fsync_mark is
writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
last transaction.
- preflush + fua;
- power-cut
If we don't wait dnode writeback for atomic_write:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85
2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77
3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92
Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18
Patched/Original:
0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294
SQLite's performance recovers.
Jaegeuk:
"Practically, I don't see db corruption becase of this. We can excuse to lose
the last transaction."
Finally, we decide to keep original implementation of atomic write interface
sematics that we don't wait all dnode writeback before preflush+fua submission.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-02 23:03:19 +08:00
ret = f2fs_fsync_node_pages ( sbi , inode , & wbc , atomic , & seq_id ) ;
2018-06-04 23:20:36 +08:00
atomic_dec ( & sbi - > wb_sync_req [ NODE ] ) ;
2016-04-15 09:25:04 -07:00
if ( ret )
goto out ;
2014-12-08 15:29:41 +09:00
2015-01-09 16:27:17 -08:00
/* if cp_error was enabled, we should avoid infinite loop */
2015-12-24 18:04:56 +08:00
if ( unlikely ( f2fs_cp_error ( sbi ) ) ) {
ret = - EIO ;
2015-01-09 16:27:17 -08:00
goto out ;
2015-12-24 18:04:56 +08:00
}
2015-01-09 16:27:17 -08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
if ( f2fs_need_inode_block_update ( sbi , ino ) ) {
2016-10-14 11:51:23 -07:00
f2fs_mark_inode_dirty_sync ( inode , true ) ;
2014-12-08 15:29:41 +09:00
f2fs_write_inode ( inode , NULL ) ;
goto sync_nodes ;
2012-11-02 17:09:44 +09:00
}
2014-12-08 15:29:41 +09:00
2017-07-28 02:29:12 -07:00
/*
* If it ' s atomic_write , it ' s just fine to keep write ordering . So
* here we don ' t need to wait for node write completion , since we use
* node chain which serializes node blocks . If one of node writes are
* reordered , we can see simply broken chain , resulting in stopping
* roll - forward recovery . It means we ' ll recover all or none node blocks
* given fsync mark .
*/
if ( ! atomic ) {
f2fs: fix to avoid broken of dnode block list
f2fs recovery flow is relying on dnode block link list, it means fsynced
file recovery depends on previous dnode's persistence in the list, so
during fsync() we should wait on all regular inode's dnode writebacked
before issuing flush.
By this way, we can avoid dnode block list being broken by out-of-order
IO submission due to IO scheduler or driver.
Sheng Yong helps to do the test with this patch:
Target:/data (f2fs, -)
64MB / 32768KB / 4KB / 8
1 / PERSIST / Index
Base:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08
2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7
3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48
Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333
After:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 798.81 202.5 41143 40613.87 602.71 838.08 913.83
2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27
3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91
Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333
Patched/Original:
0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189
It looks like atomic write will suffer performance regression.
I suspect that the criminal is that we forcing to wait all dnode being in
storage cache before we issue PREFLUSH+FUA.
BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
cause the problem: we will lose data of last transaction after SPO, even if
atomic write return no error:
- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
- writeback data: P1, P2, P3;
- writeback node: N1, N2, N3; <--- If N1, N2 is not writebacked, N3 with fsync_mark is
writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
last transaction.
- preflush + fua;
- power-cut
If we don't wait dnode writeback for atomic_write:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85
2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77
3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92
Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18
Patched/Original:
0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294
SQLite's performance recovers.
Jaegeuk:
"Practically, I don't see db corruption becase of this. We can excuse to lose
the last transaction."
Finally, we decide to keep original implementation of atomic write interface
sematics that we don't wait all dnode writeback before preflush+fua submission.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-02 23:03:19 +08:00
ret = f2fs_wait_on_node_pages_writeback ( sbi , seq_id ) ;
2017-07-28 02:29:12 -07:00
if ( ret )
goto out ;
}
2014-12-08 15:29:41 +09:00
/* once recovery info is written, don't need to tack this */
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_remove_ino_entry ( sbi , ino , APPEND_INO ) ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_APPEND_WRITE ) ;
2014-12-08 15:29:41 +09:00
flush_out :
2018-05-25 18:02:58 -07:00
if ( ! atomic & & F2FS_OPTION ( sbi ) . fsync_mode ! = FSYNC_MODE_NOBARRIER )
2017-09-29 13:59:38 +08:00
ret = f2fs_issue_flush ( sbi , inode - > i_ino ) ;
2017-09-29 13:59:36 +08:00
if ( ! ret ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_remove_ino_entry ( sbi , ino , UPDATE_INO ) ;
2017-09-29 13:59:36 +08:00
clear_inode_flag ( inode , FI_UPDATE_WRITE ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_remove_ino_entry ( sbi , ino , FLUSH_INO ) ;
2017-09-29 13:59:36 +08:00
}
2016-01-08 16:57:48 -08:00
f2fs_update_time ( sbi , REQ_TIME ) ;
2012-11-02 17:09:44 +09:00
out :
2017-11-06 22:51:45 +08:00
trace_f2fs_sync_file_exit ( inode , cp_reason , datasync , ret ) ;
2012-11-02 17:09:44 +09:00
return ret ;
}
2016-04-15 09:43:17 -07:00
int f2fs_sync_file ( struct file * file , loff_t start , loff_t end , int datasync )
{
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( file_inode ( file ) ) ) ) )
return - EIO ;
2016-04-15 09:43:17 -07:00
return f2fs_do_sync_file ( file , start , end , datasync , false ) ;
}
2020-08-24 22:48:41 +01:00
static bool __found_offset ( struct address_space * mapping , block_t blkaddr ,
pgoff_t index , int whence )
2014-04-28 18:12:36 +09:00
{
switch ( whence ) {
case SEEK_DATA :
2020-08-24 22:48:41 +01:00
if ( __is_valid_data_blkaddr ( blkaddr ) )
return true ;
if ( blkaddr = = NEW_ADDR & &
xa_get_mark ( & mapping - > i_pages , index , PAGECACHE_TAG_DIRTY ) )
2014-04-28 18:12:36 +09:00
return true ;
break ;
case SEEK_HOLE :
if ( blkaddr = = NULL_ADDR )
return true ;
break ;
}
return false ;
}
2014-04-23 14:10:24 +08:00
static loff_t f2fs_seek_block ( struct file * file , loff_t offset , int whence )
{
struct inode * inode = file - > f_mapping - > host ;
loff_t maxbytes = inode - > i_sb - > s_maxbytes ;
struct dnode_of_data dn ;
2020-08-24 22:48:41 +01:00
pgoff_t pgofs , end_offset ;
2014-04-28 18:12:36 +09:00
loff_t data_ofs = offset ;
loff_t isize ;
2014-04-23 14:10:24 +08:00
int err = 0 ;
2016-01-22 15:40:57 -05:00
inode_lock ( inode ) ;
2014-04-23 14:10:24 +08:00
isize = i_size_read ( inode ) ;
if ( offset > = isize )
goto fail ;
/* handle inline data case */
2020-11-02 17:36:58 +08:00
if ( f2fs_has_inline_data ( inode ) ) {
if ( whence = = SEEK_HOLE ) {
data_ofs = isize ;
goto found ;
} else if ( whence = = SEEK_DATA ) {
data_ofs = offset ;
goto found ;
}
2014-04-23 14:10:24 +08:00
}
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pgofs = ( pgoff_t ) ( offset > > PAGE_SHIFT ) ;
2014-04-23 14:10:24 +08:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
for ( ; data_ofs < isize ; data_ofs = ( loff_t ) pgofs < < PAGE_SHIFT ) {
2014-04-23 14:10:24 +08:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
err = f2fs_get_dnode_of_data ( & dn , pgofs , LOOKUP_NODE ) ;
2014-04-23 14:10:24 +08:00
if ( err & & err ! = - ENOENT ) {
goto fail ;
} else if ( err = = - ENOENT ) {
2014-08-06 23:22:50 +09:00
/* direct node does not exists */
2014-04-23 14:10:24 +08:00
if ( whence = = SEEK_DATA ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
pgofs = f2fs_get_next_page_offset ( & dn , pgofs ) ;
2014-04-23 14:10:24 +08:00
continue ;
} else {
goto found ;
}
}
2016-01-26 15:39:35 +08:00
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
2014-04-23 14:10:24 +08:00
/* find data/hole in dnode block */
for ( ; dn . ofs_in_node < end_offset ;
dn . ofs_in_node + + , pgofs + + ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
data_ofs = ( loff_t ) pgofs < < PAGE_SHIFT ) {
2014-04-23 14:10:24 +08:00
block_t blkaddr ;
2018-05-30 04:33:07 +09:00
2020-02-14 17:44:10 +08:00
blkaddr = f2fs_data_blkaddr ( & dn ) ;
2014-04-23 14:10:24 +08:00
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
if ( __is_valid_data_blkaddr ( blkaddr ) & &
! f2fs_is_valid_blkaddr ( F2FS_I_SB ( inode ) ,
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
blkaddr , DATA_GENERIC_ENHANCE ) ) {
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
f2fs_put_dnode ( & dn ) ;
goto fail ;
}
2020-08-24 22:48:41 +01:00
if ( __found_offset ( file - > f_mapping , blkaddr ,
2018-06-05 17:44:11 +08:00
pgofs , whence ) ) {
2014-04-23 14:10:24 +08:00
f2fs_put_dnode ( & dn ) ;
goto found ;
}
}
f2fs_put_dnode ( & dn ) ;
}
if ( whence = = SEEK_DATA )
goto fail ;
found :
2014-04-28 17:02:48 +09:00
if ( whence = = SEEK_HOLE & & data_ofs > isize )
data_ofs = isize ;
2016-01-22 15:40:57 -05:00
inode_unlock ( inode ) ;
2014-04-23 14:10:24 +08:00
return vfs_setpos ( file , data_ofs , maxbytes ) ;
fail :
2016-01-22 15:40:57 -05:00
inode_unlock ( inode ) ;
2014-04-23 14:10:24 +08:00
return - ENXIO ;
}
static loff_t f2fs_llseek ( struct file * file , loff_t offset , int whence )
{
struct inode * inode = file - > f_mapping - > host ;
loff_t maxbytes = inode - > i_sb - > s_maxbytes ;
2021-01-13 13:21:54 +08:00
if ( f2fs_compressed_file ( inode ) )
maxbytes = max_file_blocks ( inode ) < < F2FS_BLKSIZE_BITS ;
2014-04-23 14:10:24 +08:00
switch ( whence ) {
case SEEK_SET :
case SEEK_CUR :
case SEEK_END :
return generic_file_llseek_size ( file , offset , whence ,
maxbytes , i_size_read ( inode ) ) ;
case SEEK_DATA :
case SEEK_HOLE :
2014-09-08 10:59:43 -07:00
if ( offset < 0 )
return - ENXIO ;
2014-04-23 14:10:24 +08:00
return f2fs_seek_block ( file , offset , whence ) ;
}
return - EINVAL ;
}
2012-11-02 17:09:44 +09:00
static int f2fs_file_mmap ( struct file * file , struct vm_area_struct * vma )
{
2014-10-23 19:48:09 -07:00
struct inode * inode = file_inode ( file ) ;
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) )
return - EIO ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( ! f2fs_is_compress_backend_ready ( inode ) )
return - EOPNOTSUPP ;
2012-11-02 17:09:44 +09:00
file_accessed ( file ) ;
vma - > vm_ops = & f2fs_file_vm_ops ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
set_inode_flag ( inode , FI_MMAP_FILE ) ;
2012-11-02 17:09:44 +09:00
return 0 ;
}
2015-04-21 20:39:58 -07:00
static int f2fs_file_open ( struct inode * inode , struct file * filp )
{
2017-11-29 12:35:28 -08:00
int err = fscrypt_file_open ( inode , filp ) ;
2015-04-21 20:39:58 -07:00
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
if ( err )
return err ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( ! f2fs_is_compress_backend_ready ( inode ) )
return - EOPNOTSUPP ;
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
err = fsverity_file_open ( inode , filp ) ;
2017-11-29 12:35:28 -08:00
if ( err )
return err ;
2018-03-08 19:34:38 +09:00
filp - > f_mode | = FMODE_NOWAIT ;
2017-07-09 00:13:07 +08:00
return dquot_file_open ( inode , filp ) ;
2015-04-21 20:39:58 -07:00
}
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
void f2fs_truncate_data_blocks_range ( struct dnode_of_data * dn , int count )
2012-11-02 17:09:44 +09:00
{
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( dn - > inode ) ;
2012-11-02 17:09:44 +09:00
struct f2fs_node * raw_node ;
f2fs: update extent tree in batches
This patch introduce a new helper f2fs_update_extent_tree_range which can
do extent mapping update at a specified range.
The main idea is:
1) punch all mapping info in extent node(s) which are at a specified range;
2) try to merge new extent mapping with adjacent node, or failing that,
insert the mapping into extent tree as a new node.
In order to see the benefit, I add a function for stating time stamping
count as below:
uint64_t rdtsc(void)
{
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd.
truncation path: update extent cache from truncate_data_blocks_range
non-truncataion path: update extent cache from other paths
total: all update paths
a) Removing 128MB file which has one extent node mapping whole range of
file:
1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128
2. sync
3. rm /mnt/f2fs/128M
Before:
total count average
truncation: 7651022 32768 233.49
Patched:
total count average
truncation: 3321 33 100.64
b) fsstress:
fsstress -d /mnt/f2fs -l 5 -n 100 -p 20
Test times: 5 times.
Before:
total count average
truncation: 5812480.6 20911.6 277.95
non-truncation: 7783845.6 13440.8 579.12
total: 13596326.2 34352.4 395.79
Patched:
total count average
truncation: 1281283.0 3041.6 421.25
non-truncation: 7355844.4 13662.8 538.38
total: 8637127.4 16704.4 517.06
1) For the updates in truncation path:
- we can see updating in batches leads total tsc and update count reducing
explicitly;
- besides, for a single batched updating, punching multiple extent nodes
in a loop, result in executing more operations, so our average tsc
increase intensively.
2) For the updates in non-truncation path:
- there is a little improvement, that is because for the scenario that we
just need to update in the head or tail of extent node, new interface
optimize to update info in extent node directly, rather than removing
original extent node for updating and then inserting that updated one
into cache as new node.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-08-26 20:34:48 +08:00
int nr_free = 0 , ofs = dn - > ofs_in_node , len = count ;
2012-11-02 17:09:44 +09:00
__le32 * addr ;
2017-07-19 00:19:06 +08:00
int base = 0 ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
bool compressed_cluster = false ;
int cluster_index = 0 , valid_blocks = 0 ;
int cluster_size = F2FS_I ( dn - > inode ) - > i_cluster_size ;
2020-09-08 11:44:10 +09:00
bool released = ! atomic_read ( & F2FS_I ( dn - > inode ) - > i_compr_blocks ) ;
2017-07-19 00:19:06 +08:00
if ( IS_INODE ( dn - > node_page ) & & f2fs_has_extra_attr ( dn - > inode ) )
base = get_extra_isize ( dn - > inode ) ;
2012-11-02 17:09:44 +09:00
2013-07-15 17:57:38 +08:00
raw_node = F2FS_NODE ( dn - > node_page ) ;
2017-07-19 00:19:06 +08:00
addr = blkaddr_in_node ( raw_node ) + base + ofs ;
2012-11-02 17:09:44 +09:00
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
/* Assumption: truncateion starts with cluster */
for ( ; count > 0 ; count - - , addr + + , dn - > ofs_in_node + + , cluster_index + + ) {
2012-11-02 17:09:44 +09:00
block_t blkaddr = le32_to_cpu ( * addr ) ;
2018-05-30 04:33:07 +09:00
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( f2fs_compressed_file ( dn - > inode ) & &
! ( cluster_index & ( cluster_size - 1 ) ) ) {
if ( compressed_cluster )
f2fs_i_compr_blocks_update ( dn - > inode ,
valid_blocks , false ) ;
compressed_cluster = ( blkaddr = = COMPRESS_ADDR ) ;
valid_blocks = 0 ;
}
2012-11-02 17:09:44 +09:00
if ( blkaddr = = NULL_ADDR )
continue ;
2014-12-30 22:57:55 -08:00
dn - > data_blkaddr = NULL_ADDR ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_set_data_blkaddr ( dn ) ;
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( __is_valid_data_blkaddr ( blkaddr ) ) {
if ( ! f2fs_is_valid_blkaddr ( sbi , blkaddr ,
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
DATA_GENERIC_ENHANCE ) )
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
continue ;
if ( compressed_cluster )
valid_blocks + + ;
}
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
2015-03-17 17:16:35 -07:00
if ( dn - > ofs_in_node = = 0 & & IS_INODE ( dn - > node_page ) )
2016-05-20 10:13:22 -07:00
clear_inode_flag ( dn - > inode , FI_FIRST_BLOCK_WRITTEN ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
f2fs_invalidate_blocks ( sbi , blkaddr ) ;
2020-03-06 15:36:09 +08:00
if ( ! released | | blkaddr ! = COMPRESS_ADDR )
nr_free + + ;
2012-11-02 17:09:44 +09:00
}
f2fs: update extent tree in batches
This patch introduce a new helper f2fs_update_extent_tree_range which can
do extent mapping update at a specified range.
The main idea is:
1) punch all mapping info in extent node(s) which are at a specified range;
2) try to merge new extent mapping with adjacent node, or failing that,
insert the mapping into extent tree as a new node.
In order to see the benefit, I add a function for stating time stamping
count as below:
uint64_t rdtsc(void)
{
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd.
truncation path: update extent cache from truncate_data_blocks_range
non-truncataion path: update extent cache from other paths
total: all update paths
a) Removing 128MB file which has one extent node mapping whole range of
file:
1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128
2. sync
3. rm /mnt/f2fs/128M
Before:
total count average
truncation: 7651022 32768 233.49
Patched:
total count average
truncation: 3321 33 100.64
b) fsstress:
fsstress -d /mnt/f2fs -l 5 -n 100 -p 20
Test times: 5 times.
Before:
total count average
truncation: 5812480.6 20911.6 277.95
non-truncation: 7783845.6 13440.8 579.12
total: 13596326.2 34352.4 395.79
Patched:
total count average
truncation: 1281283.0 3041.6 421.25
non-truncation: 7355844.4 13662.8 538.38
total: 8637127.4 16704.4 517.06
1) For the updates in truncation path:
- we can see updating in batches leads total tsc and update count reducing
explicitly;
- besides, for a single batched updating, punching multiple extent nodes
in a loop, result in executing more operations, so our average tsc
increase intensively.
2) For the updates in non-truncation path:
- there is a little improvement, that is because for the scenario that we
just need to update in the head or tail of extent node, new interface
optimize to update info in extent node directly, rather than removing
original extent node for updating and then inserting that updated one
into cache as new node.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-08-26 20:34:48 +08:00
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( compressed_cluster )
f2fs_i_compr_blocks_update ( dn - > inode , valid_blocks , false ) ;
2012-11-02 17:09:44 +09:00
if ( nr_free ) {
f2fs: update extent tree in batches
This patch introduce a new helper f2fs_update_extent_tree_range which can
do extent mapping update at a specified range.
The main idea is:
1) punch all mapping info in extent node(s) which are at a specified range;
2) try to merge new extent mapping with adjacent node, or failing that,
insert the mapping into extent tree as a new node.
In order to see the benefit, I add a function for stating time stamping
count as below:
uint64_t rdtsc(void)
{
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd.
truncation path: update extent cache from truncate_data_blocks_range
non-truncataion path: update extent cache from other paths
total: all update paths
a) Removing 128MB file which has one extent node mapping whole range of
file:
1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128
2. sync
3. rm /mnt/f2fs/128M
Before:
total count average
truncation: 7651022 32768 233.49
Patched:
total count average
truncation: 3321 33 100.64
b) fsstress:
fsstress -d /mnt/f2fs -l 5 -n 100 -p 20
Test times: 5 times.
Before:
total count average
truncation: 5812480.6 20911.6 277.95
non-truncation: 7783845.6 13440.8 579.12
total: 13596326.2 34352.4 395.79
Patched:
total count average
truncation: 1281283.0 3041.6 421.25
non-truncation: 7355844.4 13662.8 538.38
total: 8637127.4 16704.4 517.06
1) For the updates in truncation path:
- we can see updating in batches leads total tsc and update count reducing
explicitly;
- besides, for a single batched updating, punching multiple extent nodes
in a loop, result in executing more operations, so our average tsc
increase intensively.
2) For the updates in non-truncation path:
- there is a little improvement, that is because for the scenario that we
just need to update in the head or tail of extent node, new interface
optimize to update info in extent node directly, rather than removing
original extent node for updating and then inserting that updated one
into cache as new node.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-08-26 20:34:48 +08:00
pgoff_t fofs ;
/*
* once we invalidate valid blkaddr in range [ ofs , ofs + count ] ,
* we will invalidate all blkaddr in the whole range .
*/
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
fofs = f2fs_start_bidx_of_node ( ofs_of_node ( dn - > node_page ) ,
2016-01-26 15:39:35 +08:00
dn - > inode ) + ofs ;
f2fs: update extent tree in batches
This patch introduce a new helper f2fs_update_extent_tree_range which can
do extent mapping update at a specified range.
The main idea is:
1) punch all mapping info in extent node(s) which are at a specified range;
2) try to merge new extent mapping with adjacent node, or failing that,
insert the mapping into extent tree as a new node.
In order to see the benefit, I add a function for stating time stamping
count as below:
uint64_t rdtsc(void)
{
uint32_t lo, hi;
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
My test environment is: ubuntu, intel i7-3770, 16G memory, 256g micron ssd.
truncation path: update extent cache from truncate_data_blocks_range
non-truncataion path: update extent cache from other paths
total: all update paths
a) Removing 128MB file which has one extent node mapping whole range of
file:
1. dd if=/dev/zero of=/mnt/f2fs/128M bs=1M count=128
2. sync
3. rm /mnt/f2fs/128M
Before:
total count average
truncation: 7651022 32768 233.49
Patched:
total count average
truncation: 3321 33 100.64
b) fsstress:
fsstress -d /mnt/f2fs -l 5 -n 100 -p 20
Test times: 5 times.
Before:
total count average
truncation: 5812480.6 20911.6 277.95
non-truncation: 7783845.6 13440.8 579.12
total: 13596326.2 34352.4 395.79
Patched:
total count average
truncation: 1281283.0 3041.6 421.25
non-truncation: 7355844.4 13662.8 538.38
total: 8637127.4 16704.4 517.06
1) For the updates in truncation path:
- we can see updating in batches leads total tsc and update count reducing
explicitly;
- besides, for a single batched updating, punching multiple extent nodes
in a loop, result in executing more operations, so our average tsc
increase intensively.
2) For the updates in non-truncation path:
- there is a little improvement, that is because for the scenario that we
just need to update in the head or tail of extent node, new interface
optimize to update info in extent node directly, rather than removing
original extent node for updating and then inserting that updated one
into cache as new node.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-08-26 20:34:48 +08:00
f2fs_update_extent_cache_range ( dn , fofs , 0 , len ) ;
2013-06-08 21:25:40 +09:00
dec_valid_block_count ( sbi , dn - > inode , nr_free ) ;
2012-11-02 17:09:44 +09:00
}
dn - > ofs_in_node = ofs ;
2013-04-20 01:28:52 +09:00
2016-01-08 16:57:48 -08:00
f2fs_update_time ( sbi , REQ_TIME ) ;
2013-04-20 01:28:52 +09:00
trace_f2fs_truncate_data_blocks_range ( dn - > inode , dn - > nid ,
dn - > ofs_in_node , nr_free ) ;
2012-11-02 17:09:44 +09:00
}
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
void f2fs_truncate_data_blocks ( struct dnode_of_data * dn )
2012-11-02 17:09:44 +09:00
{
2019-03-25 21:08:19 +08:00
f2fs_truncate_data_blocks_range ( dn , ADDRS_PER_BLOCK ( dn - > inode ) ) ;
2012-11-02 17:09:44 +09:00
}
2015-03-10 13:16:25 +08:00
static int truncate_partial_data_page ( struct inode * inode , u64 from ,
2015-04-30 17:00:33 -07:00
bool cache_only )
2012-11-02 17:09:44 +09:00
{
2018-05-30 04:21:14 +09:00
loff_t offset = from & ( PAGE_SIZE - 1 ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pgoff_t index = from > > PAGE_SHIFT ;
2015-04-30 17:00:33 -07:00
struct address_space * mapping = inode - > i_mapping ;
2012-11-02 17:09:44 +09:00
struct page * page ;
2015-04-30 17:00:33 -07:00
if ( ! offset & & ! cache_only )
2014-10-23 19:48:09 -07:00
return 0 ;
2012-11-02 17:09:44 +09:00
2015-04-30 17:00:33 -07:00
if ( cache_only ) {
2016-09-06 15:55:54 -07:00
page = find_lock_page ( mapping , index ) ;
2015-04-30 17:00:33 -07:00
if ( page & & PageUptodate ( page ) )
goto truncate_out ;
f2fs_put_page ( page , 1 ) ;
2014-10-23 19:48:09 -07:00
return 0 ;
2015-04-30 17:00:33 -07:00
}
2012-11-02 17:09:44 +09:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
page = f2fs_get_lock_data_page ( inode , index , true ) ;
2015-04-30 17:00:33 -07:00
if ( IS_ERR ( page ) )
2017-02-28 20:32:41 +08:00
return PTR_ERR ( page ) = = - ENOENT ? 0 : PTR_ERR ( page ) ;
2015-04-30 17:00:33 -07:00
truncate_out :
2018-12-25 17:43:42 +08:00
f2fs_wait_on_page_writeback ( page , DATA , true , true ) ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
zero_user ( page , offset , PAGE_SIZE - offset ) ;
2017-06-14 08:05:32 -07:00
/* An encrypted inode should have a key and truncate the last page. */
2018-12-12 15:20:11 +05:30
f2fs_bug_on ( F2FS_I_SB ( inode ) , cache_only & & IS_ENCRYPTED ( inode ) ) ;
2017-06-14 08:05:32 -07:00
if ( ! cache_only )
2015-03-10 13:16:25 +08:00
set_page_dirty ( page ) ;
2012-11-02 17:09:44 +09:00
f2fs_put_page ( page , 1 ) ;
2014-10-23 19:48:09 -07:00
return 0 ;
2012-11-02 17:09:44 +09:00
}
2020-03-18 16:22:59 +08:00
int f2fs_do_truncate_blocks ( struct inode * inode , u64 from , bool lock )
2012-11-02 17:09:44 +09:00
{
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 17:09:44 +09:00
struct dnode_of_data dn ;
pgoff_t free_from ;
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
int count = 0 , err = 0 ;
2014-10-23 19:48:09 -07:00
struct page * ipage ;
2015-03-10 13:16:25 +08:00
bool truncate_page = false ;
2012-11-02 17:09:44 +09:00
2013-04-20 01:28:52 +09:00
trace_f2fs_truncate_blocks_enter ( inode , from ) ;
2018-03-20 23:08:29 +08:00
free_from = ( pgoff_t ) F2FS_BLK_ALIGN ( from ) ;
2012-11-02 17:09:44 +09:00
2021-01-13 13:21:54 +08:00
if ( free_from > = max_file_blocks ( inode ) )
2016-05-05 19:13:03 +08:00
goto free_partial ;
2014-08-14 16:32:54 -07:00
if ( lock )
2019-02-02 17:33:01 +08:00
f2fs_lock_op ( sbi ) ;
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ipage = f2fs_get_node_page ( sbi , inode - > i_ino ) ;
2014-10-23 19:48:09 -07:00
if ( IS_ERR ( ipage ) ) {
err = PTR_ERR ( ipage ) ;
goto out ;
}
if ( f2fs_has_inline_data ( inode ) ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_truncate_inline_inode ( inode , ipage , from ) ;
2014-10-23 19:48:09 -07:00
f2fs_put_page ( ipage , 1 ) ;
2015-03-10 13:16:25 +08:00
truncate_page = true ;
2014-10-23 19:48:09 -07:00
goto out ;
}
set_new_dnode ( & dn , inode , ipage , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
err = f2fs_get_dnode_of_data ( & dn , free_from , LOOKUP_NODE_RA ) ;
2012-11-02 17:09:44 +09:00
if ( err ) {
if ( err = = - ENOENT )
goto free_next ;
2014-10-23 19:48:09 -07:00
goto out ;
2014-10-15 10:24:34 -07:00
}
2016-01-26 15:39:35 +08:00
count = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
2012-11-02 17:09:44 +09:00
count - = dn . ofs_in_node ;
2014-09-02 15:52:58 -07:00
f2fs_bug_on ( sbi , count < 0 ) ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 16:21:29 +09:00
2012-11-02 17:09:44 +09:00
if ( dn . ofs_in_node | | IS_INODE ( dn . node_page ) ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_truncate_data_blocks_range ( & dn , count ) ;
2012-11-02 17:09:44 +09:00
free_from + = count ;
}
f2fs_put_dnode ( & dn ) ;
free_next :
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
err = f2fs_truncate_inode_blocks ( inode , free_from ) ;
2014-11-11 11:01:01 -08:00
out :
if ( lock )
2019-02-02 17:33:01 +08:00
f2fs_unlock_op ( sbi ) ;
2016-05-05 19:13:03 +08:00
free_partial :
2014-10-23 19:48:09 -07:00
/* lastly zero out the first data page */
if ( ! err )
2015-03-10 13:16:25 +08:00
err = truncate_partial_data_page ( inode , from , truncate_page ) ;
2012-11-02 17:09:44 +09:00
2013-04-20 01:28:52 +09:00
trace_f2fs_truncate_blocks_exit ( inode , err ) ;
2012-11-02 17:09:44 +09:00
return err ;
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
int f2fs_truncate_blocks ( struct inode * inode , u64 from , bool lock )
{
u64 free_from = from ;
2020-03-18 16:22:59 +08:00
int err ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
2020-03-18 16:22:59 +08:00
# ifdef CONFIG_F2FS_FS_COMPRESSION
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
/*
* for compressed file , only support cluster size
* aligned truncation .
*/
2020-04-08 19:55:17 +08:00
if ( f2fs_compressed_file ( inode ) )
free_from = round_up ( from ,
F2FS_I ( inode ) - > i_cluster_size < < PAGE_SHIFT ) ;
2020-03-18 16:22:59 +08:00
# endif
err = f2fs_do_truncate_blocks ( inode , free_from , lock ) ;
if ( err )
return err ;
# ifdef CONFIG_F2FS_FS_COMPRESSION
2021-08-09 10:21:04 +08:00
/*
* For compressed file , after release compress blocks , don ' t allow write
* direct , but we should allow write direct after truncate to zero .
*/
if ( f2fs_compressed_file ( inode ) & & ! free_from
& & is_inode_flag_set ( inode , FI_COMPRESS_RELEASED ) )
clear_inode_flag ( inode , FI_COMPRESS_RELEASED ) ;
2020-08-10 18:38:45 +08:00
if ( from ! = free_from ) {
2020-03-18 16:22:59 +08:00
err = f2fs_truncate_partial_cluster ( inode , from , lock ) ;
2020-08-10 18:38:45 +08:00
if ( err )
return err ;
}
2020-03-18 16:22:59 +08:00
# endif
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
2020-08-10 18:38:45 +08:00
return 0 ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
}
2016-06-02 13:49:38 -07:00
int f2fs_truncate ( struct inode * inode )
2012-11-02 17:09:44 +09:00
{
2015-08-24 17:39:42 +08:00
int err ;
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) )
return - EIO ;
2012-11-02 17:09:44 +09:00
if ( ! ( S_ISREG ( inode - > i_mode ) | | S_ISDIR ( inode - > i_mode ) | |
S_ISLNK ( inode - > i_mode ) ) )
2015-08-24 17:39:42 +08:00
return 0 ;
2012-11-02 17:09:44 +09:00
2013-04-20 01:28:52 +09:00
trace_f2fs_truncate ( inode ) ;
2017-03-09 15:24:24 -08:00
if ( time_to_inject ( F2FS_I_SB ( inode ) , FAULT_TRUNCATE ) ) {
2019-11-01 17:53:23 +08:00
f2fs_show_injection_info ( F2FS_I_SB ( inode ) , FAULT_TRUNCATE ) ;
2017-03-09 15:24:24 -08:00
return - EIO ;
}
2018-08-13 23:38:06 +02:00
2021-10-28 21:03:05 +08:00
err = f2fs_dquot_initialize ( inode ) ;
f2fs: fix to avoid inconsistent quota data
Occasionally, quota data may be corrupted detected by fsck:
Info: checkpoint state = 45 : crc compacted_summary unmount
[QUOTA WARNING] Usage inconsistent for ID 0:actual (1543036928, 762) != expected (1543032832, 762)
[ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found.
[QUOTA WARNING] Usage inconsistent for ID 0:actual (1352478720, 344) != expected (1352474624, 344)
[ASSERT] (fsck_chk_quota_files:1986) --> Quota file is missing or invalid quota file content found.
[FSCK] Unreachable nat entries [Ok..] [0x0]
[FSCK] SIT valid block bitmap checking [Ok..]
[FSCK] Hard link checking for regular file [Ok..] [0x0]
[FSCK] valid_block_count matching with CP [Ok..] [0xdf299]
[FSCK] valid_node_count matcing with CP (de lookup) [Ok..] [0x2b01]
[FSCK] valid_node_count matcing with CP (nat lookup) [Ok..] [0x2b01]
[FSCK] valid_inode_count matched with CP [Ok..] [0x2665]
[FSCK] free segment_count matched with CP [Ok..] [0xcb04]
[FSCK] next block offset is free [Ok..]
[FSCK] fixing SIT types
[FSCK] other corrupted bugs [Fail]
The root cause is:
If we open file w/ readonly flag, disk quota info won't be initialized
for this file, however, following mmap() will force to convert inline
inode via f2fs_convert_inline_inode(), which may increase block usage
for this inode w/o updating quota data, it causes inconsistent disk quota
info.
The issue will happen in following stack:
open(file, O_RDONLY)
mmap(file)
- f2fs_convert_inline_inode
- f2fs_convert_inline_page
- f2fs_reserve_block
- f2fs_reserve_new_block
- f2fs_reserve_new_blocks
- f2fs_i_blocks_write
- dquot_claim_block
inode->i_blocks increase, but the dqb_curspace keep the size for the dquots
is NULL.
To fix this issue, let's call dquot_initialize() anyway in both
f2fs_truncate() and f2fs_convert_inline_inode() functions to avoid potential
inconsistent quota data issue.
Fixes: 0abd675e97e6 ("f2fs: support plain user/group quota")
Signed-off-by: Daiyue Zhang <zhangdaiyue1@huawei.com>
Signed-off-by: Dehe Gu <gudehe@huawei.com>
Signed-off-by: Junchao Jiang <jiangjunchao1@huawei.com>
Signed-off-by: Ge Qiu <qiuge@huawei.com>
Signed-off-by: Yi Chen <chenyi77@huawei.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2021-01-28 17:02:56 +08:00
if ( err )
return err ;
2014-11-11 14:10:01 -08:00
/* we should check inline_data size */
2015-12-22 11:09:35 -08:00
if ( ! f2fs_may_inline_data ( inode ) ) {
2015-08-24 17:39:42 +08:00
err = f2fs_convert_inline_inode ( inode ) ;
if ( err )
return err ;
2014-11-11 14:10:01 -08:00
}
2019-02-02 17:33:01 +08:00
err = f2fs_truncate_blocks ( inode , i_size_read ( inode ) , true ) ;
2015-08-24 17:39:42 +08:00
if ( err )
return err ;
2016-09-14 07:48:04 -07:00
inode - > i_mtime = inode - > i_ctime = current_time ( inode ) ;
2016-10-14 11:51:23 -07:00
f2fs_mark_inode_dirty_sync ( inode , false ) ;
2015-08-24 17:39:42 +08:00
return 0 ;
2012-11-02 17:09:44 +09:00
}
2021-01-21 14:19:43 +01:00
int f2fs_getattr ( struct user_namespace * mnt_userns , const struct path * path ,
struct kstat * stat , u32 request_mask , unsigned int query_flags )
2012-11-02 17:09:44 +09:00
{
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
function.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were to ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabal might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided and __spares*[] are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlinks, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
unless as a byproduct of updating something requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication.
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev, otherwise will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-01-31 16:46:22 +00:00
struct inode * inode = d_inode ( path - > dentry ) ;
2017-05-03 23:59:12 +08:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
2018-01-25 14:54:42 +08:00
struct f2fs_inode * ri ;
2017-05-03 23:59:12 +08:00
unsigned int flags ;
2018-01-25 14:54:42 +08:00
if ( f2fs_has_extra_attr ( inode ) & &
2018-10-24 18:34:26 +08:00
f2fs_sb_has_inode_crtime ( F2FS_I_SB ( inode ) ) & &
2018-01-25 14:54:42 +08:00
F2FS_FITS_IN_INODE ( ri , fi - > i_extra_isize , i_crtime ) ) {
stat - > result_mask | = STATX_BTIME ;
stat - > btime . tv_sec = fi - > i_crtime . tv_sec ;
stat - > btime . tv_nsec = fi - > i_crtime . tv_nsec ;
}
2019-06-04 22:59:04 -07:00
flags = fi - > i_flags ;
2020-03-25 10:22:09 +08:00
if ( flags & F2FS_COMPR_FL )
stat - > attributes | = STATX_ATTR_COMPRESSED ;
2018-04-03 15:08:17 +08:00
if ( flags & F2FS_APPEND_FL )
2017-05-03 23:59:12 +08:00
stat - > attributes | = STATX_ATTR_APPEND ;
2018-12-12 15:20:11 +05:30
if ( IS_ENCRYPTED ( inode ) )
2017-05-03 23:59:12 +08:00
stat - > attributes | = STATX_ATTR_ENCRYPTED ;
2018-04-03 15:08:17 +08:00
if ( flags & F2FS_IMMUTABLE_FL )
2017-05-03 23:59:12 +08:00
stat - > attributes | = STATX_ATTR_IMMUTABLE ;
2018-04-03 15:08:17 +08:00
if ( flags & F2FS_NODUMP_FL )
2017-05-03 23:59:12 +08:00
stat - > attributes | = STATX_ATTR_NODUMP ;
2019-10-29 13:41:40 -07:00
if ( IS_VERITY ( inode ) )
stat - > attributes | = STATX_ATTR_VERITY ;
2017-05-03 23:59:12 +08:00
2020-03-25 10:22:09 +08:00
stat - > attributes_mask | = ( STATX_ATTR_COMPRESSED |
STATX_ATTR_APPEND |
2017-05-03 23:59:12 +08:00
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
2019-10-29 13:41:40 -07:00
STATX_ATTR_NODUMP |
STATX_ATTR_VERITY ) ;
2017-05-03 23:59:12 +08:00
2021-01-21 14:19:30 +01:00
generic_fillattr ( & init_user_ns , inode , stat ) ;
2017-10-13 10:27:45 -07:00
/* we need to show initial sectors used for inline_data/dentries */
if ( ( S_ISREG ( inode - > i_mode ) & & f2fs_has_inline_data ( inode ) ) | |
f2fs_has_inline_dentry ( inode ) )
stat - > blocks + = ( stat - > size + 511 ) > > 9 ;
2012-11-02 17:09:44 +09:00
return 0 ;
}
# ifdef CONFIG_F2FS_FS_POSIX_ACL
2021-01-21 14:19:27 +01:00
static void __setattr_copy ( struct user_namespace * mnt_userns ,
struct inode * inode , const struct iattr * attr )
2012-11-02 17:09:44 +09:00
{
unsigned int ia_valid = attr - > ia_valid ;
if ( ia_valid & ATTR_UID )
inode - > i_uid = attr - > ia_uid ;
if ( ia_valid & ATTR_GID )
inode - > i_gid = attr - > ia_gid ;
2019-11-24 21:31:45 +02:00
if ( ia_valid & ATTR_ATIME )
inode - > i_atime = attr - > ia_atime ;
if ( ia_valid & ATTR_MTIME )
inode - > i_mtime = attr - > ia_mtime ;
if ( ia_valid & ATTR_CTIME )
inode - > i_ctime = attr - > ia_ctime ;
2012-11-02 17:09:44 +09:00
if ( ia_valid & ATTR_MODE ) {
umode_t mode = attr - > ia_mode ;
2021-01-21 14:19:26 +01:00
kgid_t kgid = i_gid_into_mnt ( mnt_userns , inode ) ;
2012-11-02 17:09:44 +09:00
idmapped-mounts-v5.12
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCYCegywAKCRCRxhvAZXjc
ouJ6AQDlf+7jCQlQdeKKoN9QDFfMzG1ooemat36EpRRTONaGuAD8D9A4sUsG4+5f
4IU5Lj9oY4DEmF8HenbWK2ZHsesL2Qg=
=yPaw
-----END PGP SIGNATURE-----
Merge tag 'idmapped-mounts-v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull idmapped mounts from Christian Brauner:
"This introduces idmapped mounts which has been in the making for some
time. Simply put, different mounts can expose the same file or
directory with different ownership. This initial implementation comes
with ports for fat, ext4 and with Christoph's port for xfs with more
filesystems being actively worked on by independent people and
maintainers.
Idmapping mounts handle a wide range of long standing use-cases. Here
are just a few:
- Idmapped mounts make it possible to easily share files between
multiple users or multiple machines especially in complex
scenarios. For example, idmapped mounts will be used in the
implementation of portable home directories in
systemd-homed.service(8) where they allow users to move their home
directory to an external storage device and use it on multiple
computers where they are assigned different uids and gids. This
effectively makes it possible to assign random uids and gids at
login time.
- It is possible to share files from the host with unprivileged
containers without having to change ownership permanently through
chown(2).
- It is possible to idmap a container's rootfs and without having to
mangle every file. For example, Chromebooks use it to share the
user's Download folder with their unprivileged containers in their
Linux subsystem.
- It is possible to share files between containers with
non-overlapping idmappings.
- Filesystem that lack a proper concept of ownership such as fat can
use idmapped mounts to implement discretionary access (DAC)
permission checking.
- They allow users to efficiently changing ownership on a per-mount
basis without having to (recursively) chown(2) all files. In
contrast to chown (2) changing ownership of large sets of files is
instantenous with idmapped mounts. This is especially useful when
ownership of a whole root filesystem of a virtual machine or
container is changed. With idmapped mounts a single syscall
mount_setattr syscall will be sufficient to change the ownership of
all files.
- Idmapped mounts always take the current ownership into account as
idmappings specify what a given uid or gid is supposed to be mapped
to. This contrasts with the chown(2) syscall which cannot by itself
take the current ownership of the files it changes into account. It
simply changes the ownership to the specified uid and gid. This is
especially problematic when recursively chown(2)ing a large set of
files which is commong with the aforementioned portable home
directory and container and vm scenario.
- Idmapped mounts allow to change ownership locally, restricting it
to specific mounts, and temporarily as the ownership changes only
apply as long as the mount exists.
Several userspace projects have either already put up patches and
pull-requests for this feature or will do so should you decide to pull
this:
- systemd: In a wide variety of scenarios but especially right away
in their implementation of portable home directories.
https://systemd.io/HOME_DIRECTORY/
- container runtimes: containerd, runC, LXD:To share data between
host and unprivileged containers, unprivileged and privileged
containers, etc. The pull request for idmapped mounts support in
containerd, the default Kubernetes runtime is already up for quite
a while now: https://github.com/containerd/containerd/pull/4734
- The virtio-fs developers and several users have expressed interest
in using this feature with virtual machines once virtio-fs is
ported.
- ChromeOS: Sharing host-directories with unprivileged containers.
I've tightly synced with all those projects and all of those listed
here have also expressed their need/desire for this feature on the
mailing list. For more info on how people use this there's a bunch of
talks about this too. Here's just two recent ones:
https://www.cncf.io/wp-content/uploads/2020/12/Rootless-Containers-in-Gitpod.pdf
https://fosdem.org/2021/schedule/event/containers_idmap/
This comes with an extensive xfstests suite covering both ext4 and
xfs:
https://git.kernel.org/brauner/xfstests-dev/h/idmapped_mounts
It covers truncation, creation, opening, xattrs, vfscaps, setid
execution, setgid inheritance and more both with idmapped and
non-idmapped mounts. It already helped to discover an unrelated xfs
setgid inheritance bug which has since been fixed in mainline. It will
be sent for inclusion with the xfstests project should you decide to
merge this.
In order to support per-mount idmappings vfsmounts are marked with
user namespaces. The idmapping of the user namespace will be used to
map the ids of vfs objects when they are accessed through that mount.
By default all vfsmounts are marked with the initial user namespace.
The initial user namespace is used to indicate that a mount is not
idmapped. All operations behave as before and this is verified in the
testsuite.
Based on prior discussions we want to attach the whole user namespace
and not just a dedicated idmapping struct. This allows us to reuse all
the helpers that already exist for dealing with idmappings instead of
introducing a whole new range of helpers. In addition, if we decide in
the future that we are confident enough to enable unprivileged users
to setup idmapped mounts the permission checking can take into account
whether the caller is privileged in the user namespace the mount is
currently marked with.
The user namespace the mount will be marked with can be specified by
passing a file descriptor refering to the user namespace as an
argument to the new mount_setattr() syscall together with the new
MOUNT_ATTR_IDMAP flag. The system call follows the openat2() pattern
of extensibility.
The following conditions must be met in order to create an idmapped
mount:
- The caller must currently have the CAP_SYS_ADMIN capability in the
user namespace the underlying filesystem has been mounted in.
- The underlying filesystem must support idmapped mounts.
- The mount must not already be idmapped. This also implies that the
idmapping of a mount cannot be altered once it has been idmapped.
- The mount must be a detached/anonymous mount, i.e. it must have
been created by calling open_tree() with the OPEN_TREE_CLONE flag
and it must not already have been visible in the filesystem.
The last two points guarantee easier semantics for userspace and the
kernel and make the implementation significantly simpler.
By default vfsmounts are marked with the initial user namespace and no
behavioral or performance changes are observed.
The manpage with a detailed description can be found here:
https://git.kernel.org/brauner/man-pages/c/1d7b902e2875a1ff342e036a9f866a995640aea8
In order to support idmapped mounts, filesystems need to be changed
and mark themselves with the FS_ALLOW_IDMAP flag in fs_flags. The
patches to convert individual filesystem are not very large or
complicated overall as can be seen from the included fat, ext4, and
xfs ports. Patches for other filesystems are actively worked on and
will be sent out separately. The xfstestsuite can be used to verify
that port has been done correctly.
The mount_setattr() syscall is motivated independent of the idmapped
mounts patches and it's been around since July 2019. One of the most
valuable features of the new mount api is the ability to perform
mounts based on file descriptors only.
Together with the lookup restrictions available in the openat2()
RESOLVE_* flag namespace which we added in v5.6 this is the first time
we are close to hardened and race-free (e.g. symlinks) mounting and
path resolution.
While userspace has started porting to the new mount api to mount
proper filesystems and create new bind-mounts it is currently not
possible to change mount options of an already existing bind mount in
the new mount api since the mount_setattr() syscall is missing.
With the addition of the mount_setattr() syscall we remove this last
restriction and userspace can now fully port to the new mount api,
covering every use-case the old mount api could. We also add the
crucial ability to recursively change mount options for a whole mount
tree, both removing and adding mount options at the same time. This
syscall has been requested multiple times by various people and
projects.
There is a simple tool available at
https://github.com/brauner/mount-idmapped
that allows to create idmapped mounts so people can play with this
patch series. I'll add support for the regular mount binary should you
decide to pull this in the following weeks:
Here's an example to a simple idmapped mount of another user's home
directory:
u1001@f2-vm:/$ sudo ./mount --idmap both:1000:1001:1 /home/ubuntu/ /mnt
u1001@f2-vm:/$ ls -al /home/ubuntu/
total 28
drwxr-xr-x 2 ubuntu ubuntu 4096 Oct 28 22:07 .
drwxr-xr-x 4 root root 4096 Oct 28 04:00 ..
-rw------- 1 ubuntu ubuntu 3154 Oct 28 22:12 .bash_history
-rw-r--r-- 1 ubuntu ubuntu 220 Feb 25 2020 .bash_logout
-rw-r--r-- 1 ubuntu ubuntu 3771 Feb 25 2020 .bashrc
-rw-r--r-- 1 ubuntu ubuntu 807 Feb 25 2020 .profile
-rw-r--r-- 1 ubuntu ubuntu 0 Oct 16 16:11 .sudo_as_admin_successful
-rw------- 1 ubuntu ubuntu 1144 Oct 28 00:43 .viminfo
u1001@f2-vm:/$ ls -al /mnt/
total 28
drwxr-xr-x 2 u1001 u1001 4096 Oct 28 22:07 .
drwxr-xr-x 29 root root 4096 Oct 28 22:01 ..
-rw------- 1 u1001 u1001 3154 Oct 28 22:12 .bash_history
-rw-r--r-- 1 u1001 u1001 220 Feb 25 2020 .bash_logout
-rw-r--r-- 1 u1001 u1001 3771 Feb 25 2020 .bashrc
-rw-r--r-- 1 u1001 u1001 807 Feb 25 2020 .profile
-rw-r--r-- 1 u1001 u1001 0 Oct 16 16:11 .sudo_as_admin_successful
-rw------- 1 u1001 u1001 1144 Oct 28 00:43 .viminfo
u1001@f2-vm:/$ touch /mnt/my-file
u1001@f2-vm:/$ setfacl -m u:1001:rwx /mnt/my-file
u1001@f2-vm:/$ sudo setcap -n 1001 cap_net_raw+ep /mnt/my-file
u1001@f2-vm:/$ ls -al /mnt/my-file
-rw-rwxr--+ 1 u1001 u1001 0 Oct 28 22:14 /mnt/my-file
u1001@f2-vm:/$ ls -al /home/ubuntu/my-file
-rw-rwxr--+ 1 ubuntu ubuntu 0 Oct 28 22:14 /home/ubuntu/my-file
u1001@f2-vm:/$ getfacl /mnt/my-file
getfacl: Removing leading '/' from absolute path names
# file: mnt/my-file
# owner: u1001
# group: u1001
user::rw-
user:u1001:rwx
group::rw-
mask::rwx
other::r--
u1001@f2-vm:/$ getfacl /home/ubuntu/my-file
getfacl: Removing leading '/' from absolute path names
# file: home/ubuntu/my-file
# owner: ubuntu
# group: ubuntu
user::rw-
user:ubuntu:rwx
group::rw-
mask::rwx
other::r--"
* tag 'idmapped-mounts-v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: (41 commits)
xfs: remove the possibly unused mp variable in xfs_file_compat_ioctl
xfs: support idmapped mounts
ext4: support idmapped mounts
fat: handle idmapped mounts
tests: add mount_setattr() selftests
fs: introduce MOUNT_ATTR_IDMAP
fs: add mount_setattr()
fs: add attr_flags_to_mnt_flags helper
fs: split out functions to hold writers
namespace: only take read lock in do_reconfigure_mnt()
mount: make {lock,unlock}_mount_hash() static
namespace: take lock_mount_hash() directly when changing flags
nfs: do not export idmapped mounts
overlayfs: do not mount on top of idmapped mounts
ecryptfs: do not mount on top of idmapped mounts
ima: handle idmapped mounts
apparmor: handle idmapped mounts
fs: make helpers idmap mount aware
exec: handle idmapped mounts
would_dump: handle idmapped mounts
...
2021-02-23 13:39:45 -08:00
if ( ! in_group_p ( kgid ) & & ! capable_wrt_inode_uidgid ( mnt_userns , inode , CAP_FSETID ) )
2012-11-02 17:09:44 +09:00
mode & = ~ S_ISGID ;
2016-05-20 10:13:22 -07:00
set_acl_inode ( inode , mode ) ;
2012-11-02 17:09:44 +09:00
}
}
# else
# define __setattr_copy setattr_copy
# endif
2021-01-21 14:19:43 +01:00
int f2fs_setattr ( struct user_namespace * mnt_userns , struct dentry * dentry ,
struct iattr * attr )
2012-11-02 17:09:44 +09:00
{
2015-03-17 22:25:59 +00:00
struct inode * inode = d_inode ( dentry ) ;
2012-11-02 17:09:44 +09:00
int err ;
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) )
return - EIO ;
2020-12-26 18:07:01 +08:00
if ( unlikely ( IS_IMMUTABLE ( inode ) ) )
return - EPERM ;
if ( unlikely ( IS_APPEND ( inode ) & &
( attr - > ia_valid & ( ATTR_MODE | ATTR_UID |
ATTR_GID | ATTR_TIMES_SET ) ) ) )
return - EPERM ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( ( attr - > ia_valid & ATTR_SIZE ) & &
! f2fs_is_compress_backend_ready ( inode ) )
return - EOPNOTSUPP ;
2021-01-21 14:19:26 +01:00
err = setattr_prepare ( & init_user_ns , dentry , attr ) ;
2012-11-02 17:09:44 +09:00
if ( err )
return err ;
2017-11-29 12:35:32 -08:00
err = fscrypt_prepare_setattr ( dentry , attr ) ;
if ( err )
return err ;
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
err = fsverity_prepare_setattr ( dentry , attr ) ;
if ( err )
return err ;
2017-07-09 00:13:07 +08:00
if ( is_quota_modification ( inode , attr ) ) {
2021-10-28 21:03:05 +08:00
err = f2fs_dquot_initialize ( inode ) ;
2017-07-09 00:13:07 +08:00
if ( err )
return err ;
}
if ( ( attr - > ia_valid & ATTR_UID & &
! uid_eq ( attr - > ia_uid , inode - > i_uid ) ) | |
( attr - > ia_valid & ATTR_GID & &
! gid_eq ( attr - > ia_gid , inode - > i_gid ) ) ) {
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
f2fs_lock_op ( F2FS_I_SB ( inode ) ) ;
2017-07-09 00:13:07 +08:00
err = dquot_transfer ( inode , attr ) ;
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
if ( err ) {
set_sbi_flag ( F2FS_I_SB ( inode ) ,
SBI_QUOTA_NEED_REPAIR ) ;
f2fs_unlock_op ( F2FS_I_SB ( inode ) ) ;
2017-07-09 00:13:07 +08:00
return err ;
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
}
/*
* update uid / gid under lock_op ( ) , so that dquot and inode can
* be updated atomically .
*/
if ( attr - > ia_valid & ATTR_UID )
inode - > i_uid = attr - > ia_uid ;
if ( attr - > ia_valid & ATTR_GID )
inode - > i_gid = attr - > ia_gid ;
f2fs_mark_inode_dirty_sync ( inode , true ) ;
f2fs_unlock_op ( F2FS_I_SB ( inode ) ) ;
2017-07-09 00:13:07 +08:00
}
2014-09-15 18:02:09 +08:00
if ( attr - > ia_valid & ATTR_SIZE ) {
2019-09-03 10:06:26 +08:00
loff_t old_size = i_size_read ( inode ) ;
if ( attr - > ia_size > MAX_INLINE_DATA ( inode ) ) {
/*
* should convert inline inode before i_size_write to
* keep smaller than inline_data size with inline flag .
*/
err = f2fs_convert_inline_inode ( inode ) ;
if ( err )
return err ;
}
2018-08-05 23:04:25 +08:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2018-08-05 23:04:25 +08:00
truncate_setsize ( inode , attr - > ia_size ) ;
2019-09-03 10:06:26 +08:00
if ( attr - > ia_size < = old_size )
2016-06-02 13:49:38 -07:00
err = f2fs_truncate ( inode ) ;
2018-08-05 23:04:25 +08:00
/*
* do not trim all blocks after i_size if target size is
* larger than i_size .
*/
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2018-07-25 12:11:56 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2018-08-05 23:04:25 +08:00
if ( err )
return err ;
2015-12-01 11:36:16 +08:00
2020-02-27 19:30:03 +08:00
spin_lock ( & F2FS_I ( inode ) - > i_size_lock ) ;
2019-09-03 10:06:26 +08:00
inode - > i_mtime = inode - > i_ctime = current_time ( inode ) ;
2017-10-09 17:55:19 +08:00
F2FS_I ( inode ) - > last_disk_size = i_size_read ( inode ) ;
2020-02-27 19:30:03 +08:00
spin_unlock ( & F2FS_I ( inode ) - > i_size_lock ) ;
2012-11-02 17:09:44 +09:00
}
2021-01-21 14:19:26 +01:00
__setattr_copy ( & init_user_ns , inode , attr ) ;
2012-11-02 17:09:44 +09:00
if ( attr - > ia_valid & ATTR_MODE ) {
idmapped-mounts-v5.12
-----BEGIN PGP SIGNATURE-----
iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCYCegywAKCRCRxhvAZXjc
ouJ6AQDlf+7jCQlQdeKKoN9QDFfMzG1ooemat36EpRRTONaGuAD8D9A4sUsG4+5f
4IU5Lj9oY4DEmF8HenbWK2ZHsesL2Qg=
=yPaw
-----END PGP SIGNATURE-----
Merge tag 'idmapped-mounts-v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull idmapped mounts from Christian Brauner:
"This introduces idmapped mounts which has been in the making for some
time. Simply put, different mounts can expose the same file or
directory with different ownership. This initial implementation comes
with ports for fat, ext4 and with Christoph's port for xfs with more
filesystems being actively worked on by independent people and
maintainers.
Idmapping mounts handle a wide range of long standing use-cases. Here
are just a few:
- Idmapped mounts make it possible to easily share files between
multiple users or multiple machines especially in complex
scenarios. For example, idmapped mounts will be used in the
implementation of portable home directories in
systemd-homed.service(8) where they allow users to move their home
directory to an external storage device and use it on multiple
computers where they are assigned different uids and gids. This
effectively makes it possible to assign random uids and gids at
login time.
- It is possible to share files from the host with unprivileged
containers without having to change ownership permanently through
chown(2).
- It is possible to idmap a container's rootfs and without having to
mangle every file. For example, Chromebooks use it to share the
user's Download folder with their unprivileged containers in their
Linux subsystem.
- It is possible to share files between containers with
non-overlapping idmappings.
- Filesystem that lack a proper concept of ownership such as fat can
use idmapped mounts to implement discretionary access (DAC)
permission checking.
- They allow users to efficiently changing ownership on a per-mount
basis without having to (recursively) chown(2) all files. In
contrast to chown (2) changing ownership of large sets of files is
instantenous with idmapped mounts. This is especially useful when
ownership of a whole root filesystem of a virtual machine or
container is changed. With idmapped mounts a single syscall
mount_setattr syscall will be sufficient to change the ownership of
all files.
- Idmapped mounts always take the current ownership into account as
idmappings specify what a given uid or gid is supposed to be mapped
to. This contrasts with the chown(2) syscall which cannot by itself
take the current ownership of the files it changes into account. It
simply changes the ownership to the specified uid and gid. This is
especially problematic when recursively chown(2)ing a large set of
files which is commong with the aforementioned portable home
directory and container and vm scenario.
- Idmapped mounts allow to change ownership locally, restricting it
to specific mounts, and temporarily as the ownership changes only
apply as long as the mount exists.
Several userspace projects have either already put up patches and
pull-requests for this feature or will do so should you decide to pull
this:
- systemd: In a wide variety of scenarios but especially right away
in their implementation of portable home directories.
https://systemd.io/HOME_DIRECTORY/
- container runtimes: containerd, runC, LXD:To share data between
host and unprivileged containers, unprivileged and privileged
containers, etc. The pull request for idmapped mounts support in
containerd, the default Kubernetes runtime is already up for quite
a while now: https://github.com/containerd/containerd/pull/4734
- The virtio-fs developers and several users have expressed interest
in using this feature with virtual machines once virtio-fs is
ported.
- ChromeOS: Sharing host-directories with unprivileged containers.
I've tightly synced with all those projects and all of those listed
here have also expressed their need/desire for this feature on the
mailing list. For more info on how people use this there's a bunch of
talks about this too. Here's just two recent ones:
https://www.cncf.io/wp-content/uploads/2020/12/Rootless-Containers-in-Gitpod.pdf
https://fosdem.org/2021/schedule/event/containers_idmap/
This comes with an extensive xfstests suite covering both ext4 and
xfs:
https://git.kernel.org/brauner/xfstests-dev/h/idmapped_mounts
It covers truncation, creation, opening, xattrs, vfscaps, setid
execution, setgid inheritance and more both with idmapped and
non-idmapped mounts. It already helped to discover an unrelated xfs
setgid inheritance bug which has since been fixed in mainline. It will
be sent for inclusion with the xfstests project should you decide to
merge this.
In order to support per-mount idmappings vfsmounts are marked with
user namespaces. The idmapping of the user namespace will be used to
map the ids of vfs objects when they are accessed through that mount.
By default all vfsmounts are marked with the initial user namespace.
The initial user namespace is used to indicate that a mount is not
idmapped. All operations behave as before and this is verified in the
testsuite.
Based on prior discussions we want to attach the whole user namespace
and not just a dedicated idmapping struct. This allows us to reuse all
the helpers that already exist for dealing with idmappings instead of
introducing a whole new range of helpers. In addition, if we decide in
the future that we are confident enough to enable unprivileged users
to setup idmapped mounts the permission checking can take into account
whether the caller is privileged in the user namespace the mount is
currently marked with.
The user namespace the mount will be marked with can be specified by
passing a file descriptor refering to the user namespace as an
argument to the new mount_setattr() syscall together with the new
MOUNT_ATTR_IDMAP flag. The system call follows the openat2() pattern
of extensibility.
The following conditions must be met in order to create an idmapped
mount:
- The caller must currently have the CAP_SYS_ADMIN capability in the
user namespace the underlying filesystem has been mounted in.
- The underlying filesystem must support idmapped mounts.
- The mount must not already be idmapped. This also implies that the
idmapping of a mount cannot be altered once it has been idmapped.
- The mount must be a detached/anonymous mount, i.e. it must have
been created by calling open_tree() with the OPEN_TREE_CLONE flag
and it must not already have been visible in the filesystem.
The last two points guarantee easier semantics for userspace and the
kernel and make the implementation significantly simpler.
By default vfsmounts are marked with the initial user namespace and no
behavioral or performance changes are observed.
The manpage with a detailed description can be found here:
https://git.kernel.org/brauner/man-pages/c/1d7b902e2875a1ff342e036a9f866a995640aea8
In order to support idmapped mounts, filesystems need to be changed
and mark themselves with the FS_ALLOW_IDMAP flag in fs_flags. The
patches to convert individual filesystem are not very large or
complicated overall as can be seen from the included fat, ext4, and
xfs ports. Patches for other filesystems are actively worked on and
will be sent out separately. The xfstestsuite can be used to verify
that port has been done correctly.
The mount_setattr() syscall is motivated independent of the idmapped
mounts patches and it's been around since July 2019. One of the most
valuable features of the new mount api is the ability to perform
mounts based on file descriptors only.
Together with the lookup restrictions available in the openat2()
RESOLVE_* flag namespace which we added in v5.6 this is the first time
we are close to hardened and race-free (e.g. symlinks) mounting and
path resolution.
While userspace has started porting to the new mount api to mount
proper filesystems and create new bind-mounts it is currently not
possible to change mount options of an already existing bind mount in
the new mount api since the mount_setattr() syscall is missing.
With the addition of the mount_setattr() syscall we remove this last
restriction and userspace can now fully port to the new mount api,
covering every use-case the old mount api could. We also add the
crucial ability to recursively change mount options for a whole mount
tree, both removing and adding mount options at the same time. This
syscall has been requested multiple times by various people and
projects.
There is a simple tool available at
https://github.com/brauner/mount-idmapped
that allows to create idmapped mounts so people can play with this
patch series. I'll add support for the regular mount binary should you
decide to pull this in the following weeks:
Here's an example to a simple idmapped mount of another user's home
directory:
u1001@f2-vm:/$ sudo ./mount --idmap both:1000:1001:1 /home/ubuntu/ /mnt
u1001@f2-vm:/$ ls -al /home/ubuntu/
total 28
drwxr-xr-x 2 ubuntu ubuntu 4096 Oct 28 22:07 .
drwxr-xr-x 4 root root 4096 Oct 28 04:00 ..
-rw------- 1 ubuntu ubuntu 3154 Oct 28 22:12 .bash_history
-rw-r--r-- 1 ubuntu ubuntu 220 Feb 25 2020 .bash_logout
-rw-r--r-- 1 ubuntu ubuntu 3771 Feb 25 2020 .bashrc
-rw-r--r-- 1 ubuntu ubuntu 807 Feb 25 2020 .profile
-rw-r--r-- 1 ubuntu ubuntu 0 Oct 16 16:11 .sudo_as_admin_successful
-rw------- 1 ubuntu ubuntu 1144 Oct 28 00:43 .viminfo
u1001@f2-vm:/$ ls -al /mnt/
total 28
drwxr-xr-x 2 u1001 u1001 4096 Oct 28 22:07 .
drwxr-xr-x 29 root root 4096 Oct 28 22:01 ..
-rw------- 1 u1001 u1001 3154 Oct 28 22:12 .bash_history
-rw-r--r-- 1 u1001 u1001 220 Feb 25 2020 .bash_logout
-rw-r--r-- 1 u1001 u1001 3771 Feb 25 2020 .bashrc
-rw-r--r-- 1 u1001 u1001 807 Feb 25 2020 .profile
-rw-r--r-- 1 u1001 u1001 0 Oct 16 16:11 .sudo_as_admin_successful
-rw------- 1 u1001 u1001 1144 Oct 28 00:43 .viminfo
u1001@f2-vm:/$ touch /mnt/my-file
u1001@f2-vm:/$ setfacl -m u:1001:rwx /mnt/my-file
u1001@f2-vm:/$ sudo setcap -n 1001 cap_net_raw+ep /mnt/my-file
u1001@f2-vm:/$ ls -al /mnt/my-file
-rw-rwxr--+ 1 u1001 u1001 0 Oct 28 22:14 /mnt/my-file
u1001@f2-vm:/$ ls -al /home/ubuntu/my-file
-rw-rwxr--+ 1 ubuntu ubuntu 0 Oct 28 22:14 /home/ubuntu/my-file
u1001@f2-vm:/$ getfacl /mnt/my-file
getfacl: Removing leading '/' from absolute path names
# file: mnt/my-file
# owner: u1001
# group: u1001
user::rw-
user:u1001:rwx
group::rw-
mask::rwx
other::r--
u1001@f2-vm:/$ getfacl /home/ubuntu/my-file
getfacl: Removing leading '/' from absolute path names
# file: home/ubuntu/my-file
# owner: ubuntu
# group: ubuntu
user::rw-
user:ubuntu:rwx
group::rw-
mask::rwx
other::r--"
* tag 'idmapped-mounts-v5.12' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: (41 commits)
xfs: remove the possibly unused mp variable in xfs_file_compat_ioctl
xfs: support idmapped mounts
ext4: support idmapped mounts
fat: handle idmapped mounts
tests: add mount_setattr() selftests
fs: introduce MOUNT_ATTR_IDMAP
fs: add mount_setattr()
fs: add attr_flags_to_mnt_flags helper
fs: split out functions to hold writers
namespace: only take read lock in do_reconfigure_mnt()
mount: make {lock,unlock}_mount_hash() static
namespace: take lock_mount_hash() directly when changing flags
nfs: do not export idmapped mounts
overlayfs: do not mount on top of idmapped mounts
ecryptfs: do not mount on top of idmapped mounts
ima: handle idmapped mounts
apparmor: handle idmapped mounts
fs: make helpers idmap mount aware
exec: handle idmapped mounts
would_dump: handle idmapped mounts
...
2021-02-23 13:39:45 -08:00
err = posix_acl_chmod ( & init_user_ns , inode , f2fs_get_inode_mode ( inode ) ) ;
2020-12-25 16:52:27 +08:00
if ( is_inode_flag_set ( inode , FI_ACL_MODE ) ) {
if ( ! err )
inode - > i_mode = F2FS_I ( inode ) - > i_acl_mode ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_ACL_MODE ) ;
2012-11-02 17:09:44 +09:00
}
}
2016-12-11 15:35:15 +08:00
/* file size may changed here */
2019-02-23 09:48:27 +08:00
f2fs_mark_inode_dirty_sync ( inode , true ) ;
2016-10-14 13:30:31 -07:00
/* inode change will produce dirty node pages flushed by checkpoint */
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
2012-11-02 17:09:44 +09:00
return err ;
}
const struct inode_operations f2fs_file_inode_operations = {
. getattr = f2fs_getattr ,
. setattr = f2fs_setattr ,
. get_acl = f2fs_get_acl ,
2013-12-20 05:16:45 -08:00
. set_acl = f2fs_set_acl ,
2012-11-02 17:09:44 +09:00
. listxattr = f2fs_listxattr ,
2014-06-08 04:30:14 +09:00
. fiemap = f2fs_fiemap ,
2021-04-07 14:36:43 +02:00
. fileattr_get = f2fs_fileattr_get ,
. fileattr_set = f2fs_fileattr_set ,
2012-11-02 17:09:44 +09:00
} ;
2015-08-07 18:36:06 +08:00
static int fill_zero ( struct inode * inode , pgoff_t index ,
2012-11-02 17:09:44 +09:00
loff_t start , loff_t len )
{
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2012-11-02 17:09:44 +09:00
struct page * page ;
if ( ! len )
2015-08-07 18:36:06 +08:00
return 0 ;
2012-11-02 17:09:44 +09:00
2016-01-07 14:15:04 -08:00
f2fs_balance_fs ( sbi , true ) ;
2013-01-25 18:33:41 +09:00
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 18:08:30 +08:00
f2fs_lock_op ( sbi ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
page = f2fs_get_new_data_page ( inode , NULL , index , false ) ;
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 18:08:30 +08:00
f2fs_unlock_op ( sbi ) ;
2012-11-02 17:09:44 +09:00
2015-08-07 18:36:06 +08:00
if ( IS_ERR ( page ) )
return PTR_ERR ( page ) ;
2018-12-25 17:43:42 +08:00
f2fs_wait_on_page_writeback ( page , DATA , true , true ) ;
2015-08-07 18:36:06 +08:00
zero_user ( page , start , len ) ;
set_page_dirty ( page ) ;
f2fs_put_page ( page , 1 ) ;
return 0 ;
2012-11-02 17:09:44 +09:00
}
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
int f2fs_truncate_hole ( struct inode * inode , pgoff_t pg_start , pgoff_t pg_end )
2012-11-02 17:09:44 +09:00
{
int err ;
2015-09-17 20:22:44 +08:00
while ( pg_start < pg_end ) {
2012-11-02 17:09:44 +09:00
struct dnode_of_data dn ;
2015-09-17 20:22:44 +08:00
pgoff_t end_offset , count ;
2013-01-11 14:09:38 +09:00
2012-11-02 17:09:44 +09:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
err = f2fs_get_dnode_of_data ( & dn , pg_start , LOOKUP_NODE ) ;
2012-11-02 17:09:44 +09:00
if ( err ) {
2015-09-17 20:22:44 +08:00
if ( err = = - ENOENT ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
pg_start = f2fs_get_next_page_offset ( & dn ,
pg_start ) ;
2012-11-02 17:09:44 +09:00
continue ;
2015-09-17 20:22:44 +08:00
}
2012-11-02 17:09:44 +09:00
return err ;
}
2016-01-26 15:39:35 +08:00
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
2015-09-17 20:22:44 +08:00
count = min ( end_offset - dn . ofs_in_node , pg_end - pg_start ) ;
f2fs_bug_on ( F2FS_I_SB ( inode ) , count = = 0 | | count > end_offset ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_truncate_data_blocks_range ( & dn , count ) ;
2012-11-02 17:09:44 +09:00
f2fs_put_dnode ( & dn ) ;
2015-09-17 20:22:44 +08:00
pg_start + = count ;
2012-11-02 17:09:44 +09:00
}
return 0 ;
}
2013-11-22 16:52:50 +08:00
static int punch_hole ( struct inode * inode , loff_t offset , loff_t len )
2012-11-02 17:09:44 +09:00
{
pgoff_t pg_start , pg_end ;
loff_t off_start , off_end ;
2015-12-22 11:09:35 -08:00
int ret ;
2012-11-02 17:09:44 +09:00
2015-12-22 11:09:35 -08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
return ret ;
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pg_start = ( ( unsigned long long ) offset ) > > PAGE_SHIFT ;
pg_end = ( ( unsigned long long ) offset + len ) > > PAGE_SHIFT ;
2012-11-02 17:09:44 +09:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
off_start = offset & ( PAGE_SIZE - 1 ) ;
off_end = ( offset + len ) & ( PAGE_SIZE - 1 ) ;
2012-11-02 17:09:44 +09:00
if ( pg_start = = pg_end ) {
2015-08-07 18:36:06 +08:00
ret = fill_zero ( inode , pg_start , off_start ,
2012-11-02 17:09:44 +09:00
off_end - off_start ) ;
2015-08-07 18:36:06 +08:00
if ( ret )
return ret ;
2012-11-02 17:09:44 +09:00
} else {
2015-08-07 18:36:06 +08:00
if ( off_start ) {
ret = fill_zero ( inode , pg_start + + , off_start ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
PAGE_SIZE - off_start ) ;
2015-08-07 18:36:06 +08:00
if ( ret )
return ret ;
}
if ( off_end ) {
ret = fill_zero ( inode , pg_end , 0 , off_end ) ;
if ( ret )
return ret ;
}
2012-11-02 17:09:44 +09:00
if ( pg_start < pg_end ) {
loff_t blk_start , blk_end ;
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2013-04-08 20:16:44 -05:00
2016-01-07 14:15:04 -08:00
f2fs_balance_fs ( sbi , true ) ;
2012-11-02 17:09:44 +09:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
blk_start = ( loff_t ) pg_start < < PAGE_SHIFT ;
blk_end = ( loff_t ) pg_end < < PAGE_SHIFT ;
2018-08-05 23:04:25 +08:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
f2fs-for-5.15-rc1
In this cycle, we've addressed some performance issues such as lock contention,
misbehaving compress_cache, allowing extent_cache for compressed files, and new
sysfs to adjust ra_size for fadvise. In order to diagnose the performance issues
quickly, we also added an iostat which shows the IO latencies periodically. On
the stability side, we've found two memory leakage cases in the error path in
compression flow. And, we've also fixed various corner cases in fiemap, quota,
checkpoint=disable, zstd, and so on.
Enhancement:
- avoid long checkpoint latency by releasing nat_tree_lock
- collect and show iostats periodically
- support extent_cache for compressed files
- add a sysfs entry to manage ra_size given fadvise(POSIX_FADV_SEQUENTIAL)
- report f2fs GC status via sysfs
- add discard_unit=%s in mount option to handle zoned device
Bug fix:
- fix two memory leakages when an error happens in the compressed IO flow
- fix commpress_cache to get the right LBA
- fix fiemap to deal with compressed case correctly
- fix wrong EIO returns due to SBI_NEED_FSCK
- fix missing writes when enabling checkpoint back
- fix quota deadlock
- fix zstd level mount option
In addition to the above major updates, we've cleaned up several code paths such
as dio, unnecessary operations, debugfs/f2fs/status, sanity check, and typos.
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEE00UqedjCtOrGVvQiQBSofoJIUNIFAmEyw1sACgkQQBSofoJI
UNLJmA/+NHUgwUjLMcHvmLyp6QYpQDZtKj93/sRDo+YHOYNdYFjWWUb329PYTKWS
kEdzApCP+KHfVxeSkiL/x3qWP+RlTkIf96P0kR3/BKi0tjg25G2riFWztusDDFpt
xi+AW5sUFDvIx1tFumvQHAQedSwBgcZ96ovT5EwxEuONkljhZC9phEC6vSXz9nOR
e2EQIyezbC5O21np1KSeqSgqRMpVkJkVcEHy4VmpMBCLMOOYPepWwKw+yPaV/jR/
zUXdo2/53vma50M5LCDPCtjCtWQgLoeNeGLxyjfzQuTJU6TmtPY65JObLPt6pUSj
fRW6qIziTZbVYXzOWBD0EYilv2N4c3BNJdhQCpx2Vyjw9/LLxzqKPOUyzBoa1kjY
eZVvmaLXVCKsoJdHDSi7OH/4BqS6SuSZE8eO/nGkgswqiErHZ0Vwl3bFCWC7r/Bk
r2U5spJx/83XO6c9H1bzeWEies1DRtwnCDIRRuw35RtJ4uHZaqCfkuJ7rOBwC90X
4SpaAKdUxP2RWc3GKELBIhaqPn7vyMy9ile6VU14PjM8UcY5hyE87T2azqR8gGut
nVjRL4cbMGTPj6m1Qj8KqBRSaLuShe6AncUy7bNGiM+JlcLcdB6OJ1ZYLl9hjx2r
TbIouXThgcZ4SIK0DEaBLKz2b9/0TfaO9gw1XzpRma+bWA1pApM=
=W67o
-----END PGP SIGNATURE-----
Merge tag 'f2fs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
Pull f2fs updates from Jaegeuk Kim:
"In this cycle, we've addressed some performance issues such as lock
contention, misbehaving compress_cache, allowing extent_cache for
compressed files, and new sysfs to adjust ra_size for fadvise.
In order to diagnose the performance issues quickly, we also added an
iostat which shows the IO latencies periodically.
On the stability side, we've found two memory leakage cases in the
error path in compression flow. And, we've also fixed various corner
cases in fiemap, quota, checkpoint=disable, zstd, and so on.
Enhancements:
- avoid long checkpoint latency by releasing nat_tree_lock
- collect and show iostats periodically
- support extent_cache for compressed files
- add a sysfs entry to manage ra_size given fadvise(POSIX_FADV_SEQUENTIAL)
- report f2fs GC status via sysfs
- add discard_unit=%s in mount option to handle zoned device
Bug fixes:
- fix two memory leakages when an error happens in the compressed IO flow
- fix commpress_cache to get the right LBA
- fix fiemap to deal with compressed case correctly
- fix wrong EIO returns due to SBI_NEED_FSCK
- fix missing writes when enabling checkpoint back
- fix quota deadlock
- fix zstd level mount option
In addition to the above major updates, we've cleaned up several code
paths such as dio, unnecessary operations, debugfs/f2fs/status, sanity
check, and typos"
* tag 'f2fs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (46 commits)
f2fs: should put a page beyond EOF when preparing a write
f2fs: deallocate compressed pages when error happens
f2fs: enable realtime discard iff device supports discard
f2fs: guarantee to write dirty data when enabling checkpoint back
f2fs: fix to unmap pages from userspace process in punch_hole()
f2fs: fix unexpected ENOENT comes from f2fs_map_blocks()
f2fs: fix to account missing .skipped_gc_rwsem
f2fs: adjust unlock order for cleanup
f2fs: Don't create discard thread when device doesn't support realtime discard
f2fs: rebuild nat_bits during umount
f2fs: introduce periodic iostat io latency traces
f2fs: separate out iostat feature
f2fs: compress: do sanity check on cluster
f2fs: fix description about main_blkaddr node
f2fs: convert S_IRUGO to 0444
f2fs: fix to keep compatibility of fault injection interface
f2fs: support fault injection for f2fs_kmem_cache_alloc()
f2fs: compress: allow write compress released file after truncate to zero
f2fs: correct comment in segment.h
f2fs: improve sbi status info in debugfs/f2fs/status
...
2021-09-04 10:48:47 -07:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2018-08-05 23:04:25 +08:00
2021-08-25 19:34:19 +08:00
truncate_pagecache_range ( inode , blk_start , blk_end - 1 ) ;
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 16:21:29 +09:00
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 18:08:30 +08:00
f2fs_lock_op ( sbi ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_truncate_hole ( inode , pg_start , pg_end ) ;
f2fs: use rw_sem instead of fs_lock(locks mutex)
The fs_locks is used to block other ops(ex, recovery) when doing checkpoint.
And each other operate routine(besides checkpoint) needs to acquire a fs_lock,
there is a terrible problem here, if these are too many concurrency threads acquiring
fs_lock, so that they will block each other and may lead to some performance problem,
but this is not the phenomenon we want to see.
Though there are some optimization patches introduced to enhance the usage of fs_lock,
but the thorough solution is using a *rw_sem* to replace the fs_lock.
Checkpoint routine takes write_sem, and other ops take read_sem, so that we can block
other ops(ex, recovery) when doing checkpoint, and other ops will not disturb each other,
this can avoid the problem described above completely.
Because of the weakness of rw_sem, the above change may introduce a potential problem
that the checkpoint thread might get starved if other threads are intensively locking
the read semaphore for I/O.(Pointed out by Xu Jin)
In order to avoid this, a wait_list is introduced, the appending read semaphore ops
will be dropped into the wait_list if checkpoint thread is waiting for write semaphore,
and will be waked up when checkpoint thread gives up write semaphore.
Thanks to Kim's previous review and test, and will be very glad to see other guys'
performance tests about this patch.
V2:
-fix the potential starvation problem.
-use more suitable func name suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-09-27 18:08:30 +08:00
f2fs_unlock_op ( sbi ) ;
2018-08-05 23:04:25 +08:00
f2fs-for-5.15-rc1
In this cycle, we've addressed some performance issues such as lock contention,
misbehaving compress_cache, allowing extent_cache for compressed files, and new
sysfs to adjust ra_size for fadvise. In order to diagnose the performance issues
quickly, we also added an iostat which shows the IO latencies periodically. On
the stability side, we've found two memory leakage cases in the error path in
compression flow. And, we've also fixed various corner cases in fiemap, quota,
checkpoint=disable, zstd, and so on.
Enhancement:
- avoid long checkpoint latency by releasing nat_tree_lock
- collect and show iostats periodically
- support extent_cache for compressed files
- add a sysfs entry to manage ra_size given fadvise(POSIX_FADV_SEQUENTIAL)
- report f2fs GC status via sysfs
- add discard_unit=%s in mount option to handle zoned device
Bug fix:
- fix two memory leakages when an error happens in the compressed IO flow
- fix commpress_cache to get the right LBA
- fix fiemap to deal with compressed case correctly
- fix wrong EIO returns due to SBI_NEED_FSCK
- fix missing writes when enabling checkpoint back
- fix quota deadlock
- fix zstd level mount option
In addition to the above major updates, we've cleaned up several code paths such
as dio, unnecessary operations, debugfs/f2fs/status, sanity check, and typos.
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEE00UqedjCtOrGVvQiQBSofoJIUNIFAmEyw1sACgkQQBSofoJI
UNLJmA/+NHUgwUjLMcHvmLyp6QYpQDZtKj93/sRDo+YHOYNdYFjWWUb329PYTKWS
kEdzApCP+KHfVxeSkiL/x3qWP+RlTkIf96P0kR3/BKi0tjg25G2riFWztusDDFpt
xi+AW5sUFDvIx1tFumvQHAQedSwBgcZ96ovT5EwxEuONkljhZC9phEC6vSXz9nOR
e2EQIyezbC5O21np1KSeqSgqRMpVkJkVcEHy4VmpMBCLMOOYPepWwKw+yPaV/jR/
zUXdo2/53vma50M5LCDPCtjCtWQgLoeNeGLxyjfzQuTJU6TmtPY65JObLPt6pUSj
fRW6qIziTZbVYXzOWBD0EYilv2N4c3BNJdhQCpx2Vyjw9/LLxzqKPOUyzBoa1kjY
eZVvmaLXVCKsoJdHDSi7OH/4BqS6SuSZE8eO/nGkgswqiErHZ0Vwl3bFCWC7r/Bk
r2U5spJx/83XO6c9H1bzeWEies1DRtwnCDIRRuw35RtJ4uHZaqCfkuJ7rOBwC90X
4SpaAKdUxP2RWc3GKELBIhaqPn7vyMy9ile6VU14PjM8UcY5hyE87T2azqR8gGut
nVjRL4cbMGTPj6m1Qj8KqBRSaLuShe6AncUy7bNGiM+JlcLcdB6OJ1ZYLl9hjx2r
TbIouXThgcZ4SIK0DEaBLKz2b9/0TfaO9gw1XzpRma+bWA1pApM=
=W67o
-----END PGP SIGNATURE-----
Merge tag 'f2fs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
Pull f2fs updates from Jaegeuk Kim:
"In this cycle, we've addressed some performance issues such as lock
contention, misbehaving compress_cache, allowing extent_cache for
compressed files, and new sysfs to adjust ra_size for fadvise.
In order to diagnose the performance issues quickly, we also added an
iostat which shows the IO latencies periodically.
On the stability side, we've found two memory leakage cases in the
error path in compression flow. And, we've also fixed various corner
cases in fiemap, quota, checkpoint=disable, zstd, and so on.
Enhancements:
- avoid long checkpoint latency by releasing nat_tree_lock
- collect and show iostats periodically
- support extent_cache for compressed files
- add a sysfs entry to manage ra_size given fadvise(POSIX_FADV_SEQUENTIAL)
- report f2fs GC status via sysfs
- add discard_unit=%s in mount option to handle zoned device
Bug fixes:
- fix two memory leakages when an error happens in the compressed IO flow
- fix commpress_cache to get the right LBA
- fix fiemap to deal with compressed case correctly
- fix wrong EIO returns due to SBI_NEED_FSCK
- fix missing writes when enabling checkpoint back
- fix quota deadlock
- fix zstd level mount option
In addition to the above major updates, we've cleaned up several code
paths such as dio, unnecessary operations, debugfs/f2fs/status, sanity
check, and typos"
* tag 'f2fs-for-5.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (46 commits)
f2fs: should put a page beyond EOF when preparing a write
f2fs: deallocate compressed pages when error happens
f2fs: enable realtime discard iff device supports discard
f2fs: guarantee to write dirty data when enabling checkpoint back
f2fs: fix to unmap pages from userspace process in punch_hole()
f2fs: fix unexpected ENOENT comes from f2fs_map_blocks()
f2fs: fix to account missing .skipped_gc_rwsem
f2fs: adjust unlock order for cleanup
f2fs: Don't create discard thread when device doesn't support realtime discard
f2fs: rebuild nat_bits during umount
f2fs: introduce periodic iostat io latency traces
f2fs: separate out iostat feature
f2fs: compress: do sanity check on cluster
f2fs: fix description about main_blkaddr node
f2fs: convert S_IRUGO to 0444
f2fs: fix to keep compatibility of fault injection interface
f2fs: support fault injection for f2fs_kmem_cache_alloc()
f2fs: compress: allow write compress released file after truncate to zero
f2fs: correct comment in segment.h
f2fs: improve sbi status info in debugfs/f2fs/status
...
2021-09-04 10:48:47 -07:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2018-07-25 12:11:56 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2012-11-02 17:09:44 +09:00
}
}
return ret ;
}
2016-07-08 17:42:21 -07:00
static int __read_out_blkaddrs ( struct inode * inode , block_t * blkaddr ,
int * do_replace , pgoff_t off , pgoff_t len )
2015-05-06 13:09:46 +08:00
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct dnode_of_data dn ;
2016-07-08 17:42:21 -07:00
int ret , done , i ;
2015-07-16 18:18:11 +08:00
2016-07-08 17:42:21 -07:00
next_dnode :
2015-10-07 12:28:41 -07:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_get_dnode_of_data ( & dn , off , LOOKUP_NODE_RA ) ;
2015-10-07 12:28:41 -07:00
if ( ret & & ret ! = - ENOENT ) {
return ret ;
} else if ( ret = = - ENOENT ) {
2016-07-08 17:42:21 -07:00
if ( dn . max_level = = 0 )
return - ENOENT ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
done = min ( ( pgoff_t ) ADDRS_PER_BLOCK ( inode ) -
dn . ofs_in_node , len ) ;
2016-07-08 17:42:21 -07:00
blkaddr + = done ;
do_replace + = done ;
goto next ;
}
done = min ( ( pgoff_t ) ADDRS_PER_PAGE ( dn . node_page , inode ) -
dn . ofs_in_node , len ) ;
for ( i = 0 ; i < done ; i + + , blkaddr + + , do_replace + + , dn . ofs_in_node + + ) {
2020-02-14 17:44:10 +08:00
* blkaddr = f2fs_data_blkaddr ( & dn ) ;
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
if ( __is_valid_data_blkaddr ( * blkaddr ) & &
! f2fs_is_valid_blkaddr ( sbi , * blkaddr ,
DATA_GENERIC_ENHANCE ) ) {
f2fs_put_dnode ( & dn ) ;
2019-06-20 11:36:14 +08:00
return - EFSCORRUPTED ;
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
}
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
if ( ! f2fs_is_checkpointed_data ( sbi , * blkaddr ) ) {
2016-07-08 17:42:21 -07:00
2020-02-14 17:44:12 +08:00
if ( f2fs_lfs_mode ( sbi ) ) {
2016-07-08 17:42:21 -07:00
f2fs_put_dnode ( & dn ) ;
2019-08-15 19:45:36 +08:00
return - EOPNOTSUPP ;
2016-07-08 17:42:21 -07:00
}
2015-10-07 12:28:41 -07:00
/* do not invalidate this block address */
2016-02-24 17:16:47 +08:00
f2fs_update_data_blkaddr ( & dn , NULL_ADDR ) ;
2016-07-08 17:42:21 -07:00
* do_replace = 1 ;
2015-05-06 13:09:46 +08:00
}
2015-10-07 12:28:41 -07:00
}
2016-07-08 17:42:21 -07:00
f2fs_put_dnode ( & dn ) ;
next :
len - = done ;
off + = done ;
if ( len )
goto next_dnode ;
return 0 ;
}
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
static int __roll_back_blkaddrs ( struct inode * inode , block_t * blkaddr ,
int * do_replace , pgoff_t off , int len )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct dnode_of_data dn ;
int ret , i ;
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
for ( i = 0 ; i < len ; i + + , do_replace + + , blkaddr + + ) {
if ( * do_replace = = 0 )
continue ;
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_get_dnode_of_data ( & dn , off + i , LOOKUP_NODE_RA ) ;
2016-07-08 17:42:21 -07:00
if ( ret ) {
dec_valid_block_count ( sbi , inode , 1 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_invalidate_blocks ( sbi , * blkaddr ) ;
2016-07-08 17:42:21 -07:00
} else {
f2fs_update_data_blkaddr ( & dn , * blkaddr ) ;
2016-06-03 19:29:38 -07:00
}
2016-07-08 17:42:21 -07:00
f2fs_put_dnode ( & dn ) ;
}
return 0 ;
}
static int __clone_blkaddrs ( struct inode * src_inode , struct inode * dst_inode ,
block_t * blkaddr , int * do_replace ,
pgoff_t src , pgoff_t dst , pgoff_t len , bool full )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( src_inode ) ;
pgoff_t i = 0 ;
int ret ;
2016-06-03 19:29:38 -07:00
2016-07-08 17:42:21 -07:00
while ( i < len ) {
if ( blkaddr [ i ] = = NULL_ADDR & & ! full ) {
i + + ;
continue ;
2015-10-07 12:28:41 -07:00
}
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
if ( do_replace [ i ] | | blkaddr [ i ] = = NULL_ADDR ) {
struct dnode_of_data dn ;
struct node_info ni ;
size_t new_size ;
pgoff_t ilen ;
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
set_new_dnode ( & dn , dst_inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_get_dnode_of_data ( & dn , dst + i , ALLOC_NODE ) ;
2016-07-08 17:42:21 -07:00
if ( ret )
return ret ;
2015-05-06 13:09:46 +08:00
2018-07-17 00:02:17 +08:00
ret = f2fs_get_node_info ( sbi , dn . nid , & ni ) ;
if ( ret ) {
f2fs_put_dnode ( & dn ) ;
return ret ;
}
2016-07-08 17:42:21 -07:00
ilen = min ( ( pgoff_t )
ADDRS_PER_PAGE ( dn . node_page , dst_inode ) -
dn . ofs_in_node , len - i ) ;
do {
2020-02-14 17:44:10 +08:00
dn . data_blkaddr = f2fs_data_blkaddr ( & dn ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_truncate_data_blocks_range ( & dn , 1 ) ;
2016-07-08 17:42:21 -07:00
if ( do_replace [ i ] ) {
f2fs_i_blocks_write ( src_inode ,
2017-07-09 00:13:07 +08:00
1 , false , false ) ;
2016-07-08 17:42:21 -07:00
f2fs_i_blocks_write ( dst_inode ,
2017-07-09 00:13:07 +08:00
1 , true , false ) ;
2016-07-08 17:42:21 -07:00
f2fs_replace_block ( sbi , & dn , dn . data_blkaddr ,
blkaddr [ i ] , ni . version , true , false ) ;
do_replace [ i ] = 0 ;
}
dn . ofs_in_node + + ;
i + + ;
2019-11-07 17:29:00 +08:00
new_size = ( loff_t ) ( dst + i ) < < PAGE_SHIFT ;
2016-07-08 17:42:21 -07:00
if ( dst_inode - > i_size < new_size )
f2fs_i_size_write ( dst_inode , new_size ) ;
2016-11-23 10:51:17 -08:00
} while ( - - ilen & & ( do_replace [ i ] | | blkaddr [ i ] = = NULL_ADDR ) ) ;
2015-10-07 12:28:41 -07:00
2016-07-08 17:42:21 -07:00
f2fs_put_dnode ( & dn ) ;
} else {
struct page * psrc , * pdst ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
psrc = f2fs_get_lock_data_page ( src_inode ,
src + i , true ) ;
2016-07-08 17:42:21 -07:00
if ( IS_ERR ( psrc ) )
return PTR_ERR ( psrc ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
pdst = f2fs_get_new_data_page ( dst_inode , NULL , dst + i ,
2016-07-08 17:42:21 -07:00
true ) ;
if ( IS_ERR ( pdst ) ) {
f2fs_put_page ( psrc , 1 ) ;
return PTR_ERR ( pdst ) ;
}
f2fs_copy_page ( psrc , pdst ) ;
set_page_dirty ( pdst ) ;
f2fs_put_page ( pdst , 1 ) ;
2015-10-07 12:28:41 -07:00
f2fs_put_page ( psrc , 1 ) ;
2015-05-06 13:09:46 +08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_truncate_hole ( src_inode ,
src + i , src + i + 1 ) ;
2016-07-08 17:42:21 -07:00
if ( ret )
return ret ;
i + + ;
}
2015-10-07 12:28:41 -07:00
}
return 0 ;
2016-07-08 17:42:21 -07:00
}
2015-05-06 13:09:46 +08:00
2016-07-08 17:42:21 -07:00
static int __exchange_data_block ( struct inode * src_inode ,
struct inode * dst_inode , pgoff_t src , pgoff_t dst ,
2016-07-16 21:59:22 -07:00
pgoff_t len , bool full )
2016-07-08 17:42:21 -07:00
{
block_t * src_blkaddr ;
int * do_replace ;
2016-07-16 21:59:22 -07:00
pgoff_t olen ;
2016-07-08 17:42:21 -07:00
int ret ;
2016-07-16 21:59:22 -07:00
while ( len ) {
2019-03-25 21:08:19 +08:00
olen = min ( ( pgoff_t ) 4 * ADDRS_PER_BLOCK ( src_inode ) , len ) ;
2016-07-08 17:42:21 -07:00
2017-11-30 19:28:18 +08:00
src_blkaddr = f2fs_kvzalloc ( F2FS_I_SB ( src_inode ) ,
treewide: Use array_size in f2fs_kvzalloc()
The f2fs_kvzalloc() function has no 2-factor argument form, so
multiplication factors need to be wrapped in array_size(). This patch
replaces cases of:
f2fs_kvzalloc(handle, a * b, gfp)
with:
f2fs_kvzalloc(handle, array_size(a, b), gfp)
as well as handling cases of:
f2fs_kvzalloc(handle, a * b * c, gfp)
with:
f2fs_kvzalloc(handle, array3_size(a, b, c), gfp)
This does, however, attempt to ignore constant size factors like:
f2fs_kvzalloc(handle, 4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
expression HANDLE;
type TYPE;
expression THING, E;
@@
(
f2fs_kvzalloc(HANDLE,
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
f2fs_kvzalloc(HANDLE,
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression HANDLE;
expression COUNT;
typedef u8;
typedef __u8;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(char) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
expression HANDLE;
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT_ID)
+ array_size(COUNT_ID, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT_ID
+ array_size(COUNT_ID, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT_CONST)
+ array_size(COUNT_CONST, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT_CONST
+ array_size(COUNT_CONST, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT_ID)
+ array_size(COUNT_ID, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT_ID
+ array_size(COUNT_ID, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT_CONST)
+ array_size(COUNT_CONST, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT_CONST
+ array_size(COUNT_CONST, sizeof(THING))
, ...)
)
// 2-factor product, only identifiers.
@@
expression HANDLE;
identifier SIZE, COUNT;
@@
f2fs_kvzalloc(HANDLE,
- SIZE * COUNT
+ array_size(COUNT, SIZE)
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression HANDLE;
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression HANDLE;
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
expression HANDLE;
identifier STRIDE, SIZE, COUNT;
@@
(
f2fs_kvzalloc(HANDLE,
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products
// when they're not all constants...
@@
expression HANDLE;
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
f2fs_kvzalloc(HANDLE, C1 * C2 * C3, ...)
|
f2fs_kvzalloc(HANDLE,
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants.
@@
expression HANDLE;
expression E1, E2;
constant C1, C2;
@@
(
f2fs_kvzalloc(HANDLE, C1 * C2, ...)
|
f2fs_kvzalloc(HANDLE,
- E1 * E2
+ array_size(E1, E2)
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 14:28:35 -07:00
array_size ( olen , sizeof ( block_t ) ) ,
2019-12-03 19:02:15 -08:00
GFP_NOFS ) ;
2016-07-16 21:59:22 -07:00
if ( ! src_blkaddr )
return - ENOMEM ;
2016-07-08 17:42:21 -07:00
2017-11-30 19:28:18 +08:00
do_replace = f2fs_kvzalloc ( F2FS_I_SB ( src_inode ) ,
treewide: Use array_size in f2fs_kvzalloc()
The f2fs_kvzalloc() function has no 2-factor argument form, so
multiplication factors need to be wrapped in array_size(). This patch
replaces cases of:
f2fs_kvzalloc(handle, a * b, gfp)
with:
f2fs_kvzalloc(handle, array_size(a, b), gfp)
as well as handling cases of:
f2fs_kvzalloc(handle, a * b * c, gfp)
with:
f2fs_kvzalloc(handle, array3_size(a, b, c), gfp)
This does, however, attempt to ignore constant size factors like:
f2fs_kvzalloc(handle, 4 * 1024, gfp)
though any constants defined via macros get caught up in the conversion.
Any factors with a sizeof() of "unsigned char", "char", and "u8" were
dropped, since they're redundant.
The Coccinelle script used for this was:
// Fix redundant parens around sizeof().
@@
expression HANDLE;
type TYPE;
expression THING, E;
@@
(
f2fs_kvzalloc(HANDLE,
- (sizeof(TYPE)) * E
+ sizeof(TYPE) * E
, ...)
|
f2fs_kvzalloc(HANDLE,
- (sizeof(THING)) * E
+ sizeof(THING) * E
, ...)
)
// Drop single-byte sizes and redundant parens.
@@
expression HANDLE;
expression COUNT;
typedef u8;
typedef __u8;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(u8) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(__u8) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(char) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(unsigned char) * (COUNT)
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(u8) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(__u8) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(char) * COUNT
+ COUNT
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(unsigned char) * COUNT
+ COUNT
, ...)
)
// 2-factor product with sizeof(type/expression) and identifier or constant.
@@
expression HANDLE;
type TYPE;
expression THING;
identifier COUNT_ID;
constant COUNT_CONST;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT_ID)
+ array_size(COUNT_ID, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT_ID
+ array_size(COUNT_ID, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT_CONST)
+ array_size(COUNT_CONST, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT_CONST
+ array_size(COUNT_CONST, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT_ID)
+ array_size(COUNT_ID, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT_ID
+ array_size(COUNT_ID, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT_CONST)
+ array_size(COUNT_CONST, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT_CONST
+ array_size(COUNT_CONST, sizeof(THING))
, ...)
)
// 2-factor product, only identifiers.
@@
expression HANDLE;
identifier SIZE, COUNT;
@@
f2fs_kvzalloc(HANDLE,
- SIZE * COUNT
+ array_size(COUNT, SIZE)
, ...)
// 3-factor product with 1 sizeof(type) or sizeof(expression), with
// redundant parens removed.
@@
expression HANDLE;
expression THING;
identifier STRIDE, COUNT;
type TYPE;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(TYPE))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT) * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * (COUNT) * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT * (STRIDE)
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING) * COUNT * STRIDE
+ array3_size(COUNT, STRIDE, sizeof(THING))
, ...)
)
// 3-factor product with 2 sizeof(variable), with redundant parens removed.
@@
expression HANDLE;
expression THING1, THING2;
identifier COUNT;
type TYPE1, TYPE2;
@@
(
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(TYPE2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(TYPE2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(THING1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(THING1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * COUNT
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
|
f2fs_kvzalloc(HANDLE,
- sizeof(TYPE1) * sizeof(THING2) * (COUNT)
+ array3_size(COUNT, sizeof(TYPE1), sizeof(THING2))
, ...)
)
// 3-factor product, only identifiers, with redundant parens removed.
@@
expression HANDLE;
identifier STRIDE, SIZE, COUNT;
@@
(
f2fs_kvzalloc(HANDLE,
- (COUNT) * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * (STRIDE) * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * STRIDE * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- (COUNT) * (STRIDE) * (SIZE)
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
|
f2fs_kvzalloc(HANDLE,
- COUNT * STRIDE * SIZE
+ array3_size(COUNT, STRIDE, SIZE)
, ...)
)
// Any remaining multi-factor products, first at least 3-factor products
// when they're not all constants...
@@
expression HANDLE;
expression E1, E2, E3;
constant C1, C2, C3;
@@
(
f2fs_kvzalloc(HANDLE, C1 * C2 * C3, ...)
|
f2fs_kvzalloc(HANDLE,
- E1 * E2 * E3
+ array3_size(E1, E2, E3)
, ...)
)
// And then all remaining 2 factors products when they're not all constants.
@@
expression HANDLE;
expression E1, E2;
constant C1, C2;
@@
(
f2fs_kvzalloc(HANDLE, C1 * C2, ...)
|
f2fs_kvzalloc(HANDLE,
- E1 * E2
+ array_size(E1, E2)
, ...)
)
Signed-off-by: Kees Cook <keescook@chromium.org>
2018-06-12 14:28:35 -07:00
array_size ( olen , sizeof ( int ) ) ,
2019-12-03 19:02:15 -08:00
GFP_NOFS ) ;
2016-07-16 21:59:22 -07:00
if ( ! do_replace ) {
kvfree ( src_blkaddr ) ;
return - ENOMEM ;
}
2016-07-08 17:42:21 -07:00
2016-07-16 21:59:22 -07:00
ret = __read_out_blkaddrs ( src_inode , src_blkaddr ,
do_replace , src , olen ) ;
if ( ret )
goto roll_back ;
2016-07-08 17:42:21 -07:00
2016-07-16 21:59:22 -07:00
ret = __clone_blkaddrs ( src_inode , dst_inode , src_blkaddr ,
do_replace , src , dst , olen , full ) ;
if ( ret )
goto roll_back ;
src + = olen ;
dst + = olen ;
len - = olen ;
kvfree ( src_blkaddr ) ;
kvfree ( do_replace ) ;
}
2016-07-08 17:42:21 -07:00
return 0 ;
roll_back :
2018-05-28 23:47:19 +08:00
__roll_back_blkaddrs ( src_inode , src_blkaddr , do_replace , src , olen ) ;
2016-07-08 17:42:21 -07:00
kvfree ( src_blkaddr ) ;
kvfree ( do_replace ) ;
2015-10-07 12:28:41 -07:00
return ret ;
}
2015-05-06 13:09:46 +08:00
2018-07-25 12:11:56 +09:00
static int f2fs_do_collapse ( struct inode * inode , loff_t offset , loff_t len )
2015-10-07 12:28:41 -07:00
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2019-06-20 16:42:08 +02:00
pgoff_t nrpages = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
2018-07-25 12:11:56 +09:00
pgoff_t start = offset > > PAGE_SHIFT ;
pgoff_t end = ( offset + len ) > > PAGE_SHIFT ;
2016-07-08 17:42:21 -07:00
int ret ;
2015-10-07 12:28:41 -07:00
2016-07-08 17:42:21 -07:00
f2fs_balance_fs ( sbi , true ) ;
2016-07-12 11:07:52 -07:00
2018-07-25 12:11:56 +09:00
/* avoid gc operation during block exchange */
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2016-07-12 11:07:52 -07:00
2018-07-25 12:11:56 +09:00
f2fs_lock_op ( sbi ) ;
f2fs_drop_extent_tree ( inode ) ;
truncate_pagecache ( inode , offset ) ;
2016-07-08 17:42:21 -07:00
ret = __exchange_data_block ( inode , inode , end , start , nrpages - end , true ) ;
f2fs_unlock_op ( sbi ) ;
2018-07-25 12:11:56 +09:00
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2018-07-25 12:11:56 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2015-05-06 13:09:46 +08:00
return ret ;
}
static int f2fs_collapse_range ( struct inode * inode , loff_t offset , loff_t len )
{
loff_t new_size ;
int ret ;
if ( offset + len > = i_size_read ( inode ) )
return - EINVAL ;
/* collapse range should be aligned to block size of f2fs. */
if ( offset & ( F2FS_BLKSIZE - 1 ) | | len & ( F2FS_BLKSIZE - 1 ) )
return - EINVAL ;
2015-12-22 11:09:35 -08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
return ret ;
2015-06-17 13:59:05 -07:00
2015-05-06 13:09:46 +08:00
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range ( inode - > i_mapping , offset , LLONG_MAX ) ;
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-05-06 13:09:46 +08:00
2018-07-25 12:11:56 +09:00
ret = f2fs_do_collapse ( inode , offset , len ) ;
2015-05-06 13:09:46 +08:00
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-05-06 13:09:46 +08:00
2015-10-07 12:28:41 -07:00
/* write out all moved pages, if possible */
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2015-10-07 12:28:41 -07:00
filemap_write_and_wait_range ( inode - > i_mapping , offset , LLONG_MAX ) ;
truncate_pagecache ( inode , offset ) ;
2015-05-06 13:09:46 +08:00
new_size = i_size_read ( inode ) - len ;
2019-02-02 17:33:01 +08:00
ret = f2fs_truncate_blocks ( inode , new_size , true ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2015-05-06 13:09:46 +08:00
if ( ! ret )
2016-05-20 09:22:03 -07:00
f2fs_i_size_write ( inode , new_size ) ;
2015-05-06 13:09:46 +08:00
return ret ;
}
2016-05-09 19:56:31 +08:00
static int f2fs_do_zero_range ( struct dnode_of_data * dn , pgoff_t start ,
pgoff_t end )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( dn - > inode ) ;
pgoff_t index = start ;
unsigned int ofs_in_node = dn - > ofs_in_node ;
blkcnt_t count = 0 ;
int ret ;
for ( ; index < end ; index + + , dn - > ofs_in_node + + ) {
2020-02-14 17:44:10 +08:00
if ( f2fs_data_blkaddr ( dn ) = = NULL_ADDR )
2016-05-09 19:56:31 +08:00
count + + ;
}
dn - > ofs_in_node = ofs_in_node ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_reserve_new_blocks ( dn , count ) ;
2016-05-09 19:56:31 +08:00
if ( ret )
return ret ;
dn - > ofs_in_node = ofs_in_node ;
for ( index = start ; index < end ; index + + , dn - > ofs_in_node + + ) {
2020-02-14 17:44:10 +08:00
dn - > data_blkaddr = f2fs_data_blkaddr ( dn ) ;
2016-05-09 19:56:31 +08:00
/*
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
* f2fs_reserve_new_blocks will not guarantee entire block
2016-05-09 19:56:31 +08:00
* allocation .
*/
if ( dn - > data_blkaddr = = NULL_ADDR ) {
ret = - ENOSPC ;
break ;
}
if ( dn - > data_blkaddr ! = NEW_ADDR ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_invalidate_blocks ( sbi , dn - > data_blkaddr ) ;
2016-05-09 19:56:31 +08:00
dn - > data_blkaddr = NEW_ADDR ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_set_data_blkaddr ( dn ) ;
2016-05-09 19:56:31 +08:00
}
}
f2fs_update_extent_cache_range ( dn , start , 0 , index - start ) ;
return ret ;
}
2015-05-06 13:11:13 +08:00
static int f2fs_zero_range ( struct inode * inode , loff_t offset , loff_t len ,
int mode )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct address_space * mapping = inode - > i_mapping ;
pgoff_t index , pg_start , pg_end ;
loff_t new_size = i_size_read ( inode ) ;
loff_t off_start , off_end ;
int ret = 0 ;
ret = inode_newsize_ok ( inode , ( len + offset ) ) ;
if ( ret )
return ret ;
2015-12-22 11:09:35 -08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
return ret ;
2015-05-06 13:11:13 +08:00
ret = filemap_write_and_wait_range ( mapping , offset , offset + len - 1 ) ;
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-05-06 13:11:13 +08:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pg_start = ( ( unsigned long long ) offset ) > > PAGE_SHIFT ;
pg_end = ( ( unsigned long long ) offset + len ) > > PAGE_SHIFT ;
2015-05-06 13:11:13 +08:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
off_start = offset & ( PAGE_SIZE - 1 ) ;
off_end = ( offset + len ) & ( PAGE_SIZE - 1 ) ;
2015-05-06 13:11:13 +08:00
if ( pg_start = = pg_end ) {
2015-08-07 18:36:06 +08:00
ret = fill_zero ( inode , pg_start , off_start ,
off_end - off_start ) ;
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-08-07 18:36:06 +08:00
2015-05-06 13:11:13 +08:00
new_size = max_t ( loff_t , new_size , offset + len ) ;
} else {
if ( off_start ) {
2015-08-07 18:36:06 +08:00
ret = fill_zero ( inode , pg_start + + , off_start ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
PAGE_SIZE - off_start ) ;
2015-08-07 18:36:06 +08:00
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-08-07 18:36:06 +08:00
2015-05-06 13:11:13 +08:00
new_size = max_t ( loff_t , new_size ,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
( loff_t ) pg_start < < PAGE_SHIFT ) ;
2015-05-06 13:11:13 +08:00
}
2016-05-09 19:56:31 +08:00
for ( index = pg_start ; index < pg_end ; ) {
2015-05-06 13:11:13 +08:00
struct dnode_of_data dn ;
2016-05-09 19:56:31 +08:00
unsigned int end_offset ;
pgoff_t end ;
2015-05-06 13:11:13 +08:00
2018-08-05 23:02:22 +08:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( mapping ) ;
2018-08-05 23:02:22 +08:00
truncate_pagecache_range ( inode ,
( loff_t ) index < < PAGE_SHIFT ,
( ( loff_t ) pg_end < < PAGE_SHIFT ) - 1 ) ;
2015-05-06 13:11:13 +08:00
f2fs_lock_op ( sbi ) ;
2016-05-09 19:56:31 +08:00
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_get_dnode_of_data ( & dn , index , ALLOC_NODE ) ;
2015-05-06 13:11:13 +08:00
if ( ret ) {
f2fs_unlock_op ( sbi ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2018-08-05 23:02:22 +08:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2015-05-06 13:11:13 +08:00
goto out ;
}
2016-05-09 19:56:31 +08:00
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
end = min ( pg_end , end_offset - dn . ofs_in_node + index ) ;
ret = f2fs_do_zero_range ( & dn , index , end ) ;
2015-05-06 13:11:13 +08:00
f2fs_put_dnode ( & dn ) ;
2018-08-05 23:02:22 +08:00
2015-05-06 13:11:13 +08:00
f2fs_unlock_op ( sbi ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2018-08-05 23:02:22 +08:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2016-10-11 22:57:02 +08:00
f2fs_balance_fs ( sbi , dn . node_changed ) ;
2016-05-09 19:56:31 +08:00
if ( ret )
goto out ;
2015-05-06 13:11:13 +08:00
2016-05-09 19:56:31 +08:00
index = end ;
2015-05-06 13:11:13 +08:00
new_size = max_t ( loff_t , new_size ,
2016-05-09 19:56:31 +08:00
( loff_t ) index < < PAGE_SHIFT ) ;
2015-05-06 13:11:13 +08:00
}
if ( off_end ) {
2015-08-07 18:36:06 +08:00
ret = fill_zero ( inode , pg_end , 0 , off_end ) ;
if ( ret )
goto out ;
2015-05-06 13:11:13 +08:00
new_size = max_t ( loff_t , new_size , offset + len ) ;
}
}
out :
2018-02-25 23:38:21 +08:00
if ( new_size > i_size_read ( inode ) ) {
if ( mode & FALLOC_FL_KEEP_SIZE )
file_set_keep_isize ( inode ) ;
else
f2fs_i_size_write ( inode , new_size ) ;
}
2015-05-06 13:11:13 +08:00
return ret ;
}
2015-05-28 19:16:57 +08:00
static int f2fs_insert_range ( struct inode * inode , loff_t offset , loff_t len )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2021-04-13 18:10:37 +02:00
struct address_space * mapping = inode - > i_mapping ;
2016-07-08 17:42:21 -07:00
pgoff_t nr , pg_start , pg_end , delta , idx ;
2015-05-28 19:16:57 +08:00
loff_t new_size ;
2015-10-07 12:28:41 -07:00
int ret = 0 ;
2015-05-28 19:16:57 +08:00
new_size = i_size_read ( inode ) + len ;
2017-03-10 17:54:52 +08:00
ret = inode_newsize_ok ( inode , new_size ) ;
if ( ret )
return ret ;
2015-05-28 19:16:57 +08:00
if ( offset > = i_size_read ( inode ) )
return - EINVAL ;
/* insert range should be aligned to block size of f2fs. */
if ( offset & ( F2FS_BLKSIZE - 1 ) | | len & ( F2FS_BLKSIZE - 1 ) )
return - EINVAL ;
2015-12-22 11:09:35 -08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
return ret ;
2015-06-17 13:59:05 -07:00
2016-01-07 14:15:04 -08:00
f2fs_balance_fs ( sbi , true ) ;
2015-12-22 13:23:35 -08:00
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( mapping ) ;
2019-02-02 17:33:01 +08:00
ret = f2fs_truncate_blocks ( inode , i_size_read ( inode ) , true ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2015-05-28 19:16:57 +08:00
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-05-28 19:16:57 +08:00
/* write out all dirty pages from offset */
2021-04-13 18:10:37 +02:00
ret = filemap_write_and_wait_range ( mapping , offset , LLONG_MAX ) ;
2015-05-28 19:16:57 +08:00
if ( ret )
2018-07-25 12:11:56 +09:00
return ret ;
2015-05-28 19:16:57 +08:00
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pg_start = offset > > PAGE_SHIFT ;
pg_end = ( offset + len ) > > PAGE_SHIFT ;
2015-05-28 19:16:57 +08:00
delta = pg_end - pg_start ;
2019-06-20 16:42:08 +02:00
idx = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
2016-07-08 17:42:21 -07:00
2018-07-25 12:11:56 +09:00
/* avoid gc operation during block exchange */
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( mapping ) ;
2018-07-25 12:11:56 +09:00
truncate_pagecache ( inode , offset ) ;
2016-07-08 17:42:21 -07:00
while ( ! ret & & idx > pg_start ) {
nr = idx - pg_start ;
if ( nr > delta )
nr = delta ;
idx - = nr ;
2015-05-28 19:16:57 +08:00
f2fs_lock_op ( sbi ) ;
2016-07-12 11:07:52 -07:00
f2fs_drop_extent_tree ( inode ) ;
2016-07-08 17:42:21 -07:00
ret = __exchange_data_block ( inode , inode , idx ,
idx + delta , nr , false ) ;
2015-05-28 19:16:57 +08:00
f2fs_unlock_op ( sbi ) ;
}
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2018-07-25 12:11:56 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2015-05-28 19:16:57 +08:00
2015-10-07 12:28:41 -07:00
/* write out all moved pages, if possible */
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( mapping ) ;
filemap_write_and_wait_range ( mapping , offset , LLONG_MAX ) ;
2015-10-07 12:28:41 -07:00
truncate_pagecache ( inode , offset ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2015-10-07 12:28:41 -07:00
if ( ! ret )
2016-05-20 09:22:03 -07:00
f2fs_i_size_write ( inode , new_size ) ;
2015-05-28 19:16:57 +08:00
return ret ;
}
2012-11-02 17:09:44 +09:00
static int expand_inode_data ( struct inode * inode , loff_t offset ,
loff_t len , int mode )
{
2014-09-02 15:31:18 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2017-11-28 09:23:00 +09:00
struct f2fs_map_blocks map = { . m_next_pgofs = NULL ,
2018-11-13 14:33:45 +08:00
. m_next_extent = NULL , . m_seg_type = NO_CHECK_TYPE ,
. m_may_create = true } ;
2021-03-24 11:24:33 +08:00
pgoff_t pg_start , pg_end ;
2012-11-02 17:09:44 +09:00
loff_t new_size = i_size_read ( inode ) ;
2016-05-06 15:30:38 -07:00
loff_t off_end ;
2021-03-24 11:24:33 +08:00
block_t expanded = 0 ;
2016-11-11 16:31:56 -08:00
int err ;
2012-11-02 17:09:44 +09:00
2016-11-11 16:31:56 -08:00
err = inode_newsize_ok ( inode , ( len + offset ) ) ;
if ( err )
return err ;
2012-11-02 17:09:44 +09:00
2016-11-11 16:31:56 -08:00
err = f2fs_convert_inline_inode ( inode ) ;
if ( err )
return err ;
2013-12-27 12:28:59 +09:00
2016-01-07 14:15:04 -08:00
f2fs_balance_fs ( sbi , true ) ;
2015-12-22 13:23:35 -08:00
2021-03-24 11:24:33 +08:00
pg_start = ( ( unsigned long long ) offset ) > > PAGE_SHIFT ;
2016-05-06 15:30:38 -07:00
pg_end = ( ( unsigned long long ) offset + len ) > > PAGE_SHIFT ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
off_end = ( offset + len ) & ( PAGE_SIZE - 1 ) ;
2012-11-02 17:09:44 +09:00
2021-03-24 11:24:33 +08:00
map . m_lblk = pg_start ;
map . m_len = pg_end - pg_start ;
2016-05-06 15:30:38 -07:00
if ( off_end )
map . m_len + + ;
2014-06-13 13:05:55 +09:00
2019-10-18 10:06:40 -07:00
if ( ! map . m_len )
return 0 ;
if ( f2fs_is_pinned_file ( inode ) ) {
2021-03-05 17:56:01 +08:00
block_t sec_blks = BLKS_PER_SEC ( sbi ) ;
block_t sec_len = roundup ( map . m_len , sec_blks ) ;
2019-10-18 10:06:40 -07:00
2021-03-05 17:56:01 +08:00
map . m_len = sec_blks ;
2019-10-18 10:06:40 -07:00
next_alloc :
if ( has_not_enough_free_secs ( sbi , 0 ,
GET_SEC_FROM_SEG ( sbi , overprovision_segments ( sbi ) ) ) ) {
2020-01-14 19:36:50 +08:00
down_write ( & sbi - > gc_lock ) ;
2021-02-20 17:35:40 +08:00
err = f2fs_gc ( sbi , true , false , false , NULL_SEGNO ) ;
2019-10-18 10:06:40 -07:00
if ( err & & err ! = - ENODATA & & err ! = - EAGAIN )
goto out_err ;
}
down_write ( & sbi - > pin_sem ) ;
2020-05-27 13:02:31 +09:00
f2fs_lock_op ( sbi ) ;
2021-04-21 09:54:55 +08:00
f2fs_allocate_new_section ( sbi , CURSEG_COLD_DATA_PINNED , false ) ;
2020-05-27 13:02:31 +09:00
f2fs_unlock_op ( sbi ) ;
f2fs: introduce inmem curseg
Previous implementation of aligned pinfile allocation will:
- allocate new segment on cold data log no matter whether last used
segment is partially used or not, it makes IOs more random;
- force concurrent cold data/GCed IO going into warm data area, it
can make a bad effect on hot/cold data separation;
In this patch, we introduce a new type of log named 'inmem curseg',
the differents from normal curseg is:
- it reuses existed segment type (CURSEG_XXX_NODE/DATA);
- it only exists in memory, its segno, blkofs, summary will not b
persisted into checkpoint area;
With this new feature, we can enhance scalability of log, special
allocators can be created for purposes:
- pure lfs allocator for aligned pinfile allocation or file
defragmentation
- pure ssr allocator for later feature
So that, let's update aligned pinfile allocation to use this new
inmem curseg fwk.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 21:14:45 +08:00
map . m_seg_type = CURSEG_COLD_DATA_PINNED ;
2019-10-18 10:06:40 -07:00
err = f2fs_map_blocks ( inode , & map , 1 , F2FS_GET_BLOCK_PRE_DIO ) ;
2021-11-12 14:31:16 -08:00
file_dont_truncate ( inode ) ;
f2fs: introduce inmem curseg
Previous implementation of aligned pinfile allocation will:
- allocate new segment on cold data log no matter whether last used
segment is partially used or not, it makes IOs more random;
- force concurrent cold data/GCed IO going into warm data area, it
can make a bad effect on hot/cold data separation;
In this patch, we introduce a new type of log named 'inmem curseg',
the differents from normal curseg is:
- it reuses existed segment type (CURSEG_XXX_NODE/DATA);
- it only exists in memory, its segno, blkofs, summary will not b
persisted into checkpoint area;
With this new feature, we can enhance scalability of log, special
allocators can be created for purposes:
- pure lfs allocator for aligned pinfile allocation or file
defragmentation
- pure ssr allocator for later feature
So that, let's update aligned pinfile allocation to use this new
inmem curseg fwk.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2020-08-04 21:14:45 +08:00
2019-10-18 10:06:40 -07:00
up_write ( & sbi - > pin_sem ) ;
2019-06-26 18:23:05 -07:00
2021-03-24 11:24:33 +08:00
expanded + = map . m_len ;
2021-03-05 17:56:01 +08:00
sec_len - = map . m_len ;
2019-10-18 10:06:40 -07:00
map . m_lblk + = map . m_len ;
2021-03-05 17:56:01 +08:00
if ( ! err & & sec_len )
2019-10-18 10:06:40 -07:00
goto next_alloc ;
2021-03-24 11:24:33 +08:00
map . m_len = expanded ;
2019-10-18 10:06:40 -07:00
} else {
err = f2fs_map_blocks ( inode , & map , 1 , F2FS_GET_BLOCK_PRE_AIO ) ;
2021-03-24 11:24:33 +08:00
expanded = map . m_len ;
2019-10-18 10:06:40 -07:00
}
out_err :
2016-11-11 16:31:56 -08:00
if ( err ) {
2016-05-06 15:30:38 -07:00
pgoff_t last_off ;
2012-11-02 17:09:44 +09:00
2021-03-24 11:24:33 +08:00
if ( ! expanded )
2016-11-11 16:31:56 -08:00
return err ;
2014-06-13 13:07:31 +09:00
2021-03-24 11:24:33 +08:00
last_off = pg_start + expanded - 1 ;
2016-05-06 15:30:38 -07:00
/* update new size to the failed position */
2018-05-30 04:34:58 +09:00
new_size = ( last_off = = pg_end ) ? offset + len :
2016-05-06 15:30:38 -07:00
( loff_t ) ( last_off + 1 ) < < PAGE_SHIFT ;
} else {
new_size = ( ( loff_t ) pg_end < < PAGE_SHIFT ) + off_end ;
2012-11-02 17:09:44 +09:00
}
2017-11-05 21:53:30 +08:00
if ( new_size > i_size_read ( inode ) ) {
if ( mode & FALLOC_FL_KEEP_SIZE )
file_set_keep_isize ( inode ) ;
else
f2fs_i_size_write ( inode , new_size ) ;
}
2012-11-02 17:09:44 +09:00
2016-11-11 16:31:56 -08:00
return err ;
2012-11-02 17:09:44 +09:00
}
static long f2fs_fallocate ( struct file * file , int mode ,
loff_t offset , loff_t len )
{
2013-02-27 16:59:05 -05:00
struct inode * inode = file_inode ( file ) ;
2015-04-21 15:59:12 +09:00
long ret = 0 ;
2012-11-02 17:09:44 +09:00
2017-10-23 23:48:49 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) )
return - EIO ;
2019-08-23 17:58:36 +08:00
if ( ! f2fs_is_checkpoint_ready ( F2FS_I_SB ( inode ) ) )
return - ENOSPC ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( ! f2fs_is_compress_backend_ready ( inode ) )
return - EOPNOTSUPP ;
2017-10-23 23:48:49 +02:00
2015-09-11 14:39:02 +08:00
/* f2fs only support ->fallocate for regular file */
if ( ! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
2018-12-12 15:20:11 +05:30
if ( IS_ENCRYPTED ( inode ) & &
2015-05-28 19:16:57 +08:00
( mode & ( FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE ) ) )
2015-04-21 20:39:58 -07:00
return - EOPNOTSUPP ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( f2fs_compressed_file ( inode ) & &
( mode & ( FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE ) ) )
return - EOPNOTSUPP ;
2015-05-06 13:09:46 +08:00
if ( mode & ~ ( FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
2015-05-28 19:16:57 +08:00
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
FALLOC_FL_INSERT_RANGE ) )
2012-11-02 17:09:44 +09:00
return - EOPNOTSUPP ;
2016-01-22 15:40:57 -05:00
inode_lock ( inode ) ;
2014-01-28 10:29:26 +08:00
2015-04-21 15:59:12 +09:00
if ( mode & FALLOC_FL_PUNCH_HOLE ) {
if ( offset > = inode - > i_size )
goto out ;
2013-11-22 16:52:50 +08:00
ret = punch_hole ( inode , offset , len ) ;
2015-05-06 13:09:46 +08:00
} else if ( mode & FALLOC_FL_COLLAPSE_RANGE ) {
ret = f2fs_collapse_range ( inode , offset , len ) ;
2015-05-06 13:11:13 +08:00
} else if ( mode & FALLOC_FL_ZERO_RANGE ) {
ret = f2fs_zero_range ( inode , offset , len , mode ) ;
2015-05-28 19:16:57 +08:00
} else if ( mode & FALLOC_FL_INSERT_RANGE ) {
ret = f2fs_insert_range ( inode , offset , len ) ;
2015-05-06 13:09:46 +08:00
} else {
2012-11-02 17:09:44 +09:00
ret = expand_inode_data ( inode , offset , len , mode ) ;
2015-05-06 13:09:46 +08:00
}
2012-11-02 17:09:44 +09:00
2012-12-30 14:52:37 +09:00
if ( ! ret ) {
2016-09-14 07:48:04 -07:00
inode - > i_mtime = inode - > i_ctime = current_time ( inode ) ;
2016-10-14 11:51:23 -07:00
f2fs_mark_inode_dirty_sync ( inode , false ) ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2012-12-30 14:52:37 +09:00
}
2014-01-28 10:29:26 +08:00
2015-04-21 15:59:12 +09:00
out :
2016-01-22 15:40:57 -05:00
inode_unlock ( inode ) ;
2014-01-28 10:29:26 +08:00
2013-04-23 17:00:52 +09:00
trace_f2fs_fallocate ( inode , mode , offset , len , ret ) ;
2012-11-02 17:09:44 +09:00
return ret ;
}
2014-12-09 06:08:59 -08:00
static int f2fs_release_file ( struct inode * inode , struct file * filp )
{
2016-04-11 11:51:51 -07:00
/*
* f2fs_relase_file is called at every close calls . So we should
* not drop any inmemory pages by close called by other process .
*/
if ( ! ( filp - > f_mode & FMODE_WRITE ) | |
atomic_read ( & inode - > i_writecount ) ! = 1 )
return 0 ;
2014-12-09 06:08:59 -08:00
/* some remained atomic pages should discarded */
if ( f2fs_is_atomic_file ( inode ) )
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_drop_inmem_pages ( inode ) ;
2014-12-09 06:08:59 -08:00
if ( f2fs_is_volatile_file ( inode ) ) {
2016-05-20 10:13:22 -07:00
set_inode_flag ( inode , FI_DROP_CACHE ) ;
2014-12-09 06:08:59 -08:00
filemap_fdatawrite ( inode - > i_mapping ) ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_DROP_CACHE ) ;
2018-06-04 23:20:51 +08:00
clear_inode_flag ( inode , FI_VOLATILE_FILE ) ;
stat_dec_volatile_write ( inode ) ;
2014-12-09 06:08:59 -08:00
}
return 0 ;
}
2017-07-24 19:46:29 -07:00
static int f2fs_file_flush ( struct file * file , fl_owner_t id )
2012-11-02 17:09:44 +09:00
{
2017-07-24 19:46:29 -07:00
struct inode * inode = file_inode ( file ) ;
/*
* If the process doing a transaction is crashed , we should do
* roll - back . Otherwise , other reader / write can see corrupted database
* until all the writers close its file . Since this should be done
* before dropping file lock , it needs to do in - > flush .
*/
if ( f2fs_is_atomic_file ( inode ) & &
F2FS_I ( inode ) - > inmem_task = = current )
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_drop_inmem_pages ( inode ) ;
2017-07-24 19:46:29 -07:00
return 0 ;
2012-11-02 17:09:44 +09:00
}
2019-06-04 22:59:04 -07:00
static int f2fs_setflags_common ( struct inode * inode , u32 iflags , u32 mask )
2017-07-29 00:32:52 +08:00
{
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
2020-03-05 15:20:26 -08:00
u32 masked_flags = fi - > i_flags & mask ;
2021-05-06 12:11:14 -07:00
/* mask can be shrunk by flags_valid selector */
iflags & = mask ;
2017-07-29 00:32:52 +08:00
/* Is it quota file? Do not allow user to mess with it */
if ( IS_NOQUOTA ( inode ) )
return - EPERM ;
2020-03-05 15:20:26 -08:00
if ( ( iflags ^ masked_flags ) & F2FS_CASEFOLD_FL ) {
f2fs: Support case-insensitive file name lookups
Modeled after commit b886ee3e778e ("ext4: Support case-insensitive file
name lookups")
"""
This patch implements the actual support for case-insensitive file name
lookups in f2fs, based on the feature bit and the encoding stored in the
superblock.
A filesystem that has the casefold feature set is able to configure
directories with the +F (F2FS_CASEFOLD_FL) attribute, enabling lookups
to succeed in that directory in a case-insensitive fashion, i.e: match
a directory entry even if the name used by userspace is not a byte per
byte match with the disk name, but is an equivalent case-insensitive
version of the Unicode string. This operation is called a
case-insensitive file name lookup.
The feature is configured as an inode attribute applied to directories
and inherited by its children. This attribute can only be enabled on
empty directories for filesystems that support the encoding feature,
thus preventing collision of file names that only differ by case.
* dcache handling:
For a +F directory, F2Fs only stores the first equivalent name dentry
used in the dcache. This is done to prevent unintentional duplication of
dentries in the dcache, while also allowing the VFS code to quickly find
the right entry in the cache despite which equivalent string was used in
a previous lookup, without having to resort to ->lookup().
d_hash() of casefolded directories is implemented as the hash of the
casefolded string, such that we always have a well-known bucket for all
the equivalencies of the same string. d_compare() uses the
utf8_strncasecmp() infrastructure, which handles the comparison of
equivalent, same case, names as well.
For now, negative lookups are not inserted in the dcache, since they
would need to be invalidated anyway, because we can't trust missing file
dentries. This is bad for performance but requires some leveraging of
the vfs layer to fix. We can live without that for now, and so does
everyone else.
* on-disk data:
Despite using a specific version of the name as the internal
representation within the dcache, the name stored and fetched from the
disk is a byte-per-byte match with what the user requested, making this
implementation 'name-preserving'. i.e. no actual information is lost
when writing to storage.
DX is supported by modifying the hashes used in +F directories to make
them case/encoding-aware. The new disk hashes are calculated as the
hash of the full casefolded string, instead of the string directly.
This allows us to efficiently search for file names in the htree without
requiring the user to provide an exact name.
* Dealing with invalid sequences:
By default, when a invalid UTF-8 sequence is identified, ext4 will treat
it as an opaque byte sequence, ignoring the encoding and reverting to
the old behavior for that unique file. This means that case-insensitive
file name lookup will not work only for that file. An optional bit can
be set in the superblock telling the filesystem code and userspace tools
to enforce the encoding. When that optional bit is set, any attempt to
create a file name using an invalid UTF-8 sequence will fail and return
an error to userspace.
* Normalization algorithm:
The UTF-8 algorithms used to compare strings in f2fs is implemented
in fs/unicode, and is based on a previous version developed by
SGI. It implements the Canonical decomposition (NFD) algorithm
described by the Unicode specification 12.1, or higher, combined with
the elimination of ignorable code points (NFDi) and full
case-folding (CF) as documented in fs/unicode/utf8_norm.c.
NFD seems to be the best normalization method for F2FS because:
- It has a lower cost than NFC/NFKC (which requires
decomposing to NFD as an intermediary step)
- It doesn't eliminate important semantic meaning like
compatibility decompositions.
Although:
- This implementation is not completely linguistic accurate, because
different languages have conflicting rules, which would require the
specialization of the filesystem to a given locale, which brings all
sorts of problems for removable media and for users who use more than
one language.
"""
Signed-off-by: Daniel Rosenberg <drosen@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-23 16:05:29 -07:00
if ( ! f2fs_sb_has_casefold ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
if ( ! f2fs_empty_dir ( inode ) )
return - ENOTEMPTY ;
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( iflags & ( F2FS_COMPR_FL | F2FS_NOCOMP_FL ) ) {
if ( ! f2fs_sb_has_compression ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
if ( ( iflags & F2FS_COMPR_FL ) & & ( iflags & F2FS_NOCOMP_FL ) )
return - EINVAL ;
}
2020-03-05 15:20:26 -08:00
if ( ( iflags ^ masked_flags ) & F2FS_COMPR_FL ) {
2020-03-27 18:29:51 +08:00
if ( masked_flags & F2FS_COMPR_FL ) {
2020-09-08 11:44:11 +09:00
if ( ! f2fs_disable_compressed_file ( inode ) )
2020-03-10 20:50:09 +08:00
return - EINVAL ;
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( iflags & F2FS_NOCOMP_FL )
return - EINVAL ;
if ( iflags & F2FS_COMPR_FL ) {
if ( ! f2fs_may_compress ( inode ) )
return - EINVAL ;
2020-09-18 11:03:49 +08:00
if ( S_ISREG ( inode - > i_mode ) & & inode - > i_size )
return - EINVAL ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
set_compress_context ( inode ) ;
}
}
2020-03-05 15:20:26 -08:00
if ( ( iflags ^ masked_flags ) & F2FS_NOCOMP_FL ) {
if ( masked_flags & F2FS_COMPR_FL )
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
return - EINVAL ;
}
2019-07-01 13:26:30 -07:00
fi - > i_flags = iflags | ( fi - > i_flags & ~ mask ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
f2fs_bug_on ( F2FS_I_SB ( inode ) , ( fi - > i_flags & F2FS_COMPR_FL ) & &
( fi - > i_flags & F2FS_NOCOMP_FL ) ) ;
2017-07-29 00:32:52 +08:00
2018-04-03 15:08:17 +08:00
if ( fi - > i_flags & F2FS_PROJINHERIT_FL )
2017-07-29 00:32:52 +08:00
set_inode_flag ( inode , FI_PROJ_INHERIT ) ;
else
clear_inode_flag ( inode , FI_PROJ_INHERIT ) ;
inode - > i_ctime = current_time ( inode ) ;
f2fs_set_inode_flags ( inode ) ;
2018-12-18 19:20:17 +08:00
f2fs_mark_inode_dirty_sync ( inode , true ) ;
2017-07-29 00:32:52 +08:00
return 0 ;
}
2021-04-07 14:36:43 +02:00
/* FS_IOC_[GS]ETFLAGS and FS_IOC_FS[GS]ETXATTR support */
2019-06-04 22:59:04 -07:00
/*
* To make a new on - disk f2fs i_flag gettable via FS_IOC_GETFLAGS , add an entry
* for it to f2fs_fsflags_map [ ] , and add its FS_ * _FL equivalent to
* F2FS_GETTABLE_FS_FL . To also make it settable via FS_IOC_SETFLAGS , also add
* its FS_ * _FL equivalent to F2FS_SETTABLE_FS_FL .
2021-04-07 14:36:43 +02:00
*
* Translating flags to fsx_flags value used by FS_IOC_FSGETXATTR and
* FS_IOC_FSSETXATTR is done by the VFS .
2019-06-04 22:59:04 -07:00
*/
static const struct {
u32 iflag ;
u32 fsflag ;
} f2fs_fsflags_map [ ] = {
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
{ F2FS_COMPR_FL , FS_COMPR_FL } ,
2019-06-04 22:59:04 -07:00
{ F2FS_SYNC_FL , FS_SYNC_FL } ,
{ F2FS_IMMUTABLE_FL , FS_IMMUTABLE_FL } ,
{ F2FS_APPEND_FL , FS_APPEND_FL } ,
{ F2FS_NODUMP_FL , FS_NODUMP_FL } ,
{ F2FS_NOATIME_FL , FS_NOATIME_FL } ,
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
{ F2FS_NOCOMP_FL , FS_NOCOMP_FL } ,
2019-06-04 22:59:04 -07:00
{ F2FS_INDEX_FL , FS_INDEX_FL } ,
{ F2FS_DIRSYNC_FL , FS_DIRSYNC_FL } ,
{ F2FS_PROJINHERIT_FL , FS_PROJINHERIT_FL } ,
f2fs: Support case-insensitive file name lookups
Modeled after commit b886ee3e778e ("ext4: Support case-insensitive file
name lookups")
"""
This patch implements the actual support for case-insensitive file name
lookups in f2fs, based on the feature bit and the encoding stored in the
superblock.
A filesystem that has the casefold feature set is able to configure
directories with the +F (F2FS_CASEFOLD_FL) attribute, enabling lookups
to succeed in that directory in a case-insensitive fashion, i.e: match
a directory entry even if the name used by userspace is not a byte per
byte match with the disk name, but is an equivalent case-insensitive
version of the Unicode string. This operation is called a
case-insensitive file name lookup.
The feature is configured as an inode attribute applied to directories
and inherited by its children. This attribute can only be enabled on
empty directories for filesystems that support the encoding feature,
thus preventing collision of file names that only differ by case.
* dcache handling:
For a +F directory, F2Fs only stores the first equivalent name dentry
used in the dcache. This is done to prevent unintentional duplication of
dentries in the dcache, while also allowing the VFS code to quickly find
the right entry in the cache despite which equivalent string was used in
a previous lookup, without having to resort to ->lookup().
d_hash() of casefolded directories is implemented as the hash of the
casefolded string, such that we always have a well-known bucket for all
the equivalencies of the same string. d_compare() uses the
utf8_strncasecmp() infrastructure, which handles the comparison of
equivalent, same case, names as well.
For now, negative lookups are not inserted in the dcache, since they
would need to be invalidated anyway, because we can't trust missing file
dentries. This is bad for performance but requires some leveraging of
the vfs layer to fix. We can live without that for now, and so does
everyone else.
* on-disk data:
Despite using a specific version of the name as the internal
representation within the dcache, the name stored and fetched from the
disk is a byte-per-byte match with what the user requested, making this
implementation 'name-preserving'. i.e. no actual information is lost
when writing to storage.
DX is supported by modifying the hashes used in +F directories to make
them case/encoding-aware. The new disk hashes are calculated as the
hash of the full casefolded string, instead of the string directly.
This allows us to efficiently search for file names in the htree without
requiring the user to provide an exact name.
* Dealing with invalid sequences:
By default, when a invalid UTF-8 sequence is identified, ext4 will treat
it as an opaque byte sequence, ignoring the encoding and reverting to
the old behavior for that unique file. This means that case-insensitive
file name lookup will not work only for that file. An optional bit can
be set in the superblock telling the filesystem code and userspace tools
to enforce the encoding. When that optional bit is set, any attempt to
create a file name using an invalid UTF-8 sequence will fail and return
an error to userspace.
* Normalization algorithm:
The UTF-8 algorithms used to compare strings in f2fs is implemented
in fs/unicode, and is based on a previous version developed by
SGI. It implements the Canonical decomposition (NFD) algorithm
described by the Unicode specification 12.1, or higher, combined with
the elimination of ignorable code points (NFDi) and full
case-folding (CF) as documented in fs/unicode/utf8_norm.c.
NFD seems to be the best normalization method for F2FS because:
- It has a lower cost than NFC/NFKC (which requires
decomposing to NFD as an intermediary step)
- It doesn't eliminate important semantic meaning like
compatibility decompositions.
Although:
- This implementation is not completely linguistic accurate, because
different languages have conflicting rules, which would require the
specialization of the filesystem to a given locale, which brings all
sorts of problems for removable media and for users who use more than
one language.
"""
Signed-off-by: Daniel Rosenberg <drosen@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-23 16:05:29 -07:00
{ F2FS_CASEFOLD_FL , FS_CASEFOLD_FL } ,
2019-06-04 22:59:04 -07:00
} ;
# define F2FS_GETTABLE_FS_FL ( \
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
FS_COMPR_FL | \
2019-06-04 22:59:04 -07:00
FS_SYNC_FL | \
FS_IMMUTABLE_FL | \
FS_APPEND_FL | \
FS_NODUMP_FL | \
FS_NOATIME_FL | \
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
FS_NOCOMP_FL | \
2019-06-04 22:59:04 -07:00
FS_INDEX_FL | \
FS_DIRSYNC_FL | \
FS_PROJINHERIT_FL | \
FS_ENCRYPT_FL | \
FS_INLINE_DATA_FL | \
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
FS_NOCOW_FL | \
f2fs-for-5.4-rc1
In this round, we introduced casefolding support in f2fs, and fixed various bugs
in individual features such as IO alignment, checkpoint=disable, quota, and
swapfile.
Enhancement:
- support casefolding w/ enhancement in ext4
- support fiemap for directory
- support FS_IO_GET|SET_FSLABEL
Bug fix:
- fix IO stuck during checkpoint=disable
- avoid infinite GC loop
- fix panic/overflow related to IO alignment feature
- fix livelock in swap file
- fix discard command leak
- disallow dio for atomic_write
-----BEGIN PGP SIGNATURE-----
iQIzBAABCAAdFiEE00UqedjCtOrGVvQiQBSofoJIUNIFAl2FL1wACgkQQBSofoJI
UNIRdg/+M6QNiAazIbqzPXyUUkyZQYR5YKhu2abd49o/g38xjTq1afH0PZpQyDrA
9RncR4xsW8F241vPLVoCSanfaa+MxN6xHi3TrD8zYtZWxOcPF6v1ETHeUXGTHuJ2
gqlk+mm+CnY02M6rxW7XwixuXwttT3bF9+cf1YBWRpNoVrR+SjNqgeJS7FmJwXKd
nGKb+94OxuygL1NUop+LDUo3qRQjc0Sxv/7qj/K4lhqgTjhAxMYT2KvUP/1MZ7U0
Kh9WIayDXnpoioxMPnt4VEb+JgXfLLFELvQzNjwulk15GIweuJzwVYCBXcRoX0cK
eRBRmRy/kRp/e0R1gvl3kYrXQC2AC5QTlBVH/0ESwnaukFiUBKB509vH4aqE/vpB
Krldjfg+uMHkc7XiNBf1boDp713vJ76iRKUDWoVb6H/sPbdJ+jtrnUNeBP8CVpWh
u31SY1MppnmKhhsoCHQRbhbXO/Z29imBQgF9Tm3IFWImyLY3IU40vFj2fR15gJkL
X3x/HWxQynSqyqEOwAZrvhCRTvBAIGIVy5292Di1RkqIoh8saxcqiaywgLz1+eVE
0DCOoh8R6sSbfN/EEh+yZqTxmjo0VGVTw30XVI6QEo4cY5Vfc9u6dN6SRWVRvbjb
kPb3dKcMrttgbn3fcXU8Jbw1AOor9N6afHaqs0swQJyci2RwJyc=
=oonf
-----END PGP SIGNATURE-----
Merge tag 'f2fs-for-5.4' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs
Pull f2fs updates from Jaegeuk Kim:
"In this round, we introduced casefolding support in f2fs, and fixed
various bugs in individual features such as IO alignment,
checkpoint=disable, quota, and swapfile.
Enhancement:
- support casefolding w/ enhancement in ext4
- support fiemap for directory
- support FS_IO_GET|SET_FSLABEL
Bug fix:
- fix IO stuck during checkpoint=disable
- avoid infinite GC loop
- fix panic/overflow related to IO alignment feature
- fix livelock in swap file
- fix discard command leak
- disallow dio for atomic_write"
* tag 'f2fs-for-5.4' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs: (51 commits)
f2fs: add a condition to detect overflow in f2fs_ioc_gc_range()
f2fs: fix to add missing F2FS_IO_ALIGNED() condition
f2fs: fix to fallback to buffered IO in IO aligned mode
f2fs: fix to handle error path correctly in f2fs_map_blocks
f2fs: fix extent corrupotion during directIO in LFS mode
f2fs: check all the data segments against all node ones
f2fs: Add a small clarification to CONFIG_FS_F2FS_FS_SECURITY
f2fs: fix inode rwsem regression
f2fs: fix to avoid accessing uninitialized field of inode page in is_alive()
f2fs: avoid infinite GC loop due to stale atomic files
f2fs: Fix indefinite loop in f2fs_gc()
f2fs: convert inline_data in prior to i_size_write
f2fs: fix error path of f2fs_convert_inline_page()
f2fs: add missing documents of reserve_root/resuid/resgid
f2fs: fix flushing node pages when checkpoint is disabled
f2fs: enhance f2fs_is_checkpoint_ready()'s readability
f2fs: clean up __bio_alloc()'s parameter
f2fs: fix wrong error injection path in inc_valid_block_count()
f2fs: fix to writeout dirty inode during node flush
f2fs: optimize case-insensitive lookups
...
2019-09-21 14:26:33 -07:00
FS_VERITY_FL | \
f2fs: Support case-insensitive file name lookups
Modeled after commit b886ee3e778e ("ext4: Support case-insensitive file
name lookups")
"""
This patch implements the actual support for case-insensitive file name
lookups in f2fs, based on the feature bit and the encoding stored in the
superblock.
A filesystem that has the casefold feature set is able to configure
directories with the +F (F2FS_CASEFOLD_FL) attribute, enabling lookups
to succeed in that directory in a case-insensitive fashion, i.e: match
a directory entry even if the name used by userspace is not a byte per
byte match with the disk name, but is an equivalent case-insensitive
version of the Unicode string. This operation is called a
case-insensitive file name lookup.
The feature is configured as an inode attribute applied to directories
and inherited by its children. This attribute can only be enabled on
empty directories for filesystems that support the encoding feature,
thus preventing collision of file names that only differ by case.
* dcache handling:
For a +F directory, F2Fs only stores the first equivalent name dentry
used in the dcache. This is done to prevent unintentional duplication of
dentries in the dcache, while also allowing the VFS code to quickly find
the right entry in the cache despite which equivalent string was used in
a previous lookup, without having to resort to ->lookup().
d_hash() of casefolded directories is implemented as the hash of the
casefolded string, such that we always have a well-known bucket for all
the equivalencies of the same string. d_compare() uses the
utf8_strncasecmp() infrastructure, which handles the comparison of
equivalent, same case, names as well.
For now, negative lookups are not inserted in the dcache, since they
would need to be invalidated anyway, because we can't trust missing file
dentries. This is bad for performance but requires some leveraging of
the vfs layer to fix. We can live without that for now, and so does
everyone else.
* on-disk data:
Despite using a specific version of the name as the internal
representation within the dcache, the name stored and fetched from the
disk is a byte-per-byte match with what the user requested, making this
implementation 'name-preserving'. i.e. no actual information is lost
when writing to storage.
DX is supported by modifying the hashes used in +F directories to make
them case/encoding-aware. The new disk hashes are calculated as the
hash of the full casefolded string, instead of the string directly.
This allows us to efficiently search for file names in the htree without
requiring the user to provide an exact name.
* Dealing with invalid sequences:
By default, when a invalid UTF-8 sequence is identified, ext4 will treat
it as an opaque byte sequence, ignoring the encoding and reverting to
the old behavior for that unique file. This means that case-insensitive
file name lookup will not work only for that file. An optional bit can
be set in the superblock telling the filesystem code and userspace tools
to enforce the encoding. When that optional bit is set, any attempt to
create a file name using an invalid UTF-8 sequence will fail and return
an error to userspace.
* Normalization algorithm:
The UTF-8 algorithms used to compare strings in f2fs is implemented
in fs/unicode, and is based on a previous version developed by
SGI. It implements the Canonical decomposition (NFD) algorithm
described by the Unicode specification 12.1, or higher, combined with
the elimination of ignorable code points (NFDi) and full
case-folding (CF) as documented in fs/unicode/utf8_norm.c.
NFD seems to be the best normalization method for F2FS because:
- It has a lower cost than NFC/NFKC (which requires
decomposing to NFD as an intermediary step)
- It doesn't eliminate important semantic meaning like
compatibility decompositions.
Although:
- This implementation is not completely linguistic accurate, because
different languages have conflicting rules, which would require the
specialization of the filesystem to a given locale, which brings all
sorts of problems for removable media and for users who use more than
one language.
"""
Signed-off-by: Daniel Rosenberg <drosen@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-23 16:05:29 -07:00
FS_CASEFOLD_FL )
2019-06-04 22:59:04 -07:00
# define F2FS_SETTABLE_FS_FL ( \
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
FS_COMPR_FL | \
2019-06-04 22:59:04 -07:00
FS_SYNC_FL | \
FS_IMMUTABLE_FL | \
FS_APPEND_FL | \
FS_NODUMP_FL | \
FS_NOATIME_FL | \
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
FS_NOCOMP_FL | \
2019-06-04 22:59:04 -07:00
FS_DIRSYNC_FL | \
f2fs: Support case-insensitive file name lookups
Modeled after commit b886ee3e778e ("ext4: Support case-insensitive file
name lookups")
"""
This patch implements the actual support for case-insensitive file name
lookups in f2fs, based on the feature bit and the encoding stored in the
superblock.
A filesystem that has the casefold feature set is able to configure
directories with the +F (F2FS_CASEFOLD_FL) attribute, enabling lookups
to succeed in that directory in a case-insensitive fashion, i.e: match
a directory entry even if the name used by userspace is not a byte per
byte match with the disk name, but is an equivalent case-insensitive
version of the Unicode string. This operation is called a
case-insensitive file name lookup.
The feature is configured as an inode attribute applied to directories
and inherited by its children. This attribute can only be enabled on
empty directories for filesystems that support the encoding feature,
thus preventing collision of file names that only differ by case.
* dcache handling:
For a +F directory, F2Fs only stores the first equivalent name dentry
used in the dcache. This is done to prevent unintentional duplication of
dentries in the dcache, while also allowing the VFS code to quickly find
the right entry in the cache despite which equivalent string was used in
a previous lookup, without having to resort to ->lookup().
d_hash() of casefolded directories is implemented as the hash of the
casefolded string, such that we always have a well-known bucket for all
the equivalencies of the same string. d_compare() uses the
utf8_strncasecmp() infrastructure, which handles the comparison of
equivalent, same case, names as well.
For now, negative lookups are not inserted in the dcache, since they
would need to be invalidated anyway, because we can't trust missing file
dentries. This is bad for performance but requires some leveraging of
the vfs layer to fix. We can live without that for now, and so does
everyone else.
* on-disk data:
Despite using a specific version of the name as the internal
representation within the dcache, the name stored and fetched from the
disk is a byte-per-byte match with what the user requested, making this
implementation 'name-preserving'. i.e. no actual information is lost
when writing to storage.
DX is supported by modifying the hashes used in +F directories to make
them case/encoding-aware. The new disk hashes are calculated as the
hash of the full casefolded string, instead of the string directly.
This allows us to efficiently search for file names in the htree without
requiring the user to provide an exact name.
* Dealing with invalid sequences:
By default, when a invalid UTF-8 sequence is identified, ext4 will treat
it as an opaque byte sequence, ignoring the encoding and reverting to
the old behavior for that unique file. This means that case-insensitive
file name lookup will not work only for that file. An optional bit can
be set in the superblock telling the filesystem code and userspace tools
to enforce the encoding. When that optional bit is set, any attempt to
create a file name using an invalid UTF-8 sequence will fail and return
an error to userspace.
* Normalization algorithm:
The UTF-8 algorithms used to compare strings in f2fs is implemented
in fs/unicode, and is based on a previous version developed by
SGI. It implements the Canonical decomposition (NFD) algorithm
described by the Unicode specification 12.1, or higher, combined with
the elimination of ignorable code points (NFDi) and full
case-folding (CF) as documented in fs/unicode/utf8_norm.c.
NFD seems to be the best normalization method for F2FS because:
- It has a lower cost than NFC/NFKC (which requires
decomposing to NFD as an intermediary step)
- It doesn't eliminate important semantic meaning like
compatibility decompositions.
Although:
- This implementation is not completely linguistic accurate, because
different languages have conflicting rules, which would require the
specialization of the filesystem to a given locale, which brings all
sorts of problems for removable media and for users who use more than
one language.
"""
Signed-off-by: Daniel Rosenberg <drosen@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-07-23 16:05:29 -07:00
FS_PROJINHERIT_FL | \
FS_CASEFOLD_FL )
2019-06-04 22:59:04 -07:00
/* Convert f2fs on-disk i_flags to FS_IOC_{GET,SET}FLAGS flags */
static inline u32 f2fs_iflags_to_fsflags ( u32 iflags )
{
u32 fsflags = 0 ;
int i ;
for ( i = 0 ; i < ARRAY_SIZE ( f2fs_fsflags_map ) ; i + + )
if ( iflags & f2fs_fsflags_map [ i ] . iflag )
fsflags | = f2fs_fsflags_map [ i ] . fsflag ;
return fsflags ;
}
/* Convert FS_IOC_{GET,SET}FLAGS flags to f2fs on-disk i_flags */
static inline u32 f2fs_fsflags_to_iflags ( u32 fsflags )
{
u32 iflags = 0 ;
int i ;
for ( i = 0 ; i < ARRAY_SIZE ( f2fs_fsflags_map ) ; i + + )
if ( fsflags & f2fs_fsflags_map [ i ] . fsflag )
iflags | = f2fs_fsflags_map [ i ] . iflag ;
return iflags ;
}
2015-01-23 20:36:04 +08:00
static int f2fs_ioc_getversion ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
return put_user ( inode - > i_generation , ( int __user * ) arg ) ;
}
2014-10-06 17:39:50 -07:00
static int f2fs_ioc_start_atomic_write ( struct file * filp )
{
struct inode * inode = file_inode ( filp ) ;
f2fs: avoid infinite GC loop due to stale atomic files
If committing atomic pages is failed when doing f2fs_do_sync_file(), we can
get commited pages but atomic_file being still set like:
- inmem: 0, atomic IO: 4 (Max. 10), volatile IO: 0 (Max. 0)
If GC selects this block, we can get an infinite loop like this:
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
In that moment, we can observe:
[Before]
Try to move 5084219 blocks (BG: 384508)
- data blocks : 4962373 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4534686 (10)
[After]
Try to move 5088973 blocks (BG: 384508)
- data blocks : 4967127 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4539440 (10)
So, refactor atomic_write flow like this:
1. start_atomic_write
- add inmem_list and set atomic_file
2. write()
- register it in inmem_pages
3. commit_atomic_write
- if no error, f2fs_drop_inmem_pages()
- f2fs_commit_inmme_pages() failed
: __revoked_inmem_pages() was done
- f2fs_do_sync_file failed
: abort_atomic_write later
4. abort_atomic_write
- f2fs_drop_inmem_pages
5. f2fs_drop_inmem_pages
- clear atomic_file
- remove inmem_list
Based on this change, when GC fails to move block in atomic_file,
f2fs_drop_inmem_pages_all() can call f2fs_drop_inmem_pages().
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-09-09 13:10:59 +01:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2015-07-17 18:06:35 +08:00
int ret ;
2014-10-06 17:39:50 -07:00
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-10-06 17:39:50 -07:00
return - EACCES ;
2017-03-17 10:04:15 +08:00
if ( ! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
2019-07-25 22:39:11 +08:00
if ( filp - > f_flags & O_DIRECT )
return - EINVAL ;
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2016-05-09 19:56:33 +08:00
inode_lock ( inode ) ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
f2fs_disable_compressed_file ( inode ) ;
2018-07-27 18:15:11 +09:00
if ( f2fs_is_atomic_file ( inode ) ) {
if ( is_inode_flag_set ( inode , FI_ATOMIC_REVOKE_REQUEST ) )
ret = - EINVAL ;
2016-05-09 19:56:32 +08:00
goto out ;
2018-07-27 18:15:11 +09:00
}
2014-10-06 17:39:50 -07:00
2015-07-17 18:06:35 +08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
2016-05-09 19:56:32 +08:00
goto out ;
2014-10-06 17:39:50 -07:00
2018-07-25 12:11:56 +09:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2018-12-28 11:00:38 -08:00
/*
* Should wait end_io to count F2FS_WB_CP_DATA correctly by
* f2fs_is_atomic_file .
*/
if ( get_dirty_pages ( inode ) )
2019-06-18 17:48:42 +08:00
f2fs_warn ( F2FS_I_SB ( inode ) , " Unexpected flush for atomic writes: ino=%lu, npages=%u " ,
inode - > i_ino , get_dirty_pages ( inode ) ) ;
2016-04-12 14:36:11 -07:00
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 , LLONG_MAX ) ;
2018-07-25 12:11:56 +09:00
if ( ret ) {
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2017-03-18 09:20:55 +08:00
goto out ;
2018-07-25 12:11:56 +09:00
}
2018-12-28 11:00:38 -08:00
f2fs: avoid infinite GC loop due to stale atomic files
If committing atomic pages is failed when doing f2fs_do_sync_file(), we can
get commited pages but atomic_file being still set like:
- inmem: 0, atomic IO: 4 (Max. 10), volatile IO: 0 (Max. 0)
If GC selects this block, we can get an infinite loop like this:
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
In that moment, we can observe:
[Before]
Try to move 5084219 blocks (BG: 384508)
- data blocks : 4962373 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4534686 (10)
[After]
Try to move 5088973 blocks (BG: 384508)
- data blocks : 4967127 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4539440 (10)
So, refactor atomic_write flow like this:
1. start_atomic_write
- add inmem_list and set atomic_file
2. write()
- register it in inmem_pages
3. commit_atomic_write
- if no error, f2fs_drop_inmem_pages()
- f2fs_commit_inmme_pages() failed
: __revoked_inmem_pages() was done
- f2fs_do_sync_file failed
: abort_atomic_write later
4. abort_atomic_write
- f2fs_drop_inmem_pages
5. f2fs_drop_inmem_pages
- clear atomic_file
- remove inmem_list
Based on this change, when GC fails to move block in atomic_file,
f2fs_drop_inmem_pages_all() can call f2fs_drop_inmem_pages().
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-09-09 13:10:59 +01:00
spin_lock ( & sbi - > inode_lock [ ATOMIC_FILE ] ) ;
if ( list_empty ( & fi - > inmem_ilist ) )
list_add_tail ( & fi - > inmem_ilist , & sbi - > inode_list [ ATOMIC_FILE ] ) ;
2019-11-13 16:01:03 +05:30
sbi - > atomic_files + + ;
f2fs: avoid infinite GC loop due to stale atomic files
If committing atomic pages is failed when doing f2fs_do_sync_file(), we can
get commited pages but atomic_file being still set like:
- inmem: 0, atomic IO: 4 (Max. 10), volatile IO: 0 (Max. 0)
If GC selects this block, we can get an infinite loop like this:
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
In that moment, we can observe:
[Before]
Try to move 5084219 blocks (BG: 384508)
- data blocks : 4962373 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4534686 (10)
[After]
Try to move 5088973 blocks (BG: 384508)
- data blocks : 4967127 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4539440 (10)
So, refactor atomic_write flow like this:
1. start_atomic_write
- add inmem_list and set atomic_file
2. write()
- register it in inmem_pages
3. commit_atomic_write
- if no error, f2fs_drop_inmem_pages()
- f2fs_commit_inmme_pages() failed
: __revoked_inmem_pages() was done
- f2fs_do_sync_file failed
: abort_atomic_write later
4. abort_atomic_write
- f2fs_drop_inmem_pages
5. f2fs_drop_inmem_pages
- clear atomic_file
- remove inmem_list
Based on this change, when GC fails to move block in atomic_file,
f2fs_drop_inmem_pages_all() can call f2fs_drop_inmem_pages().
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-09-09 13:10:59 +01:00
spin_unlock ( & sbi - > inode_lock [ ATOMIC_FILE ] ) ;
/* add inode in inmem_list first and set atomic_file */
2018-04-18 11:06:39 +08:00
set_inode_flag ( inode , FI_ATOMIC_FILE ) ;
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
clear_inode_flag ( inode , FI_ATOMIC_REVOKE_REQUEST ) ;
2018-07-25 12:11:56 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2017-03-18 09:20:55 +08:00
2018-07-25 12:11:56 +09:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2017-07-24 19:46:29 -07:00
F2FS_I ( inode ) - > inmem_task = current ;
2016-12-28 13:55:09 -08:00
stat_update_max_atomic_write ( inode ) ;
2017-03-18 09:20:55 +08:00
out :
2016-05-09 19:56:33 +08:00
inode_unlock ( inode ) ;
2016-05-09 19:56:32 +08:00
mnt_drop_write_file ( filp ) ;
2016-04-12 14:36:11 -07:00
return ret ;
2014-10-06 17:39:50 -07:00
}
static int f2fs_ioc_commit_atomic_write ( struct file * filp )
{
struct inode * inode = file_inode ( filp ) ;
int ret ;
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-10-06 17:39:50 -07:00
return - EACCES ;
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2018-07-25 12:11:56 +09:00
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
2016-05-09 19:56:33 +08:00
2018-07-25 12:11:56 +09:00
inode_lock ( inode ) ;
2018-02-27 22:45:24 +08:00
2018-04-18 17:45:02 +08:00
if ( f2fs_is_volatile_file ( inode ) ) {
ret = - EINVAL ;
2016-05-09 19:56:32 +08:00
goto err_out ;
2018-04-18 17:45:02 +08:00
}
2016-05-09 19:56:32 +08:00
2015-07-25 00:29:17 -07:00
if ( f2fs_is_atomic_file ( inode ) ) {
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ret = f2fs_commit_inmem_pages ( inode ) ;
2017-01-07 18:50:26 +08:00
if ( ret )
2015-07-25 00:52:52 -07:00
goto err_out ;
2017-01-07 18:50:26 +08:00
2016-12-28 13:55:09 -08:00
ret = f2fs_do_sync_file ( filp , 0 , LLONG_MAX , 0 , true ) ;
f2fs: avoid infinite GC loop due to stale atomic files
If committing atomic pages is failed when doing f2fs_do_sync_file(), we can
get commited pages but atomic_file being still set like:
- inmem: 0, atomic IO: 4 (Max. 10), volatile IO: 0 (Max. 0)
If GC selects this block, we can get an infinite loop like this:
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
f2fs_submit_page_bio: dev = (253,7), ino = 2, page_index = 0x2359a8, oldaddr = 0x2359a8, newaddr = 0x2359a8, rw = READ(), type = COLD_DATA
f2fs_submit_read_bio: dev = (253,7)/(253,7), rw = READ(), DATA, sector = 18533696, size = 4096
f2fs_get_victim: dev = (253,7), type = No TYPE, policy = (Foreground GC, LFS-mode, Greedy), victim = 4355, cost = 1, ofs_unit = 1, pre_victim_secno = 4355, prefree = 0, free = 234
f2fs_iget: dev = (253,7), ino = 6247, pino = 5845, i_mode = 0x81b0, i_size = 319488, i_nlink = 1, i_blocks = 624, i_advise = 0x2c
In that moment, we can observe:
[Before]
Try to move 5084219 blocks (BG: 384508)
- data blocks : 4962373 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4534686 (10)
[After]
Try to move 5088973 blocks (BG: 384508)
- data blocks : 4967127 (274483)
- node blocks : 121846 (110025)
Skipped : atomic write 4539440 (10)
So, refactor atomic_write flow like this:
1. start_atomic_write
- add inmem_list and set atomic_file
2. write()
- register it in inmem_pages
3. commit_atomic_write
- if no error, f2fs_drop_inmem_pages()
- f2fs_commit_inmme_pages() failed
: __revoked_inmem_pages() was done
- f2fs_do_sync_file failed
: abort_atomic_write later
4. abort_atomic_write
- f2fs_drop_inmem_pages
5. f2fs_drop_inmem_pages
- clear atomic_file
- remove inmem_list
Based on this change, when GC fails to move block in atomic_file,
f2fs_drop_inmem_pages_all() can call f2fs_drop_inmem_pages().
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-09-09 13:10:59 +01:00
if ( ! ret )
f2fs_drop_inmem_pages ( inode ) ;
2016-12-28 13:55:09 -08:00
} else {
2017-08-23 18:23:25 +08:00
ret = f2fs_do_sync_file ( filp , 0 , LLONG_MAX , 1 , false ) ;
2015-07-25 00:29:17 -07:00
}
2015-07-25 00:52:52 -07:00
err_out :
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
if ( is_inode_flag_set ( inode , FI_ATOMIC_REVOKE_REQUEST ) ) {
clear_inode_flag ( inode , FI_ATOMIC_REVOKE_REQUEST ) ;
ret = - EINVAL ;
}
2016-05-09 19:56:33 +08:00
inode_unlock ( inode ) ;
2014-10-06 17:39:50 -07:00
mnt_drop_write_file ( filp ) ;
return ret ;
}
2014-10-06 16:11:16 -07:00
static int f2fs_ioc_start_volatile_write ( struct file * filp )
{
struct inode * inode = file_inode ( filp ) ;
2015-07-17 18:06:35 +08:00
int ret ;
2014-10-06 16:11:16 -07:00
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-10-06 16:11:16 -07:00
return - EACCES ;
2017-03-17 15:43:57 +08:00
if ( ! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2016-05-09 19:56:33 +08:00
inode_lock ( inode ) ;
2014-12-09 06:08:59 -08:00
if ( f2fs_is_volatile_file ( inode ) )
2016-05-09 19:56:32 +08:00
goto out ;
2014-12-09 06:08:59 -08:00
2015-07-17 18:06:35 +08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
2016-05-09 19:56:32 +08:00
goto out ;
2014-10-23 19:48:09 -07:00
2017-03-22 17:23:45 +08:00
stat_inc_volatile_write ( inode ) ;
stat_update_max_volatile_write ( inode ) ;
2016-05-20 10:13:22 -07:00
set_inode_flag ( inode , FI_VOLATILE_FILE ) ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2016-05-09 19:56:32 +08:00
out :
2016-05-09 19:56:33 +08:00
inode_unlock ( inode ) ;
2016-05-09 19:56:32 +08:00
mnt_drop_write_file ( filp ) ;
return ret ;
2014-10-06 16:11:16 -07:00
}
2014-12-09 06:08:59 -08:00
static int f2fs_ioc_release_volatile_write ( struct file * filp )
{
struct inode * inode = file_inode ( filp ) ;
2016-05-09 19:56:32 +08:00
int ret ;
2014-12-09 06:08:59 -08:00
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-12-09 06:08:59 -08:00
return - EACCES ;
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2016-05-09 19:56:33 +08:00
inode_lock ( inode ) ;
2014-12-09 06:08:59 -08:00
if ( ! f2fs_is_volatile_file ( inode ) )
2016-05-09 19:56:32 +08:00
goto out ;
2014-12-09 06:08:59 -08:00
2016-05-09 19:56:32 +08:00
if ( ! f2fs_is_first_block_written ( inode ) ) {
ret = truncate_partial_data_page ( inode , 0 , true ) ;
goto out ;
}
2015-03-17 17:16:35 -07:00
2016-05-09 19:56:32 +08:00
ret = punch_hole ( inode , 0 , F2FS_BLKSIZE ) ;
out :
2016-05-09 19:56:33 +08:00
inode_unlock ( inode ) ;
2016-05-09 19:56:32 +08:00
mnt_drop_write_file ( filp ) ;
return ret ;
2014-12-09 06:08:59 -08:00
}
static int f2fs_ioc_abort_volatile_write ( struct file * filp )
{
struct inode * inode = file_inode ( filp ) ;
int ret ;
2021-01-21 14:19:25 +01:00
if ( ! inode_owner_or_capable ( & init_user_ns , inode ) )
2014-12-09 06:08:59 -08:00
return - EACCES ;
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2016-05-09 19:56:33 +08:00
inode_lock ( inode ) ;
2016-04-11 13:15:10 -07:00
if ( f2fs_is_atomic_file ( inode ) )
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_drop_inmem_pages ( inode ) ;
2015-12-29 15:46:33 -08:00
if ( f2fs_is_volatile_file ( inode ) ) {
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_VOLATILE_FILE ) ;
2017-03-22 17:23:45 +08:00
stat_dec_volatile_write ( inode ) ;
2016-04-15 09:43:17 -07:00
ret = f2fs_do_sync_file ( filp , 0 , LLONG_MAX , 0 , true ) ;
2015-12-29 15:46:33 -08:00
}
2015-06-08 17:51:10 -07:00
2018-07-27 18:15:11 +09:00
clear_inode_flag ( inode , FI_ATOMIC_REVOKE_REQUEST ) ;
2016-05-09 19:56:33 +08:00
inode_unlock ( inode ) ;
2014-12-09 06:08:59 -08:00
mnt_drop_write_file ( filp ) ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2014-12-09 06:08:59 -08:00
return ret ;
}
2015-01-08 19:15:53 -08:00
static int f2fs_ioc_shutdown ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct super_block * sb = sbi - > sb ;
__u32 in ;
2018-06-20 13:39:53 +03:00
int ret = 0 ;
2015-01-08 19:15:53 -08:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( get_user ( in , ( __u32 __user * ) arg ) )
return - EFAULT ;
2018-05-18 11:51:52 +05:30
if ( in ! = F2FS_GOING_DOWN_FULLSYNC ) {
ret = mnt_want_write_file ( filp ) ;
2020-06-08 20:03:16 +08:00
if ( ret ) {
if ( ret = = - EROFS ) {
ret = 0 ;
f2fs_stop_checkpoint ( sbi , false ) ;
set_sbi_flag ( sbi , SBI_IS_SHUTDOWN ) ;
trace_f2fs_shutdown ( sbi , in , ret ) ;
}
2018-05-18 11:51:52 +05:30
return ret ;
2020-06-08 20:03:16 +08:00
}
2018-05-18 11:51:52 +05:30
}
2016-05-09 19:56:32 +08:00
2015-01-08 19:15:53 -08:00
switch ( in ) {
case F2FS_GOING_DOWN_FULLSYNC :
2020-11-24 11:54:06 +01:00
ret = freeze_bdev ( sb - > s_bdev ) ;
if ( ret )
2018-01-17 22:28:52 +08:00
goto out ;
2020-11-24 11:54:06 +01:00
f2fs_stop_checkpoint ( sbi , false ) ;
set_sbi_flag ( sbi , SBI_IS_SHUTDOWN ) ;
thaw_bdev ( sb - > s_bdev ) ;
2015-01-08 19:15:53 -08:00
break ;
case F2FS_GOING_DOWN_METASYNC :
/* do checkpoint only */
2018-01-17 22:28:52 +08:00
ret = f2fs_sync_fs ( sb , 1 ) ;
if ( ret )
goto out ;
2016-05-18 14:07:56 -07:00
f2fs_stop_checkpoint ( sbi , false ) ;
2018-06-21 13:46:23 -07:00
set_sbi_flag ( sbi , SBI_IS_SHUTDOWN ) ;
2015-01-08 19:15:53 -08:00
break ;
case F2FS_GOING_DOWN_NOSYNC :
2016-05-18 14:07:56 -07:00
f2fs_stop_checkpoint ( sbi , false ) ;
2018-06-21 13:46:23 -07:00
set_sbi_flag ( sbi , SBI_IS_SHUTDOWN ) ;
2015-01-08 19:15:53 -08:00
break ;
2015-10-07 09:46:37 -07:00
case F2FS_GOING_DOWN_METAFLUSH :
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_sync_meta_pages ( sbi , META , LONG_MAX , FS_META_IO ) ;
2016-05-18 14:07:56 -07:00
f2fs_stop_checkpoint ( sbi , false ) ;
2018-06-21 13:46:23 -07:00
set_sbi_flag ( sbi , SBI_IS_SHUTDOWN ) ;
2015-10-07 09:46:37 -07:00
break ;
2018-11-28 13:26:03 -08:00
case F2FS_GOING_DOWN_NEED_FSCK :
set_sbi_flag ( sbi , SBI_NEED_FSCK ) ;
2019-01-24 17:48:38 -08:00
set_sbi_flag ( sbi , SBI_CP_DISABLED_QUICK ) ;
set_sbi_flag ( sbi , SBI_IS_DIRTY ) ;
2018-11-28 13:26:03 -08:00
/* do checkpoint only */
ret = f2fs_sync_fs ( sb , 1 ) ;
2019-01-24 17:48:38 -08:00
goto out ;
2015-01-08 19:15:53 -08:00
default :
2016-05-09 19:56:32 +08:00
ret = - EINVAL ;
goto out ;
2015-01-08 19:15:53 -08:00
}
2018-01-18 17:23:29 +08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_stop_gc_thread ( sbi ) ;
f2fs_stop_discard_thread ( sbi ) ;
2018-01-18 17:23:29 +08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
f2fs_drop_discard_cmd ( sbi ) ;
2018-01-18 17:23:29 +08:00
clear_opt ( sbi , DISCARD ) ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( sbi , REQ_TIME ) ;
2016-05-09 19:56:32 +08:00
out :
2018-05-18 11:51:52 +05:30
if ( in ! = F2FS_GOING_DOWN_FULLSYNC )
mnt_drop_write_file ( filp ) ;
2019-02-26 19:01:15 +08:00
trace_f2fs_shutdown ( sbi , in , ret ) ;
2016-05-09 19:56:32 +08:00
return ret ;
2015-01-08 19:15:53 -08:00
}
2014-09-24 15:37:02 -07:00
static int f2fs_ioc_fitrim ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct super_block * sb = inode - > i_sb ;
struct request_queue * q = bdev_get_queue ( sb - > s_bdev ) ;
struct fstrim_range range ;
int ret ;
2014-09-20 22:06:39 -07:00
2014-09-24 15:37:02 -07:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2014-09-20 22:06:39 -07:00
f2fs: fix to avoid NULL pointer dereference on se->discard_map
https://bugzilla.kernel.org/show_bug.cgi?id=200951
These is a NULL pointer dereference issue reported in bugzilla:
Hi,
in the setup there is a SATA SSD connected to a SATA-to-USB bridge.
The disc is "Samsung SSD 850 PRO 256G" which supports TRIM.
There are four partitions:
sda1: FAT /boot
sda2: F2FS /
sda3: F2FS /home
sda4: F2FS
The bridge is ASMT1153e which uses the "uas" driver.
There is no TRIM pass-through, so, when mounting it reports:
mounting with "discard" option, but the device does not support discard
The USB host is USB3.0 and UASP capable. It is the one on RK3399.
Given this everything works fine, except there is no TRIM support.
In order to enable TRIM a new UDEV rule is added [1]:
/etc/udev/rules.d/10-sata-bridge-trim.rules:
ACTION=="add|change", ATTRS{idVendor}=="174c", ATTRS{idProduct}=="55aa", SUBSYSTEM=="scsi_disk", ATTR{provisioning_mode}="unmap"
After reboot any F2FS write hangs forever and dmesg reports:
Unable to handle kernel NULL pointer dereference
Also tested on a x86_64 system: works fine even with TRIM enabled.
same disc
same bridge
different usb host controller
different cpu architecture
not root filesystem
Regards,
Vicenç.
[1] Post #5 in https://bbs.archlinux.org/viewtopic.php?id=236280
Unable to handle kernel NULL pointer dereference at virtual address 000000000000003e
Mem abort info:
ESR = 0x96000004
Exception class = DABT (current EL), IL = 32 bits
SET = 0, FnV = 0
EA = 0, S1PTW = 0
Data abort info:
ISV = 0, ISS = 0x00000004
CM = 0, WnR = 0
user pgtable: 4k pages, 48-bit VAs, pgdp = 00000000626e3122
[000000000000003e] pgd=0000000000000000
Internal error: Oops: 96000004 [#1] SMP
Modules linked in: overlay snd_soc_hdmi_codec rc_cec dw_hdmi_i2s_audio dw_hdmi_cec snd_soc_simple_card snd_soc_simple_card_utils snd_soc_rockchip_i2s rockchip_rga snd_soc_rockchip_pcm rockchipdrm videobuf2_dma_sg v4l2_mem2mem rtc_rk808 videobuf2_memops analogix_dp videobuf2_v4l2 videobuf2_common dw_hdmi dw_wdt cec rc_core videodev drm_kms_helper media drm rockchip_thermal rockchip_saradc realtek drm_panel_orientation_quirks syscopyarea sysfillrect sysimgblt fb_sys_fops dwmac_rk stmmac_platform stmmac pwm_bl squashfs loop crypto_user gpio_keys hid_kensington
CPU: 5 PID: 957 Comm: nvim Not tainted 4.19.0-rc1-1-ARCH #1
Hardware name: Sapphire-RK3399 Board (DT)
pstate: 00000005 (nzcv daif -PAN -UAO)
pc : update_sit_entry+0x304/0x4b0
lr : update_sit_entry+0x108/0x4b0
sp : ffff00000ca13bd0
x29: ffff00000ca13bd0 x28: 000000000000003e
x27: 0000000000000020 x26: 0000000000080000
x25: 0000000000000048 x24: ffff8000ebb85cf8
x23: 0000000000000253 x22: 00000000ffffffff
x21: 00000000000535f2 x20: 00000000ffffffdf
x19: ffff8000eb9e6800 x18: ffff8000eb9e6be8
x17: 0000000007ce6926 x16: 000000001c83ffa8
x15: 0000000000000000 x14: ffff8000f602df90
x13: 0000000000000006 x12: 0000000000000040
x11: 0000000000000228 x10: 0000000000000000
x9 : 0000000000000000 x8 : 0000000000000000
x7 : 00000000000535f2 x6 : ffff8000ebff3440
x5 : ffff8000ebff3440 x4 : ffff8000ebe3a6c8
x3 : 00000000ffffffff x2 : 0000000000000020
x1 : 0000000000000000 x0 : ffff8000eb9e5800
Process nvim (pid: 957, stack limit = 0x0000000063a78320)
Call trace:
update_sit_entry+0x304/0x4b0
f2fs_invalidate_blocks+0x98/0x140
truncate_node+0x90/0x400
f2fs_remove_inode_page+0xe8/0x340
f2fs_evict_inode+0x2b0/0x408
evict+0xe0/0x1e0
iput+0x160/0x260
do_unlinkat+0x214/0x298
__arm64_sys_unlinkat+0x3c/0x68
el0_svc_handler+0x94/0x118
el0_svc+0x8/0xc
Code: f9400800 b9488400 36080140 f9400f01 (387c4820)
---[ end trace a0f21a307118c477 ]---
The reason is it is possible to enable discard flag on block queue via
UDEV, but during mount, f2fs will initialize se->discard_map only if
this flag is set, once the flag is set after mount, f2fs may dereference
NULL pointer on se->discard_map.
So this patch does below changes to fix this issue:
- initialize and update se->discard_map all the time.
- don't clear DISCARD option if device has no QUEUE_FLAG_DISCARD flag
during mount.
- don't issue small discard on zoned block device.
- introduce some functions to enhance the readability.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Tested-by: Vicente Bergas <vicencb@gmail.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-04 03:52:17 +08:00
if ( ! f2fs_hw_support_discard ( F2FS_SB ( sb ) ) )
2014-09-24 15:37:02 -07:00
return - EOPNOTSUPP ;
2014-09-20 22:06:39 -07:00
2014-09-24 15:37:02 -07:00
if ( copy_from_user ( & range , ( struct fstrim_range __user * ) arg ,
sizeof ( range ) ) )
return - EFAULT ;
2014-09-20 22:06:39 -07:00
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2014-09-24 15:37:02 -07:00
range . minlen = max ( ( unsigned int ) range . minlen ,
q - > limits . discard_granularity ) ;
ret = f2fs_trim_fs ( F2FS_SB ( sb ) , & range ) ;
2016-05-09 19:56:32 +08:00
mnt_drop_write_file ( filp ) ;
2014-09-24 15:37:02 -07:00
if ( ret < 0 )
return ret ;
2014-09-20 22:06:39 -07:00
2014-09-24 15:37:02 -07:00
if ( copy_to_user ( ( struct fstrim_range __user * ) arg , & range ,
sizeof ( range ) ) )
return - EFAULT ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2014-09-24 15:37:02 -07:00
return 0 ;
}
2015-04-20 15:19:06 -07:00
static bool uuid_is_nonzero ( __u8 u [ 16 ] )
{
int i ;
for ( i = 0 ; i < 16 ; i + + )
if ( u [ i ] )
return true ;
return false ;
}
static int f2fs_ioc_set_encryption_policy ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
2018-10-24 18:34:26 +08:00
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( inode ) ) )
2017-11-14 19:28:42 +08:00
return - EOPNOTSUPP ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
2016-05-09 19:56:32 +08:00
2016-11-26 19:07:49 -05:00
return fscrypt_ioctl_set_policy ( filp , ( const void __user * ) arg ) ;
2015-04-20 15:19:06 -07:00
}
static int f2fs_ioc_get_encryption_policy ( struct file * filp , unsigned long arg )
{
2018-10-24 18:34:26 +08:00
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
2017-11-14 19:28:42 +08:00
return - EOPNOTSUPP ;
2016-11-26 19:07:49 -05:00
return fscrypt_ioctl_get_policy ( filp , ( void __user * ) arg ) ;
2015-04-20 15:19:06 -07:00
}
static int f2fs_ioc_get_encryption_pwsalt ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
int err ;
2018-10-24 18:34:26 +08:00
if ( ! f2fs_sb_has_encrypt ( sbi ) )
2015-04-20 15:19:06 -07:00
return - EOPNOTSUPP ;
err = mnt_want_write_file ( filp ) ;
if ( err )
return err ;
2018-02-26 22:04:13 +08:00
down_write ( & sbi - > sb_lock ) ;
2018-02-11 22:53:20 +08:00
if ( uuid_is_nonzero ( sbi - > raw_super - > encrypt_pw_salt ) )
goto got_it ;
2015-04-20 15:19:06 -07:00
/* update superblock with uuid */
generate_random_uuid ( sbi - > raw_super - > encrypt_pw_salt ) ;
2015-06-08 13:28:03 +08:00
err = f2fs_commit_super ( sbi , false ) ;
2015-04-20 15:19:06 -07:00
if ( err ) {
/* undo new data */
memset ( sbi - > raw_super - > encrypt_pw_salt , 0 , 16 ) ;
2018-02-11 22:53:20 +08:00
goto out_err ;
2015-04-20 15:19:06 -07:00
}
got_it :
if ( copy_to_user ( ( __u8 __user * ) arg , sbi - > raw_super - > encrypt_pw_salt ,
16 ) )
2018-02-11 22:53:20 +08:00
err = - EFAULT ;
out_err :
2018-02-26 22:04:13 +08:00
up_write ( & sbi - > sb_lock ) ;
2018-02-11 22:53:20 +08:00
mnt_drop_write_file ( filp ) ;
return err ;
2015-04-20 15:19:06 -07:00
}
2019-08-04 19:35:48 -07:00
static int f2fs_ioc_get_encryption_policy_ex ( struct file * filp ,
unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_get_policy_ex ( filp , ( void __user * ) arg ) ;
}
static int f2fs_ioc_add_encryption_key ( struct file * filp , unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_add_key ( filp , ( void __user * ) arg ) ;
}
static int f2fs_ioc_remove_encryption_key ( struct file * filp , unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_remove_key ( filp , ( void __user * ) arg ) ;
}
static int f2fs_ioc_remove_encryption_key_all_users ( struct file * filp ,
unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_remove_key_all_users ( filp , ( void __user * ) arg ) ;
}
static int f2fs_ioc_get_encryption_key_status ( struct file * filp ,
unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_get_key_status ( filp , ( void __user * ) arg ) ;
}
2020-03-14 13:50:51 -07:00
static int f2fs_ioc_get_encryption_nonce ( struct file * filp , unsigned long arg )
{
if ( ! f2fs_sb_has_encrypt ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fscrypt_ioctl_get_nonce ( filp , ( void __user * ) arg ) ;
}
2015-07-10 18:08:10 +08:00
static int f2fs_ioc_gc ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2015-10-05 22:22:44 +08:00
__u32 sync ;
2016-05-09 19:56:32 +08:00
int ret ;
2015-07-10 18:08:10 +08:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2015-10-05 22:22:44 +08:00
if ( get_user ( sync , ( __u32 __user * ) arg ) )
2015-07-10 18:08:10 +08:00
return - EFAULT ;
2015-10-05 22:22:44 +08:00
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
2015-07-10 18:08:10 +08:00
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2015-10-05 22:22:44 +08:00
if ( ! sync ) {
2020-01-14 19:36:50 +08:00
if ( ! down_write_trylock ( & sbi - > gc_lock ) ) {
2016-05-09 19:56:32 +08:00
ret = - EBUSY ;
goto out ;
}
2015-10-05 22:22:44 +08:00
} else {
2020-01-14 19:36:50 +08:00
down_write ( & sbi - > gc_lock ) ;
2015-07-10 18:08:10 +08:00
}
2021-02-20 17:35:40 +08:00
ret = f2fs_gc ( sbi , sync , true , false , NULL_SEGNO ) ;
2016-05-09 19:56:32 +08:00
out :
mnt_drop_write_file ( filp ) ;
return ret ;
2015-07-10 18:08:10 +08:00
}
2020-11-10 09:24:37 +08:00
static int __f2fs_ioc_gc_range ( struct file * filp , struct f2fs_gc_range * range )
2017-06-15 16:44:42 -07:00
{
2020-11-10 09:24:37 +08:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( file_inode ( filp ) ) ;
2017-06-15 16:44:42 -07:00
u64 end ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
2020-11-10 09:24:37 +08:00
end = range - > start + range - > len ;
if ( end < range - > start | | range - > start < MAIN_BLKADDR ( sbi ) | |
2019-09-17 10:19:23 +05:30
end > = MAX_BLKADDR ( sbi ) )
2018-04-24 11:40:30 +08:00
return - EINVAL ;
2017-06-15 16:44:42 -07:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
do_more :
2020-11-10 09:24:37 +08:00
if ( ! range - > sync ) {
2020-01-14 19:36:50 +08:00
if ( ! down_write_trylock ( & sbi - > gc_lock ) ) {
2017-06-15 16:44:42 -07:00
ret = - EBUSY ;
goto out ;
}
} else {
2020-01-14 19:36:50 +08:00
down_write ( & sbi - > gc_lock ) ;
2017-06-15 16:44:42 -07:00
}
2021-02-20 17:35:40 +08:00
ret = f2fs_gc ( sbi , range - > sync , true , false ,
GET_SEGNO ( sbi , range - > start ) ) ;
2020-06-28 19:23:03 +08:00
if ( ret ) {
if ( ret = = - EBUSY )
ret = - EAGAIN ;
goto out ;
}
2020-11-10 09:24:37 +08:00
range - > start + = BLKS_PER_SEC ( sbi ) ;
if ( range - > start < = end )
2017-06-15 16:44:42 -07:00
goto do_more ;
out :
mnt_drop_write_file ( filp ) ;
return ret ;
}
2020-11-10 09:24:37 +08:00
static int f2fs_ioc_gc_range ( struct file * filp , unsigned long arg )
{
struct f2fs_gc_range range ;
if ( copy_from_user ( & range , ( struct f2fs_gc_range __user * ) arg ,
sizeof ( range ) ) )
return - EFAULT ;
return __f2fs_ioc_gc_range ( filp , & range ) ;
}
2018-07-17 20:41:49 +08:00
static int f2fs_ioc_write_checkpoint ( struct file * filp , unsigned long arg )
2015-10-05 22:24:19 +08:00
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
2016-05-09 19:56:32 +08:00
int ret ;
2015-10-05 22:24:19 +08:00
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
2018-08-20 19:21:43 -07:00
if ( unlikely ( is_sbi_flag_set ( sbi , SBI_CP_DISABLED ) ) ) {
2019-06-18 17:48:42 +08:00
f2fs_info ( sbi , " Skipping Checkpoint. Checkpoints currently disabled. " ) ;
2018-08-20 19:21:43 -07:00
return - EINVAL ;
}
2016-05-09 19:56:32 +08:00
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
ret = f2fs_sync_fs ( sbi - > sb , 1 ) ;
mnt_drop_write_file ( filp ) ;
return ret ;
2015-10-05 22:24:19 +08:00
}
2015-10-27 09:53:45 +08:00
static int f2fs_defragment_range ( struct f2fs_sb_info * sbi ,
struct file * filp ,
struct f2fs_defragment * range )
{
struct inode * inode = file_inode ( filp ) ;
2018-01-10 18:18:52 +08:00
struct f2fs_map_blocks map = { . m_next_extent = NULL ,
2021-04-06 09:47:35 +08:00
. m_seg_type = NO_CHECK_TYPE ,
2018-11-20 04:29:35 +08:00
. m_may_create = false } ;
2018-05-30 04:34:58 +09:00
struct extent_info ei = { 0 , 0 , 0 } ;
2018-01-10 18:18:52 +08:00
pgoff_t pg_start , pg_end , next_pgofs ;
2015-12-01 11:56:52 +08:00
unsigned int blk_per_seg = sbi - > blocks_per_seg ;
2015-10-27 09:53:45 +08:00
unsigned int total = 0 , sec_num ;
block_t blk_end = 0 ;
bool fragmented = false ;
int err ;
/* if in-place-update policy is enabled, don't waste time here */
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
if ( f2fs_should_update_inplace ( inode , NULL ) )
2015-10-27 09:53:45 +08:00
return - EINVAL ;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
pg_start = range - > start > > PAGE_SHIFT ;
pg_end = ( range - > start + range - > len ) > > PAGE_SHIFT ;
2015-10-27 09:53:45 +08:00
2016-01-07 14:15:04 -08:00
f2fs_balance_fs ( sbi , true ) ;
2015-10-27 09:53:45 +08:00
2016-01-22 15:40:57 -05:00
inode_lock ( inode ) ;
2015-10-27 09:53:45 +08:00
/* writeback all dirty pages in the range */
err = filemap_write_and_wait_range ( inode - > i_mapping , range - > start ,
2015-12-14 13:34:00 +08:00
range - > start + range - > len - 1 ) ;
2015-10-27 09:53:45 +08:00
if ( err )
goto out ;
/*
* lookup mapping info in extent cache , skip defragmenting if physical
* block addresses are continuous .
*/
if ( f2fs_lookup_extent_cache ( inode , pg_start , & ei ) ) {
if ( ei . fofs + ei . len > = pg_end )
goto out ;
}
map . m_lblk = pg_start ;
2018-01-10 18:18:52 +08:00
map . m_next_pgofs = & next_pgofs ;
2015-10-27 09:53:45 +08:00
/*
* lookup mapping info in dnode page cache , skip defragmenting if all
* physical block addresses are continuous even if there are hole ( s )
* in logical blocks .
*/
while ( map . m_lblk < pg_end ) {
2015-12-15 17:02:41 +08:00
map . m_len = pg_end - map . m_lblk ;
2017-08-09 17:27:30 +08:00
err = f2fs_map_blocks ( inode , & map , 0 , F2FS_GET_BLOCK_DEFAULT ) ;
2015-10-27 09:53:45 +08:00
if ( err )
goto out ;
if ( ! ( map . m_flags & F2FS_MAP_FLAGS ) ) {
2018-01-10 18:18:52 +08:00
map . m_lblk = next_pgofs ;
2015-10-27 09:53:45 +08:00
continue ;
}
2018-01-10 18:18:51 +08:00
if ( blk_end & & blk_end ! = map . m_pblk )
2015-10-27 09:53:45 +08:00
fragmented = true ;
2018-01-10 18:18:51 +08:00
/* record total count of block that we're going to move */
total + = map . m_len ;
2015-10-27 09:53:45 +08:00
blk_end = map . m_pblk + map . m_len ;
map . m_lblk + = map . m_len ;
}
2019-07-29 23:02:29 +08:00
if ( ! fragmented ) {
total = 0 ;
2015-10-27 09:53:45 +08:00
goto out ;
2019-07-29 23:02:29 +08:00
}
2015-10-27 09:53:45 +08:00
2019-06-20 16:42:08 +02:00
sec_num = DIV_ROUND_UP ( total , BLKS_PER_SEC ( sbi ) ) ;
2015-10-27 09:53:45 +08:00
/*
* make sure there are enough free section for LFS allocation , this can
* avoid defragment running in SSR mode when free section are allocated
* intensively
*/
2016-09-01 12:02:51 -07:00
if ( has_not_enough_free_secs ( sbi , 0 , sec_num ) ) {
2015-10-27 09:53:45 +08:00
err = - EAGAIN ;
goto out ;
}
2018-01-10 18:18:51 +08:00
map . m_lblk = pg_start ;
map . m_len = pg_end - pg_start ;
total = 0 ;
2015-10-27 09:53:45 +08:00
while ( map . m_lblk < pg_end ) {
pgoff_t idx ;
int cnt = 0 ;
do_map :
2015-12-15 17:02:41 +08:00
map . m_len = pg_end - map . m_lblk ;
2017-08-09 17:27:30 +08:00
err = f2fs_map_blocks ( inode , & map , 0 , F2FS_GET_BLOCK_DEFAULT ) ;
2015-10-27 09:53:45 +08:00
if ( err )
goto clear_out ;
if ( ! ( map . m_flags & F2FS_MAP_FLAGS ) ) {
2018-01-10 18:18:52 +08:00
map . m_lblk = next_pgofs ;
2019-07-29 23:02:29 +08:00
goto check ;
2015-10-27 09:53:45 +08:00
}
2016-05-20 10:13:22 -07:00
set_inode_flag ( inode , FI_DO_DEFRAG ) ;
2015-10-27 09:53:45 +08:00
idx = map . m_lblk ;
while ( idx < map . m_lblk + map . m_len & & cnt < blk_per_seg ) {
struct page * page ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
page = f2fs_get_lock_data_page ( inode , idx , true ) ;
2015-10-27 09:53:45 +08:00
if ( IS_ERR ( page ) ) {
err = PTR_ERR ( page ) ;
goto clear_out ;
}
set_page_dirty ( page ) ;
f2fs_put_page ( page , 1 ) ;
idx + + ;
cnt + + ;
total + + ;
}
map . m_lblk = idx ;
2019-07-29 23:02:29 +08:00
check :
if ( map . m_lblk < pg_end & & cnt < blk_per_seg )
2015-10-27 09:53:45 +08:00
goto do_map ;
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_DO_DEFRAG ) ;
2015-10-27 09:53:45 +08:00
err = filemap_fdatawrite ( inode - > i_mapping ) ;
if ( err )
goto out ;
}
clear_out :
2016-05-20 10:13:22 -07:00
clear_inode_flag ( inode , FI_DO_DEFRAG ) ;
2015-10-27 09:53:45 +08:00
out :
2016-01-22 15:40:57 -05:00
inode_unlock ( inode ) ;
2015-10-27 09:53:45 +08:00
if ( ! err )
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 15:29:47 +03:00
range - > len = ( u64 ) total < < PAGE_SHIFT ;
2015-10-27 09:53:45 +08:00
return err ;
}
static int f2fs_ioc_defragment ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct f2fs_defragment range ;
int err ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
2017-04-25 12:45:12 +00:00
if ( ! S_ISREG ( inode - > i_mode ) | | f2fs_is_atomic_file ( inode ) )
2015-10-27 09:53:45 +08:00
return - EINVAL ;
2017-03-10 17:55:07 +08:00
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
2015-10-27 09:53:45 +08:00
if ( copy_from_user ( & range , ( struct f2fs_defragment __user * ) arg ,
2017-03-10 17:55:07 +08:00
sizeof ( range ) ) )
return - EFAULT ;
2015-10-27 09:53:45 +08:00
/* verify alignment of offset & size */
2017-03-10 17:55:07 +08:00
if ( range . start & ( F2FS_BLKSIZE - 1 ) | | range . len & ( F2FS_BLKSIZE - 1 ) )
return - EINVAL ;
2015-10-27 09:53:45 +08:00
2017-03-08 10:47:12 +08:00
if ( unlikely ( ( range . start + range . len ) > > PAGE_SHIFT >
2021-01-13 13:21:54 +08:00
max_file_blocks ( inode ) ) )
2017-03-10 17:55:07 +08:00
return - EINVAL ;
err = mnt_want_write_file ( filp ) ;
if ( err )
return err ;
2017-03-08 10:47:12 +08:00
2015-10-27 09:53:45 +08:00
err = f2fs_defragment_range ( sbi , filp , & range ) ;
2017-03-10 17:55:07 +08:00
mnt_drop_write_file ( filp ) ;
2016-01-08 16:57:48 -08:00
f2fs_update_time ( sbi , REQ_TIME ) ;
2015-10-27 09:53:45 +08:00
if ( err < 0 )
2017-03-10 17:55:07 +08:00
return err ;
2015-10-27 09:53:45 +08:00
if ( copy_to_user ( ( struct f2fs_defragment __user * ) arg , & range ,
sizeof ( range ) ) )
2017-03-10 17:55:07 +08:00
return - EFAULT ;
return 0 ;
2015-10-27 09:53:45 +08:00
}
2016-07-08 15:16:47 -07:00
static int f2fs_move_file_range ( struct file * file_in , loff_t pos_in ,
struct file * file_out , loff_t pos_out , size_t len )
{
struct inode * src = file_inode ( file_in ) ;
struct inode * dst = file_inode ( file_out ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( src ) ;
size_t olen = len , dst_max_i_size = 0 ;
size_t dst_osize ;
int ret ;
if ( file_in - > f_path . mnt ! = file_out - > f_path . mnt | |
src - > i_sb ! = dst - > i_sb )
return - EXDEV ;
if ( unlikely ( f2fs_readonly ( src - > i_sb ) ) )
return - EROFS ;
2016-08-04 20:13:02 +08:00
if ( ! S_ISREG ( src - > i_mode ) | | ! S_ISREG ( dst - > i_mode ) )
return - EINVAL ;
2016-07-08 15:16:47 -07:00
2018-12-12 15:20:11 +05:30
if ( IS_ENCRYPTED ( src ) | | IS_ENCRYPTED ( dst ) )
2016-07-08 15:16:47 -07:00
return - EOPNOTSUPP ;
2020-08-30 21:45:23 +00:00
if ( pos_out < 0 | | pos_in < 0 )
return - EINVAL ;
2016-09-13 11:35:42 +08:00
if ( src = = dst ) {
if ( pos_in = = pos_out )
return 0 ;
if ( pos_out > pos_in & & pos_out < pos_in + len )
return - EINVAL ;
}
2016-07-08 15:16:47 -07:00
inode_lock ( src ) ;
2016-08-04 20:13:03 +08:00
if ( src ! = dst ) {
2017-11-03 10:21:05 +08:00
ret = - EBUSY ;
if ( ! inode_trylock ( dst ) )
goto out ;
2016-08-04 20:13:03 +08:00
}
2016-07-08 15:16:47 -07:00
ret = - EINVAL ;
if ( pos_in + len > src - > i_size | | pos_in + len < pos_in )
goto out_unlock ;
if ( len = = 0 )
olen = len = src - > i_size - pos_in ;
if ( pos_in + len = = src - > i_size )
len = ALIGN ( src - > i_size , F2FS_BLKSIZE ) - pos_in ;
if ( len = = 0 ) {
ret = 0 ;
goto out_unlock ;
}
dst_osize = dst - > i_size ;
if ( pos_out + olen > dst - > i_size )
dst_max_i_size = pos_out + olen ;
/* verify the end result is block aligned */
if ( ! IS_ALIGNED ( pos_in , F2FS_BLKSIZE ) | |
! IS_ALIGNED ( pos_in + len , F2FS_BLKSIZE ) | |
! IS_ALIGNED ( pos_out , F2FS_BLKSIZE ) )
goto out_unlock ;
ret = f2fs_convert_inline_inode ( src ) ;
if ( ret )
goto out_unlock ;
ret = f2fs_convert_inline_inode ( dst ) ;
if ( ret )
goto out_unlock ;
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range ( src - > i_mapping ,
pos_in , pos_in + len ) ;
if ( ret )
goto out_unlock ;
ret = filemap_write_and_wait_range ( dst - > i_mapping ,
pos_out , pos_out + len ) ;
if ( ret )
goto out_unlock ;
f2fs_balance_fs ( sbi , true ) ;
2018-07-25 12:11:56 +09:00
down_write ( & F2FS_I ( src ) - > i_gc_rwsem [ WRITE ] ) ;
if ( src ! = dst ) {
ret = - EBUSY ;
if ( ! down_write_trylock ( & F2FS_I ( dst ) - > i_gc_rwsem [ WRITE ] ) )
goto out_src ;
}
2016-07-08 15:16:47 -07:00
f2fs_lock_op ( sbi ) ;
2016-09-10 11:19:37 +08:00
ret = __exchange_data_block ( src , dst , pos_in > > F2FS_BLKSIZE_BITS ,
pos_out > > F2FS_BLKSIZE_BITS ,
len > > F2FS_BLKSIZE_BITS , false ) ;
2016-07-08 15:16:47 -07:00
if ( ! ret ) {
if ( dst_max_i_size )
f2fs_i_size_write ( dst , dst_max_i_size ) ;
else if ( dst_osize ! = dst - > i_size )
f2fs_i_size_write ( dst , dst_osize ) ;
}
f2fs_unlock_op ( sbi ) ;
2018-07-25 12:11:56 +09:00
if ( src ! = dst )
2018-04-24 10:55:28 +08:00
up_write ( & F2FS_I ( dst ) - > i_gc_rwsem [ WRITE ] ) ;
2018-07-25 12:11:56 +09:00
out_src :
up_write ( & F2FS_I ( src ) - > i_gc_rwsem [ WRITE ] ) ;
out_unlock :
if ( src ! = dst )
2016-07-08 15:16:47 -07:00
inode_unlock ( dst ) ;
2016-08-04 20:13:03 +08:00
out :
2016-07-08 15:16:47 -07:00
inode_unlock ( src ) ;
return ret ;
}
2020-11-10 09:24:37 +08:00
static int __f2fs_ioc_move_range ( struct file * filp ,
struct f2fs_move_range * range )
2016-07-08 15:16:47 -07:00
{
struct fd dst ;
int err ;
if ( ! ( filp - > f_mode & FMODE_READ ) | |
! ( filp - > f_mode & FMODE_WRITE ) )
return - EBADF ;
2020-11-10 09:24:37 +08:00
dst = fdget ( range - > dst_fd ) ;
2016-07-08 15:16:47 -07:00
if ( ! dst . file )
return - EBADF ;
if ( ! ( dst . file - > f_mode & FMODE_WRITE ) ) {
err = - EBADF ;
goto err_out ;
}
err = mnt_want_write_file ( filp ) ;
if ( err )
goto err_out ;
2020-11-10 09:24:37 +08:00
err = f2fs_move_file_range ( filp , range - > pos_in , dst . file ,
range - > pos_out , range - > len ) ;
2016-07-08 15:16:47 -07:00
mnt_drop_write_file ( filp ) ;
err_out :
fdput ( dst ) ;
return err ;
}
2020-11-10 09:24:37 +08:00
static int f2fs_ioc_move_range ( struct file * filp , unsigned long arg )
{
struct f2fs_move_range range ;
if ( copy_from_user ( & range , ( struct f2fs_move_range __user * ) arg ,
sizeof ( range ) ) )
return - EFAULT ;
return __f2fs_ioc_move_range ( filp , & range ) ;
}
2017-04-13 15:17:00 -07:00
static int f2fs_ioc_flush_device ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct sit_info * sm = SIT_I ( sbi ) ;
unsigned int start_segno = 0 , end_segno = 0 ;
unsigned int dev_start_segno = 0 , dev_end_segno = 0 ;
struct f2fs_flush_device range ;
int ret ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
2018-08-20 19:21:43 -07:00
if ( unlikely ( is_sbi_flag_set ( sbi , SBI_CP_DISABLED ) ) )
return - EINVAL ;
2017-04-13 15:17:00 -07:00
if ( copy_from_user ( & range , ( struct f2fs_flush_device __user * ) arg ,
sizeof ( range ) ) )
return - EFAULT ;
2019-03-16 09:13:06 +09:00
if ( ! f2fs_is_multi_device ( sbi ) | | sbi - > s_ndevs - 1 < = range . dev_num | |
2018-10-24 18:37:26 +08:00
__is_large_section ( sbi ) ) {
2019-06-18 17:48:42 +08:00
f2fs_warn ( sbi , " Can't flush %u in %d for segs_per_sec %u != 1 " ,
range . dev_num , sbi - > s_ndevs , sbi - > segs_per_sec ) ;
2017-04-13 15:17:00 -07:00
return - EINVAL ;
}
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
if ( range . dev_num ! = 0 )
dev_start_segno = GET_SEGNO ( sbi , FDEV ( range . dev_num ) . start_blk ) ;
dev_end_segno = GET_SEGNO ( sbi , FDEV ( range . dev_num ) . end_blk ) ;
start_segno = sm - > last_victim [ FLUSH_DEVICE ] ;
if ( start_segno < dev_start_segno | | start_segno > = dev_end_segno )
start_segno = dev_start_segno ;
end_segno = min ( start_segno + range . segments , dev_end_segno ) ;
while ( start_segno < end_segno ) {
2020-01-14 19:36:50 +08:00
if ( ! down_write_trylock ( & sbi - > gc_lock ) ) {
2017-04-13 15:17:00 -07:00
ret = - EBUSY ;
goto out ;
}
sm - > last_victim [ GC_CB ] = end_segno + 1 ;
sm - > last_victim [ GC_GREEDY ] = end_segno + 1 ;
sm - > last_victim [ ALLOC_NEXT ] = end_segno + 1 ;
2021-02-20 17:35:40 +08:00
ret = f2fs_gc ( sbi , true , true , true , start_segno ) ;
2017-04-13 15:17:00 -07:00
if ( ret = = - EAGAIN )
ret = 0 ;
else if ( ret < 0 )
break ;
start_segno + + ;
}
out :
mnt_drop_write_file ( filp ) ;
return ret ;
}
2017-07-21 12:58:59 -07:00
static int f2fs_ioc_get_features ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
u32 sb_feature = le32_to_cpu ( F2FS_I_SB ( inode ) - > raw_super - > feature ) ;
/* Must validate to set it with SQLite behavior in Android. */
sb_feature | = F2FS_FEATURE_ATOMIC_WRITE ;
return put_user ( sb_feature , ( u32 __user * ) arg ) ;
}
2017-04-13 15:17:00 -07:00
2017-07-29 00:32:52 +08:00
# ifdef CONFIG_QUOTA
2018-09-25 15:36:02 +08:00
int f2fs_transfer_project_quota ( struct inode * inode , kprojid_t kprojid )
{
struct dquot * transfer_to [ MAXQUOTAS ] = { } ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct super_block * sb = sbi - > sb ;
int err = 0 ;
transfer_to [ PRJQUOTA ] = dqget ( sb , make_kqid_projid ( kprojid ) ) ;
if ( ! IS_ERR ( transfer_to [ PRJQUOTA ] ) ) {
err = __dquot_transfer ( inode , transfer_to ) ;
if ( err )
set_sbi_flag ( sbi , SBI_QUOTA_NEED_REPAIR ) ;
dqput ( transfer_to [ PRJQUOTA ] ) ;
}
return err ;
}
2021-04-07 14:36:43 +02:00
static int f2fs_ioc_setproject ( struct inode * inode , __u32 projid )
2017-07-29 00:32:52 +08:00
{
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct page * ipage ;
kprojid_t kprojid ;
int err ;
2018-10-24 18:34:26 +08:00
if ( ! f2fs_sb_has_project_quota ( sbi ) ) {
2017-07-29 00:32:52 +08:00
if ( projid ! = F2FS_DEF_PROJID )
return - EOPNOTSUPP ;
else
return 0 ;
}
if ( ! f2fs_has_extra_attr ( inode ) )
return - EOPNOTSUPP ;
kprojid = make_kprojid ( & init_user_ns , ( projid_t ) projid ) ;
if ( projid_eq ( kprojid , F2FS_I ( inode ) - > i_projid ) )
return 0 ;
err = - EPERM ;
/* Is it quota file? Do not allow user to mess with it */
if ( IS_NOQUOTA ( inode ) )
2018-09-11 08:54:21 +09:00
return err ;
2017-07-29 00:32:52 +08:00
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
ipage = f2fs_get_node_page ( sbi , inode - > i_ino ) ;
2018-09-11 08:54:21 +09:00
if ( IS_ERR ( ipage ) )
return PTR_ERR ( ipage ) ;
2017-07-29 00:32:52 +08:00
if ( ! F2FS_FITS_IN_INODE ( F2FS_INODE ( ipage ) , fi - > i_extra_isize ,
i_projid ) ) {
err = - EOVERFLOW ;
f2fs_put_page ( ipage , 1 ) ;
2018-09-11 08:54:21 +09:00
return err ;
2017-07-29 00:32:52 +08:00
}
f2fs_put_page ( ipage , 1 ) ;
2021-10-28 21:03:05 +08:00
err = f2fs_dquot_initialize ( inode ) ;
2018-04-21 17:53:52 +08:00
if ( err )
2018-09-11 08:54:21 +09:00
return err ;
2017-07-29 00:32:52 +08:00
2018-09-25 15:36:02 +08:00
f2fs_lock_op ( sbi ) ;
err = f2fs_transfer_project_quota ( inode , kprojid ) ;
if ( err )
goto out_unlock ;
2017-07-29 00:32:52 +08:00
F2FS_I ( inode ) - > i_projid = kprojid ;
inode - > i_ctime = current_time ( inode ) ;
f2fs_mark_inode_dirty_sync ( inode , true ) ;
2018-09-25 15:36:02 +08:00
out_unlock :
f2fs_unlock_op ( sbi ) ;
2017-07-29 00:32:52 +08:00
return err ;
}
# else
2018-09-25 15:36:02 +08:00
int f2fs_transfer_project_quota ( struct inode * inode , kprojid_t kprojid )
{
return 0 ;
}
2021-04-07 14:36:43 +02:00
static int f2fs_ioc_setproject ( struct inode * inode , __u32 projid )
2017-07-29 00:32:52 +08:00
{
if ( projid ! = F2FS_DEF_PROJID )
return - EOPNOTSUPP ;
return 0 ;
}
# endif
2021-04-07 14:36:43 +02:00
int f2fs_fileattr_get ( struct dentry * dentry , struct fileattr * fa )
2017-07-29 00:32:52 +08:00
{
2021-04-07 14:36:43 +02:00
struct inode * inode = d_inode ( dentry ) ;
2017-07-29 00:32:52 +08:00
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
2021-04-07 14:36:43 +02:00
u32 fsflags = f2fs_iflags_to_fsflags ( fi - > i_flags ) ;
2017-07-29 00:32:52 +08:00
2021-04-07 14:36:43 +02:00
if ( IS_ENCRYPTED ( inode ) )
fsflags | = FS_ENCRYPT_FL ;
if ( IS_VERITY ( inode ) )
fsflags | = FS_VERITY_FL ;
if ( f2fs_has_inline_data ( inode ) | | f2fs_has_inline_dentry ( inode ) )
fsflags | = FS_INLINE_DATA_FL ;
if ( is_inode_flag_set ( inode , FI_PIN_FILE ) )
fsflags | = FS_NOCOW_FL ;
fileattr_fill_flags ( fa , fsflags & F2FS_GETTABLE_FS_FL ) ;
2017-07-29 00:32:52 +08:00
2018-10-24 18:34:26 +08:00
if ( f2fs_sb_has_project_quota ( F2FS_I_SB ( inode ) ) )
2019-07-01 13:26:29 -07:00
fa - > fsx_projid = from_kprojid ( & init_user_ns , fi - > i_projid ) ;
2017-07-29 00:32:52 +08:00
2018-09-11 08:54:21 +09:00
return 0 ;
}
2021-04-07 14:36:43 +02:00
int f2fs_fileattr_set ( struct user_namespace * mnt_userns ,
struct dentry * dentry , struct fileattr * fa )
2017-07-29 00:32:52 +08:00
{
2021-04-07 14:36:43 +02:00
struct inode * inode = d_inode ( dentry ) ;
u32 fsflags = fa - > flags , mask = F2FS_SETTABLE_FS_FL ;
2019-06-04 22:59:04 -07:00
u32 iflags ;
2017-07-29 00:32:52 +08:00
int err ;
2021-04-07 14:36:43 +02:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) )
return - EIO ;
if ( ! f2fs_is_checkpoint_ready ( F2FS_I_SB ( inode ) ) )
return - ENOSPC ;
if ( fsflags & ~ F2FS_GETTABLE_FS_FL )
2017-07-29 00:32:52 +08:00
return - EOPNOTSUPP ;
2021-04-07 14:36:43 +02:00
fsflags & = F2FS_SETTABLE_FS_FL ;
if ( ! fa - > flags_valid )
mask & = FS_COMMON_FL ;
2017-07-29 00:32:52 +08:00
2021-04-07 14:36:43 +02:00
iflags = f2fs_fsflags_to_iflags ( fsflags ) ;
2019-06-04 22:59:04 -07:00
if ( f2fs_mask_flags ( inode - > i_mode , iflags ) ! = iflags )
2017-07-29 00:32:52 +08:00
return - EOPNOTSUPP ;
2021-04-07 14:36:43 +02:00
err = f2fs_setflags_common ( inode , iflags , f2fs_fsflags_to_iflags ( mask ) ) ;
if ( ! err )
err = f2fs_ioc_setproject ( inode , fa - > fsx_projid ) ;
2017-07-29 00:32:52 +08:00
2018-09-11 08:54:21 +09:00
return err ;
2017-07-29 00:32:52 +08:00
}
2017-04-13 15:17:00 -07:00
2017-12-07 16:25:39 -08:00
int f2fs_pin_file_control ( struct inode * inode , bool inc )
{
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
/* Use i_gc_failures for normal file as a risk signal. */
if ( inc )
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
f2fs_i_gc_failures_write ( inode ,
fi - > i_gc_failures [ GC_FAILURE_PIN ] + 1 ) ;
2017-12-07 16:25:39 -08:00
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
if ( fi - > i_gc_failures [ GC_FAILURE_PIN ] > sbi - > gc_pin_file_threshold ) {
2019-06-18 17:48:42 +08:00
f2fs_warn ( sbi , " %s: Enable GC = ino %lx after %x GC trials " ,
__func__ , inode - > i_ino ,
fi - > i_gc_failures [ GC_FAILURE_PIN ] ) ;
2017-12-07 16:25:39 -08:00
clear_inode_flag ( inode , FI_PIN_FILE ) ;
return - EAGAIN ;
}
return 0 ;
}
static int f2fs_ioc_set_pin_file ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
__u32 pin ;
int ret = 0 ;
if ( get_user ( pin , ( __u32 __user * ) arg ) )
return - EFAULT ;
if ( ! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
if ( f2fs_readonly ( F2FS_I_SB ( inode ) - > sb ) )
return - EROFS ;
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
inode_lock ( inode ) ;
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
if ( f2fs_should_update_outplace ( inode , NULL ) ) {
2018-01-17 16:31:38 +08:00
ret = - EINVAL ;
goto out ;
}
2017-12-07 16:25:39 -08:00
if ( ! pin ) {
clear_inode_flag ( inode , FI_PIN_FILE ) ;
2018-07-28 18:37:58 +08:00
f2fs_i_gc_failures_write ( inode , 0 ) ;
2017-12-07 16:25:39 -08:00
goto done ;
}
if ( f2fs_pin_file_control ( inode , false ) ) {
ret = - EAGAIN ;
goto out ;
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
2017-12-07 16:25:39 -08:00
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
goto out ;
2020-09-08 11:44:11 +09:00
if ( ! f2fs_disable_compressed_file ( inode ) ) {
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
ret = - EOPNOTSUPP ;
goto out ;
}
2017-12-07 16:25:39 -08:00
set_inode_flag ( inode , FI_PIN_FILE ) ;
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
ret = F2FS_I ( inode ) - > i_gc_failures [ GC_FAILURE_PIN ] ;
2017-12-07 16:25:39 -08:00
done :
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
out :
inode_unlock ( inode ) ;
mnt_drop_write_file ( filp ) ;
return ret ;
}
static int f2fs_ioc_get_pin_file ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
__u32 pin = 0 ;
if ( is_inode_flag_set ( inode , FI_PIN_FILE ) )
f2fs: avoid stucking GC due to atomic write
f2fs doesn't allow abuse on atomic write class interface, so except
limiting in-mem pages' total memory usage capacity, we need to limit
atomic-write usage as well when filesystem is seriously fragmented,
otherwise we may run into infinite loop during foreground GC because
target blocks in victim segment are belong to atomic opened file for
long time.
Now, we will detect failure due to atomic write in foreground GC, if
the count exceeds threshold, we will drop all atomic written data in
cache, by this, I expect it can keep our system running safely to
prevent Dos attack.
In addition, his patch adds to show GC skip information in debugfs,
now it just shows count of skipped caused by atomic write.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-07 20:28:54 +08:00
pin = F2FS_I ( inode ) - > i_gc_failures [ GC_FAILURE_PIN ] ;
2017-12-07 16:25:39 -08:00
return put_user ( pin , ( u32 __user * ) arg ) ;
}
2018-01-11 14:42:30 +08:00
int f2fs_precache_extents ( struct inode * inode )
{
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct f2fs_map_blocks map ;
pgoff_t m_next_extent ;
loff_t end ;
int err ;
if ( is_inode_flag_set ( inode , FI_NO_EXTENT ) )
return - EOPNOTSUPP ;
map . m_lblk = 0 ;
map . m_next_pgofs = NULL ;
map . m_next_extent = & m_next_extent ;
map . m_seg_type = NO_CHECK_TYPE ;
2018-11-20 04:29:35 +08:00
map . m_may_create = false ;
2021-01-13 13:21:54 +08:00
end = max_file_blocks ( inode ) ;
2018-01-11 14:42:30 +08:00
while ( map . m_lblk < end ) {
map . m_len = end - map . m_lblk ;
2018-04-24 10:55:28 +08:00
down_write ( & fi - > i_gc_rwsem [ WRITE ] ) ;
2018-01-11 14:42:30 +08:00
err = f2fs_map_blocks ( inode , & map , 0 , F2FS_GET_BLOCK_PRECACHE ) ;
2018-04-24 10:55:28 +08:00
up_write ( & fi - > i_gc_rwsem [ WRITE ] ) ;
2018-01-11 14:42:30 +08:00
if ( err )
return err ;
map . m_lblk = m_next_extent ;
}
2021-05-15 11:09:41 -07:00
return 0 ;
2018-01-11 14:42:30 +08:00
}
static int f2fs_ioc_precache_extents ( struct file * filp , unsigned long arg )
{
return f2fs_precache_extents ( file_inode ( filp ) ) ;
}
2019-06-05 11:33:25 +08:00
static int f2fs_ioc_resize_fs ( struct file * filp , unsigned long arg )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( file_inode ( filp ) ) ;
__u64 block_count ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
if ( copy_from_user ( & block_count , ( void __user * ) arg ,
sizeof ( block_count ) ) )
return - EFAULT ;
2020-03-31 11:43:07 -07:00
return f2fs_resize_fs ( sbi , block_count ) ;
2019-06-05 11:33:25 +08:00
}
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
static int f2fs_ioc_enable_verity ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
f2fs_update_time ( F2FS_I_SB ( inode ) , REQ_TIME ) ;
if ( ! f2fs_sb_has_verity ( F2FS_I_SB ( inode ) ) ) {
f2fs_warn ( F2FS_I_SB ( inode ) ,
2021-05-26 13:05:36 -07:00
" Can't enable fs-verity on inode %lu: the verity feature is not enabled on this filesystem " ,
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
inode - > i_ino ) ;
return - EOPNOTSUPP ;
}
return fsverity_ioctl_enable ( filp , ( const void __user * ) arg ) ;
}
static int f2fs_ioc_measure_verity ( struct file * filp , unsigned long arg )
{
if ( ! f2fs_sb_has_verity ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fsverity_ioctl_measure ( filp , ( void __user * ) arg ) ;
}
fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl
Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity
metadata from a file that has fs-verity enabled, including:
- The Merkle tree
- The fsverity_descriptor (not including the signature if present)
- The built-in signature, if present
This ioctl has similar semantics to pread(). It is passed the type of
metadata to read (one of the above three), and a buffer, offset, and
size. It returns the number of bytes read or an error.
Separate patches will add support for each of the above metadata types.
This patch just adds the ioctl itself.
This ioctl doesn't make any assumption about where the metadata is
stored on-disk. It does assume the metadata is in a stable format, but
that's basically already the case:
- The Merkle tree and fsverity_descriptor are defined by how fs-verity
file digests are computed; see the "File digest computation" section
of Documentation/filesystems/fsverity.rst. Technically, the way in
which the levels of the tree are ordered relative to each other wasn't
previously specified, but it's logical to put the root level first.
- The built-in signature is the value passed to FS_IOC_ENABLE_VERITY.
This ioctl is useful because it allows writing a server program that
takes a verity file and serves it to a client program, such that the
client can do its own fs-verity compatible verification of the file.
This only makes sense if the client doesn't trust the server and if the
server needs to provide the storage for the client.
More concretely, there is interest in using this ability in Android to
export APK files (which are protected by fs-verity) to "protected VMs".
This would use Protected KVM (https://lwn.net/Articles/836693), which
provides an isolated execution environment without having to trust the
traditional "host". A "guest" VM can boot from a signed image and
perform specific tasks in a minimum trusted environment using files that
have fs-verity enabled on the host, without trusting the host or
requiring that the guest has its own trusted storage.
Technically, it would be possible to duplicate the metadata and store it
in separate files for serving. However, that would be less efficient
and would require extra care in userspace to maintain file consistency.
In addition to the above, the ability to read the built-in signatures is
useful because it allows a system that is using the in-kernel signature
verification to migrate to userspace signature verification.
Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org
Reviewed-by: Victor Hsieh <victorhsieh@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2021-01-15 10:18:16 -08:00
static int f2fs_ioc_read_verity_metadata ( struct file * filp , unsigned long arg )
{
if ( ! f2fs_sb_has_verity ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - EOPNOTSUPP ;
return fsverity_ioctl_read_metadata ( filp , ( const void __user * ) arg ) ;
}
2020-07-14 15:18:12 -07:00
static int f2fs_ioc_getfslabel ( struct file * filp , unsigned long arg )
2019-07-17 17:06:11 +08:00
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
char * vbuf ;
int count ;
int err = 0 ;
vbuf = f2fs_kzalloc ( sbi , MAX_VOLUME_NAME , GFP_KERNEL ) ;
if ( ! vbuf )
return - ENOMEM ;
down_read ( & sbi - > sb_lock ) ;
count = utf16s_to_utf8s ( sbi - > raw_super - > volume_name ,
ARRAY_SIZE ( sbi - > raw_super - > volume_name ) ,
UTF16_LITTLE_ENDIAN , vbuf , MAX_VOLUME_NAME ) ;
up_read ( & sbi - > sb_lock ) ;
if ( copy_to_user ( ( char __user * ) arg , vbuf ,
min ( FSLABEL_MAX , count ) ) )
err = - EFAULT ;
2020-09-14 16:47:00 +08:00
kfree ( vbuf ) ;
2019-07-17 17:06:11 +08:00
return err ;
}
2020-07-14 15:18:12 -07:00
static int f2fs_ioc_setfslabel ( struct file * filp , unsigned long arg )
2019-07-17 17:06:11 +08:00
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
char * vbuf ;
int err = 0 ;
if ( ! capable ( CAP_SYS_ADMIN ) )
return - EPERM ;
vbuf = strndup_user ( ( const char __user * ) arg , FSLABEL_MAX ) ;
if ( IS_ERR ( vbuf ) )
return PTR_ERR ( vbuf ) ;
err = mnt_want_write_file ( filp ) ;
if ( err )
goto out ;
down_write ( & sbi - > sb_lock ) ;
memset ( sbi - > raw_super - > volume_name , 0 ,
sizeof ( sbi - > raw_super - > volume_name ) ) ;
utf8s_to_utf16s ( vbuf , strlen ( vbuf ) , UTF16_LITTLE_ENDIAN ,
sbi - > raw_super - > volume_name ,
ARRAY_SIZE ( sbi - > raw_super - > volume_name ) ) ;
err = f2fs_commit_super ( sbi , false ) ;
up_write ( & sbi - > sb_lock ) ;
mnt_drop_write_file ( filp ) ;
out :
kfree ( vbuf ) ;
return err ;
}
2020-02-21 18:09:21 +08:00
static int f2fs_get_compress_blocks ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
__u64 blocks ;
if ( ! f2fs_sb_has_compression ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
if ( ! f2fs_compressed_file ( inode ) )
return - EINVAL ;
2020-09-08 11:44:10 +09:00
blocks = atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) ;
2020-02-21 18:09:21 +08:00
return put_user ( blocks , ( u64 __user * ) arg ) ;
}
2020-03-06 15:36:09 +08:00
static int release_compress_blocks ( struct dnode_of_data * dn , pgoff_t count )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( dn - > inode ) ;
unsigned int released_blocks = 0 ;
int cluster_size = F2FS_I ( dn - > inode ) - > i_cluster_size ;
block_t blkaddr ;
int i ;
for ( i = 0 ; i < count ; i + + ) {
blkaddr = data_blkaddr ( dn - > inode , dn - > node_page ,
dn - > ofs_in_node + i ) ;
if ( ! __is_valid_data_blkaddr ( blkaddr ) )
continue ;
if ( unlikely ( ! f2fs_is_valid_blkaddr ( sbi , blkaddr ,
DATA_GENERIC_ENHANCE ) ) )
return - EFSCORRUPTED ;
}
while ( count ) {
int compr_blocks = 0 ;
for ( i = 0 ; i < cluster_size ; i + + , dn - > ofs_in_node + + ) {
blkaddr = f2fs_data_blkaddr ( dn ) ;
if ( i = = 0 ) {
if ( blkaddr = = COMPRESS_ADDR )
continue ;
dn - > ofs_in_node + = cluster_size ;
goto next ;
}
if ( __is_valid_data_blkaddr ( blkaddr ) )
compr_blocks + + ;
if ( blkaddr ! = NEW_ADDR )
continue ;
dn - > data_blkaddr = NULL_ADDR ;
f2fs_set_data_blkaddr ( dn ) ;
}
f2fs_i_compr_blocks_update ( dn - > inode , compr_blocks , false ) ;
dec_valid_block_count ( sbi , dn - > inode ,
cluster_size - compr_blocks ) ;
released_blocks + = cluster_size - compr_blocks ;
next :
count - = cluster_size ;
}
return released_blocks ;
}
static int f2fs_release_compress_blocks ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
pgoff_t page_idx = 0 , last_idx ;
unsigned int released_blocks = 0 ;
int ret ;
int writecount ;
if ( ! f2fs_sb_has_compression ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
if ( ! f2fs_compressed_file ( inode ) )
return - EINVAL ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
inode_lock ( inode ) ;
writecount = atomic_read ( & inode - > i_writecount ) ;
2020-10-12 13:59:47 +09:00
if ( ( filp - > f_mode & FMODE_WRITE & & writecount ! = 1 ) | |
( ! ( filp - > f_mode & FMODE_WRITE ) & & writecount ) ) {
2020-03-06 15:36:09 +08:00
ret = - EBUSY ;
goto out ;
}
2021-05-25 11:39:35 -07:00
if ( is_inode_flag_set ( inode , FI_COMPRESS_RELEASED ) ) {
2020-03-06 15:36:09 +08:00
ret = - EINVAL ;
goto out ;
}
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 , LLONG_MAX ) ;
if ( ret )
goto out ;
2021-05-25 11:39:35 -07:00
set_inode_flag ( inode , FI_COMPRESS_RELEASED ) ;
2020-03-06 15:36:09 +08:00
inode - > i_ctime = current_time ( inode ) ;
f2fs_mark_inode_dirty_sync ( inode , true ) ;
2020-09-08 11:44:10 +09:00
if ( ! atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) )
2020-07-30 14:09:28 +09:00
goto out ;
2020-03-06 15:36:09 +08:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2020-03-06 15:36:09 +08:00
last_idx = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
while ( page_idx < last_idx ) {
struct dnode_of_data dn ;
pgoff_t end_offset , count ;
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
ret = f2fs_get_dnode_of_data ( & dn , page_idx , LOOKUP_NODE ) ;
if ( ret ) {
if ( ret = = - ENOENT ) {
page_idx = f2fs_get_next_page_offset ( & dn ,
page_idx ) ;
ret = 0 ;
continue ;
}
break ;
}
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
count = min ( end_offset - dn . ofs_in_node , last_idx - page_idx ) ;
2020-04-08 19:55:17 +08:00
count = round_up ( count , F2FS_I ( inode ) - > i_cluster_size ) ;
2020-03-06 15:36:09 +08:00
ret = release_compress_blocks ( & dn , count ) ;
f2fs_put_dnode ( & dn ) ;
if ( ret < 0 )
break ;
page_idx + = count ;
released_blocks + = ret ;
}
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2021-08-24 08:11:38 +08:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2020-03-06 15:36:09 +08:00
out :
inode_unlock ( inode ) ;
mnt_drop_write_file ( filp ) ;
if ( ret > = 0 ) {
ret = put_user ( released_blocks , ( u64 __user * ) arg ) ;
2020-09-08 11:44:10 +09:00
} else if ( released_blocks & &
atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) ) {
2020-03-06 15:36:09 +08:00
set_sbi_flag ( sbi , SBI_NEED_FSCK ) ;
f2fs_warn ( sbi , " %s: partial blocks were released i_ino=%lx "
2020-09-08 11:44:10 +09:00
" iblocks=%llu, released=%u, compr_blocks=%u, "
2020-03-06 15:36:09 +08:00
" run fsck to fix. " ,
__func__ , inode - > i_ino , inode - > i_blocks ,
released_blocks ,
2020-09-08 11:44:10 +09:00
atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) ) ;
2020-03-06 15:36:09 +08:00
}
return ret ;
}
2020-03-06 14:35:33 +08:00
static int reserve_compress_blocks ( struct dnode_of_data * dn , pgoff_t count )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( dn - > inode ) ;
unsigned int reserved_blocks = 0 ;
int cluster_size = F2FS_I ( dn - > inode ) - > i_cluster_size ;
block_t blkaddr ;
int i ;
for ( i = 0 ; i < count ; i + + ) {
blkaddr = data_blkaddr ( dn - > inode , dn - > node_page ,
dn - > ofs_in_node + i ) ;
if ( ! __is_valid_data_blkaddr ( blkaddr ) )
continue ;
if ( unlikely ( ! f2fs_is_valid_blkaddr ( sbi , blkaddr ,
DATA_GENERIC_ENHANCE ) ) )
return - EFSCORRUPTED ;
}
while ( count ) {
int compr_blocks = 0 ;
blkcnt_t reserved ;
int ret ;
for ( i = 0 ; i < cluster_size ; i + + , dn - > ofs_in_node + + ) {
blkaddr = f2fs_data_blkaddr ( dn ) ;
if ( i = = 0 ) {
if ( blkaddr = = COMPRESS_ADDR )
continue ;
dn - > ofs_in_node + = cluster_size ;
goto next ;
}
if ( __is_valid_data_blkaddr ( blkaddr ) ) {
compr_blocks + + ;
continue ;
}
dn - > data_blkaddr = NEW_ADDR ;
f2fs_set_data_blkaddr ( dn ) ;
}
reserved = cluster_size - compr_blocks ;
ret = inc_valid_block_count ( sbi , dn - > inode , & reserved ) ;
if ( ret )
return ret ;
if ( reserved ! = cluster_size - compr_blocks )
return - ENOSPC ;
f2fs_i_compr_blocks_update ( dn - > inode , compr_blocks , true ) ;
reserved_blocks + = reserved ;
next :
count - = cluster_size ;
}
return reserved_blocks ;
}
static int f2fs_reserve_compress_blocks ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
pgoff_t page_idx = 0 , last_idx ;
unsigned int reserved_blocks = 0 ;
int ret ;
if ( ! f2fs_sb_has_compression ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
if ( ! f2fs_compressed_file ( inode ) )
return - EINVAL ;
if ( f2fs_readonly ( sbi - > sb ) )
return - EROFS ;
ret = mnt_want_write_file ( filp ) ;
if ( ret )
return ret ;
2020-09-08 11:44:10 +09:00
if ( atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) )
2020-03-06 14:35:33 +08:00
goto out ;
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
inode_lock ( inode ) ;
2021-05-25 11:39:35 -07:00
if ( ! is_inode_flag_set ( inode , FI_COMPRESS_RELEASED ) ) {
2020-03-06 14:35:33 +08:00
ret = - EINVAL ;
goto unlock_inode ;
}
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( inode - > i_mapping ) ;
2020-03-06 14:35:33 +08:00
last_idx = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
while ( page_idx < last_idx ) {
struct dnode_of_data dn ;
pgoff_t end_offset , count ;
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
ret = f2fs_get_dnode_of_data ( & dn , page_idx , LOOKUP_NODE ) ;
if ( ret ) {
if ( ret = = - ENOENT ) {
page_idx = f2fs_get_next_page_offset ( & dn ,
page_idx ) ;
ret = 0 ;
continue ;
}
break ;
}
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
count = min ( end_offset - dn . ofs_in_node , last_idx - page_idx ) ;
2020-04-08 19:55:17 +08:00
count = round_up ( count , F2FS_I ( inode ) - > i_cluster_size ) ;
2020-03-06 14:35:33 +08:00
ret = reserve_compress_blocks ( & dn , count ) ;
f2fs_put_dnode ( & dn ) ;
if ( ret < 0 )
break ;
page_idx + = count ;
reserved_blocks + = ret ;
}
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
2021-08-24 08:11:38 +08:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2020-03-06 14:35:33 +08:00
if ( ret > = 0 ) {
2021-05-25 11:39:35 -07:00
clear_inode_flag ( inode , FI_COMPRESS_RELEASED ) ;
2020-03-06 14:35:33 +08:00
inode - > i_ctime = current_time ( inode ) ;
f2fs_mark_inode_dirty_sync ( inode , true ) ;
}
unlock_inode :
inode_unlock ( inode ) ;
out :
mnt_drop_write_file ( filp ) ;
if ( ret > = 0 ) {
ret = put_user ( reserved_blocks , ( u64 __user * ) arg ) ;
2020-09-08 11:44:10 +09:00
} else if ( reserved_blocks & &
atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) ) {
2020-03-06 14:35:33 +08:00
set_sbi_flag ( sbi , SBI_NEED_FSCK ) ;
f2fs_warn ( sbi , " %s: partial blocks were released i_ino=%lx "
2020-09-08 11:44:10 +09:00
" iblocks=%llu, reserved=%u, compr_blocks=%u, "
2020-03-06 14:35:33 +08:00
" run fsck to fix. " ,
__func__ , inode - > i_ino , inode - > i_blocks ,
reserved_blocks ,
2020-09-08 11:44:10 +09:00
atomic_read ( & F2FS_I ( inode ) - > i_compr_blocks ) ) ;
2020-03-06 14:35:33 +08:00
}
return ret ;
}
2020-07-21 12:21:11 +09:00
static int f2fs_secure_erase ( struct block_device * bdev , struct inode * inode ,
pgoff_t off , block_t block , block_t len , u32 flags )
{
struct request_queue * q = bdev_get_queue ( bdev ) ;
sector_t sector = SECTOR_FROM_BLOCK ( block ) ;
sector_t nr_sects = SECTOR_FROM_BLOCK ( len ) ;
int ret = 0 ;
if ( ! q )
return - ENXIO ;
if ( flags & F2FS_TRIM_FILE_DISCARD )
ret = blkdev_issue_discard ( bdev , sector , nr_sects , GFP_NOFS ,
blk_queue_secure_erase ( q ) ?
BLKDEV_DISCARD_SECURE : 0 ) ;
if ( ! ret & & ( flags & F2FS_TRIM_FILE_ZEROOUT ) ) {
if ( IS_ENCRYPTED ( inode ) )
ret = fscrypt_zeroout_range ( inode , off , block , len ) ;
else
ret = blkdev_issue_zeroout ( bdev , sector , nr_sects ,
GFP_NOFS , 0 ) ;
}
return ret ;
}
static int f2fs_sec_trim_file ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct address_space * mapping = inode - > i_mapping ;
struct block_device * prev_bdev = NULL ;
struct f2fs_sectrim_range range ;
pgoff_t index , pg_end , prev_index = 0 ;
block_t prev_block = 0 , len = 0 ;
loff_t end_addr ;
bool to_end = false ;
int ret = 0 ;
if ( ! ( filp - > f_mode & FMODE_WRITE ) )
return - EBADF ;
if ( copy_from_user ( & range , ( struct f2fs_sectrim_range __user * ) arg ,
sizeof ( range ) ) )
return - EFAULT ;
if ( range . flags = = 0 | | ( range . flags & ~ F2FS_TRIM_FILE_MASK ) | |
! S_ISREG ( inode - > i_mode ) )
return - EINVAL ;
if ( ( ( range . flags & F2FS_TRIM_FILE_DISCARD ) & &
! f2fs_hw_support_discard ( sbi ) ) | |
( ( range . flags & F2FS_TRIM_FILE_ZEROOUT ) & &
IS_ENCRYPTED ( inode ) & & f2fs_is_multi_device ( sbi ) ) )
return - EOPNOTSUPP ;
file_start_write ( filp ) ;
inode_lock ( inode ) ;
if ( f2fs_is_atomic_file ( inode ) | | f2fs_compressed_file ( inode ) | |
range . start > = inode - > i_size ) {
ret = - EINVAL ;
goto err ;
}
if ( range . len = = 0 )
goto err ;
if ( inode - > i_size - range . start > range . len ) {
end_addr = range . start + range . len ;
} else {
end_addr = range . len = = ( u64 ) - 1 ?
sbi - > sb - > s_maxbytes : inode - > i_size ;
to_end = true ;
}
if ( ! IS_ALIGNED ( range . start , F2FS_BLKSIZE ) | |
( ! to_end & & ! IS_ALIGNED ( end_addr , F2FS_BLKSIZE ) ) ) {
ret = - EINVAL ;
goto err ;
}
index = F2FS_BYTES_TO_BLK ( range . start ) ;
pg_end = DIV_ROUND_UP ( end_addr , F2FS_BLKSIZE ) ;
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
goto err ;
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-04-13 18:10:37 +02:00
filemap_invalidate_lock ( mapping ) ;
2020-07-21 12:21:11 +09:00
ret = filemap_write_and_wait_range ( mapping , range . start ,
to_end ? LLONG_MAX : end_addr - 1 ) ;
if ( ret )
goto out ;
truncate_inode_pages_range ( mapping , range . start ,
to_end ? - 1 : end_addr - 1 ) ;
while ( index < pg_end ) {
struct dnode_of_data dn ;
pgoff_t end_offset , count ;
int i ;
set_new_dnode ( & dn , inode , NULL , NULL , 0 ) ;
ret = f2fs_get_dnode_of_data ( & dn , index , LOOKUP_NODE ) ;
if ( ret ) {
if ( ret = = - ENOENT ) {
index = f2fs_get_next_page_offset ( & dn , index ) ;
continue ;
}
goto out ;
}
end_offset = ADDRS_PER_PAGE ( dn . node_page , inode ) ;
count = min ( end_offset - dn . ofs_in_node , pg_end - index ) ;
for ( i = 0 ; i < count ; i + + , index + + , dn . ofs_in_node + + ) {
struct block_device * cur_bdev ;
block_t blkaddr = f2fs_data_blkaddr ( & dn ) ;
if ( ! __is_valid_data_blkaddr ( blkaddr ) )
continue ;
if ( ! f2fs_is_valid_blkaddr ( sbi , blkaddr ,
DATA_GENERIC_ENHANCE ) ) {
ret = - EFSCORRUPTED ;
f2fs_put_dnode ( & dn ) ;
goto out ;
}
cur_bdev = f2fs_target_device ( sbi , blkaddr , NULL ) ;
if ( f2fs_is_multi_device ( sbi ) ) {
int di = f2fs_target_device_index ( sbi , blkaddr ) ;
blkaddr - = FDEV ( di ) . start_blk ;
}
if ( len ) {
if ( prev_bdev = = cur_bdev & &
index = = prev_index + len & &
blkaddr = = prev_block + len ) {
len + + ;
} else {
ret = f2fs_secure_erase ( prev_bdev ,
inode , prev_index , prev_block ,
len , range . flags ) ;
if ( ret ) {
f2fs_put_dnode ( & dn ) ;
goto out ;
}
len = 0 ;
}
}
if ( ! len ) {
prev_bdev = cur_bdev ;
prev_index = index ;
prev_block = blkaddr ;
len = 1 ;
}
}
f2fs_put_dnode ( & dn ) ;
if ( fatal_signal_pending ( current ) ) {
ret = - EINTR ;
goto out ;
}
cond_resched ( ) ;
}
if ( len )
ret = f2fs_secure_erase ( prev_bdev , inode , prev_index ,
prev_block , len , range . flags ) ;
out :
2021-04-13 18:10:37 +02:00
filemap_invalidate_unlock ( mapping ) ;
2020-07-21 12:21:11 +09:00
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
err :
inode_unlock ( inode ) ;
file_end_write ( filp ) ;
return ret ;
}
2020-10-30 13:10:34 +09:00
static int f2fs_ioc_get_compress_option ( struct file * filp , unsigned long arg )
2014-09-24 15:37:02 -07:00
{
2020-10-30 13:10:34 +09:00
struct inode * inode = file_inode ( filp ) ;
struct f2fs_comp_option option ;
if ( ! f2fs_sb_has_compression ( F2FS_I_SB ( inode ) ) )
return - EOPNOTSUPP ;
inode_lock_shared ( inode ) ;
if ( ! f2fs_compressed_file ( inode ) ) {
inode_unlock_shared ( inode ) ;
return - ENODATA ;
}
option . algorithm = F2FS_I ( inode ) - > i_compress_algorithm ;
option . log_cluster_size = F2FS_I ( inode ) - > i_log_cluster_size ;
inode_unlock_shared ( inode ) ;
if ( copy_to_user ( ( struct f2fs_comp_option __user * ) arg , & option ,
sizeof ( option ) ) )
return - EFAULT ;
return 0 ;
}
2020-10-30 13:10:35 +09:00
static int f2fs_ioc_set_compress_option ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct f2fs_comp_option option ;
int ret = 0 ;
2017-10-23 23:48:49 +02:00
2020-10-30 13:10:35 +09:00
if ( ! f2fs_sb_has_compression ( sbi ) )
return - EOPNOTSUPP ;
if ( ! ( filp - > f_mode & FMODE_WRITE ) )
return - EBADF ;
if ( copy_from_user ( & option , ( struct f2fs_comp_option __user * ) arg ,
sizeof ( option ) ) )
return - EFAULT ;
if ( ! f2fs_compressed_file ( inode ) | |
option . log_cluster_size < MIN_COMPRESS_LOG_SIZE | |
option . log_cluster_size > MAX_COMPRESS_LOG_SIZE | |
option . algorithm > = COMPRESS_MAX )
return - EINVAL ;
file_start_write ( filp ) ;
inode_lock ( inode ) ;
if ( f2fs_is_mmap_file ( inode ) | | get_dirty_pages ( inode ) ) {
ret = - EBUSY ;
goto out ;
}
if ( inode - > i_size ! = 0 ) {
ret = - EFBIG ;
goto out ;
}
F2FS_I ( inode ) - > i_compress_algorithm = option . algorithm ;
F2FS_I ( inode ) - > i_log_cluster_size = option . log_cluster_size ;
F2FS_I ( inode ) - > i_cluster_size = 1 < < option . log_cluster_size ;
f2fs_mark_inode_dirty_sync ( inode , true ) ;
if ( ! f2fs_is_compress_backend_ready ( inode ) )
f2fs_warn ( sbi , " compression algorithm is successfully set, "
" but current kernel doesn't support this algorithm. " ) ;
out :
inode_unlock ( inode ) ;
file_end_write ( filp ) ;
return ret ;
}
2020-12-03 15:56:15 +09:00
static int redirty_blocks ( struct inode * inode , pgoff_t page_idx , int len )
{
2021-04-07 21:18:55 +01:00
DEFINE_READAHEAD ( ractl , NULL , NULL , inode - > i_mapping , page_idx ) ;
2020-12-03 15:56:15 +09:00
struct address_space * mapping = inode - > i_mapping ;
struct page * page ;
pgoff_t redirty_idx = page_idx ;
int i , page_len = 0 , ret = 0 ;
page_cache_ra_unbounded ( & ractl , len , 0 ) ;
for ( i = 0 ; i < len ; i + + , page_idx + + ) {
page = read_cache_page ( mapping , page_idx , NULL , NULL ) ;
if ( IS_ERR ( page ) ) {
ret = PTR_ERR ( page ) ;
break ;
}
page_len + + ;
}
for ( i = 0 ; i < page_len ; i + + , redirty_idx + + ) {
page = find_lock_page ( mapping , redirty_idx ) ;
2021-01-06 08:49:28 +09:00
if ( ! page ) {
ret = - ENOMEM ;
break ;
}
2020-12-03 15:56:15 +09:00
set_page_dirty ( page ) ;
f2fs_put_page ( page , 1 ) ;
f2fs_put_page ( page , 0 ) ;
}
return ret ;
}
static int f2fs_ioc_decompress_file ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
pgoff_t page_idx = 0 , last_idx ;
unsigned int blk_per_seg = sbi - > blocks_per_seg ;
int cluster_size = F2FS_I ( inode ) - > i_cluster_size ;
int count , ret ;
if ( ! f2fs_sb_has_compression ( sbi ) | |
F2FS_OPTION ( sbi ) . compress_mode ! = COMPR_MODE_USER )
return - EOPNOTSUPP ;
if ( ! ( filp - > f_mode & FMODE_WRITE ) )
return - EBADF ;
if ( ! f2fs_compressed_file ( inode ) )
return - EINVAL ;
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
file_start_write ( filp ) ;
inode_lock ( inode ) ;
if ( ! f2fs_is_compress_backend_ready ( inode ) ) {
ret = - EOPNOTSUPP ;
goto out ;
}
if ( f2fs_is_mmap_file ( inode ) ) {
ret = - EBUSY ;
goto out ;
}
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 , LLONG_MAX ) ;
if ( ret )
goto out ;
if ( ! atomic_read ( & fi - > i_compr_blocks ) )
goto out ;
last_idx = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
count = last_idx - page_idx ;
while ( count ) {
int len = min ( cluster_size , count ) ;
ret = redirty_blocks ( inode , page_idx , len ) ;
if ( ret < 0 )
break ;
if ( get_dirty_pages ( inode ) > = blk_per_seg )
filemap_fdatawrite ( inode - > i_mapping ) ;
count - = len ;
page_idx + = len ;
}
if ( ! ret )
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 ,
LLONG_MAX ) ;
if ( ret )
2021-05-26 13:05:36 -07:00
f2fs_warn ( sbi , " %s: The file might be partially decompressed (errno=%d). Please delete the file. " ,
__func__ , ret ) ;
2020-12-03 15:56:15 +09:00
out :
inode_unlock ( inode ) ;
file_end_write ( filp ) ;
return ret ;
}
static int f2fs_ioc_compress_file ( struct file * filp , unsigned long arg )
{
struct inode * inode = file_inode ( filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
pgoff_t page_idx = 0 , last_idx ;
unsigned int blk_per_seg = sbi - > blocks_per_seg ;
int cluster_size = F2FS_I ( inode ) - > i_cluster_size ;
int count , ret ;
if ( ! f2fs_sb_has_compression ( sbi ) | |
F2FS_OPTION ( sbi ) . compress_mode ! = COMPR_MODE_USER )
return - EOPNOTSUPP ;
if ( ! ( filp - > f_mode & FMODE_WRITE ) )
return - EBADF ;
if ( ! f2fs_compressed_file ( inode ) )
return - EINVAL ;
f2fs_balance_fs ( F2FS_I_SB ( inode ) , true ) ;
file_start_write ( filp ) ;
inode_lock ( inode ) ;
if ( ! f2fs_is_compress_backend_ready ( inode ) ) {
ret = - EOPNOTSUPP ;
goto out ;
}
if ( f2fs_is_mmap_file ( inode ) ) {
ret = - EBUSY ;
goto out ;
}
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 , LLONG_MAX ) ;
if ( ret )
goto out ;
set_inode_flag ( inode , FI_ENABLE_COMPRESS ) ;
last_idx = DIV_ROUND_UP ( i_size_read ( inode ) , PAGE_SIZE ) ;
count = last_idx - page_idx ;
while ( count ) {
int len = min ( cluster_size , count ) ;
ret = redirty_blocks ( inode , page_idx , len ) ;
if ( ret < 0 )
break ;
if ( get_dirty_pages ( inode ) > = blk_per_seg )
filemap_fdatawrite ( inode - > i_mapping ) ;
count - = len ;
page_idx + = len ;
}
if ( ! ret )
ret = filemap_write_and_wait_range ( inode - > i_mapping , 0 ,
LLONG_MAX ) ;
clear_inode_flag ( inode , FI_ENABLE_COMPRESS ) ;
if ( ret )
2021-05-26 13:05:36 -07:00
f2fs_warn ( sbi , " %s: The file might be partially compressed (errno=%d). Please delete the file. " ,
__func__ , ret ) ;
2020-12-03 15:56:15 +09:00
out :
inode_unlock ( inode ) ;
file_end_write ( filp ) ;
return ret ;
}
2020-11-10 09:24:37 +08:00
static long __f2fs_ioctl ( struct file * filp , unsigned int cmd , unsigned long arg )
2014-09-24 15:37:02 -07:00
{
switch ( cmd ) {
2020-07-14 15:18:12 -07:00
case FS_IOC_GETVERSION :
2015-01-23 20:36:04 +08:00
return f2fs_ioc_getversion ( filp , arg ) ;
2014-10-06 17:39:50 -07:00
case F2FS_IOC_START_ATOMIC_WRITE :
return f2fs_ioc_start_atomic_write ( filp ) ;
case F2FS_IOC_COMMIT_ATOMIC_WRITE :
return f2fs_ioc_commit_atomic_write ( filp ) ;
2014-10-06 16:11:16 -07:00
case F2FS_IOC_START_VOLATILE_WRITE :
return f2fs_ioc_start_volatile_write ( filp ) ;
2014-12-09 06:08:59 -08:00
case F2FS_IOC_RELEASE_VOLATILE_WRITE :
return f2fs_ioc_release_volatile_write ( filp ) ;
case F2FS_IOC_ABORT_VOLATILE_WRITE :
return f2fs_ioc_abort_volatile_write ( filp ) ;
2015-01-08 19:15:53 -08:00
case F2FS_IOC_SHUTDOWN :
return f2fs_ioc_shutdown ( filp , arg ) ;
2014-09-24 15:37:02 -07:00
case FITRIM :
return f2fs_ioc_fitrim ( filp , arg ) ;
2020-07-14 15:18:12 -07:00
case FS_IOC_SET_ENCRYPTION_POLICY :
2015-04-20 15:19:06 -07:00
return f2fs_ioc_set_encryption_policy ( filp , arg ) ;
2020-07-14 15:18:12 -07:00
case FS_IOC_GET_ENCRYPTION_POLICY :
2015-04-20 15:19:06 -07:00
return f2fs_ioc_get_encryption_policy ( filp , arg ) ;
2020-07-14 15:18:12 -07:00
case FS_IOC_GET_ENCRYPTION_PWSALT :
2015-04-20 15:19:06 -07:00
return f2fs_ioc_get_encryption_pwsalt ( filp , arg ) ;
2019-08-04 19:35:48 -07:00
case FS_IOC_GET_ENCRYPTION_POLICY_EX :
return f2fs_ioc_get_encryption_policy_ex ( filp , arg ) ;
case FS_IOC_ADD_ENCRYPTION_KEY :
return f2fs_ioc_add_encryption_key ( filp , arg ) ;
case FS_IOC_REMOVE_ENCRYPTION_KEY :
return f2fs_ioc_remove_encryption_key ( filp , arg ) ;
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS :
return f2fs_ioc_remove_encryption_key_all_users ( filp , arg ) ;
case FS_IOC_GET_ENCRYPTION_KEY_STATUS :
return f2fs_ioc_get_encryption_key_status ( filp , arg ) ;
2020-03-14 13:50:51 -07:00
case FS_IOC_GET_ENCRYPTION_NONCE :
return f2fs_ioc_get_encryption_nonce ( filp , arg ) ;
2015-07-10 18:08:10 +08:00
case F2FS_IOC_GARBAGE_COLLECT :
return f2fs_ioc_gc ( filp , arg ) ;
2017-06-15 16:44:42 -07:00
case F2FS_IOC_GARBAGE_COLLECT_RANGE :
return f2fs_ioc_gc_range ( filp , arg ) ;
2015-10-05 22:24:19 +08:00
case F2FS_IOC_WRITE_CHECKPOINT :
2018-07-17 20:41:49 +08:00
return f2fs_ioc_write_checkpoint ( filp , arg ) ;
2015-10-27 09:53:45 +08:00
case F2FS_IOC_DEFRAGMENT :
return f2fs_ioc_defragment ( filp , arg ) ;
2016-07-08 15:16:47 -07:00
case F2FS_IOC_MOVE_RANGE :
return f2fs_ioc_move_range ( filp , arg ) ;
2017-04-13 15:17:00 -07:00
case F2FS_IOC_FLUSH_DEVICE :
return f2fs_ioc_flush_device ( filp , arg ) ;
2017-07-21 12:58:59 -07:00
case F2FS_IOC_GET_FEATURES :
return f2fs_ioc_get_features ( filp , arg ) ;
2017-12-07 16:25:39 -08:00
case F2FS_IOC_GET_PIN_FILE :
return f2fs_ioc_get_pin_file ( filp , arg ) ;
case F2FS_IOC_SET_PIN_FILE :
return f2fs_ioc_set_pin_file ( filp , arg ) ;
2018-01-11 14:42:30 +08:00
case F2FS_IOC_PRECACHE_EXTENTS :
return f2fs_ioc_precache_extents ( filp , arg ) ;
2019-06-05 11:33:25 +08:00
case F2FS_IOC_RESIZE_FS :
return f2fs_ioc_resize_fs ( filp , arg ) ;
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
case FS_IOC_ENABLE_VERITY :
return f2fs_ioc_enable_verity ( filp , arg ) ;
case FS_IOC_MEASURE_VERITY :
return f2fs_ioc_measure_verity ( filp , arg ) ;
fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl
Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity
metadata from a file that has fs-verity enabled, including:
- The Merkle tree
- The fsverity_descriptor (not including the signature if present)
- The built-in signature, if present
This ioctl has similar semantics to pread(). It is passed the type of
metadata to read (one of the above three), and a buffer, offset, and
size. It returns the number of bytes read or an error.
Separate patches will add support for each of the above metadata types.
This patch just adds the ioctl itself.
This ioctl doesn't make any assumption about where the metadata is
stored on-disk. It does assume the metadata is in a stable format, but
that's basically already the case:
- The Merkle tree and fsverity_descriptor are defined by how fs-verity
file digests are computed; see the "File digest computation" section
of Documentation/filesystems/fsverity.rst. Technically, the way in
which the levels of the tree are ordered relative to each other wasn't
previously specified, but it's logical to put the root level first.
- The built-in signature is the value passed to FS_IOC_ENABLE_VERITY.
This ioctl is useful because it allows writing a server program that
takes a verity file and serves it to a client program, such that the
client can do its own fs-verity compatible verification of the file.
This only makes sense if the client doesn't trust the server and if the
server needs to provide the storage for the client.
More concretely, there is interest in using this ability in Android to
export APK files (which are protected by fs-verity) to "protected VMs".
This would use Protected KVM (https://lwn.net/Articles/836693), which
provides an isolated execution environment without having to trust the
traditional "host". A "guest" VM can boot from a signed image and
perform specific tasks in a minimum trusted environment using files that
have fs-verity enabled on the host, without trusting the host or
requiring that the guest has its own trusted storage.
Technically, it would be possible to duplicate the metadata and store it
in separate files for serving. However, that would be less efficient
and would require extra care in userspace to maintain file consistency.
In addition to the above, the ability to read the built-in signatures is
useful because it allows a system that is using the in-kernel signature
verification to migrate to userspace signature verification.
Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org
Reviewed-by: Victor Hsieh <victorhsieh@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2021-01-15 10:18:16 -08:00
case FS_IOC_READ_VERITY_METADATA :
return f2fs_ioc_read_verity_metadata ( filp , arg ) ;
2020-07-14 15:18:12 -07:00
case FS_IOC_GETFSLABEL :
return f2fs_ioc_getfslabel ( filp , arg ) ;
case FS_IOC_SETFSLABEL :
return f2fs_ioc_setfslabel ( filp , arg ) ;
2020-02-21 18:09:21 +08:00
case F2FS_IOC_GET_COMPRESS_BLOCKS :
return f2fs_get_compress_blocks ( filp , arg ) ;
2020-03-06 15:36:09 +08:00
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS :
return f2fs_release_compress_blocks ( filp , arg ) ;
2020-03-06 14:35:33 +08:00
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS :
return f2fs_reserve_compress_blocks ( filp , arg ) ;
2020-07-21 12:21:11 +09:00
case F2FS_IOC_SEC_TRIM_FILE :
return f2fs_sec_trim_file ( filp , arg ) ;
2020-10-30 13:10:34 +09:00
case F2FS_IOC_GET_COMPRESS_OPTION :
return f2fs_ioc_get_compress_option ( filp , arg ) ;
2020-10-30 13:10:35 +09:00
case F2FS_IOC_SET_COMPRESS_OPTION :
return f2fs_ioc_set_compress_option ( filp , arg ) ;
2020-12-03 15:56:15 +09:00
case F2FS_IOC_DECOMPRESS_FILE :
return f2fs_ioc_decompress_file ( filp , arg ) ;
case F2FS_IOC_COMPRESS_FILE :
return f2fs_ioc_compress_file ( filp , arg ) ;
2012-11-02 17:09:44 +09:00
default :
return - ENOTTY ;
}
}
2020-11-10 09:24:37 +08:00
long f2fs_ioctl ( struct file * filp , unsigned int cmd , unsigned long arg )
{
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( file_inode ( filp ) ) ) ) )
return - EIO ;
if ( ! f2fs_is_checkpoint_ready ( F2FS_I_SB ( file_inode ( filp ) ) ) )
return - ENOSPC ;
return __f2fs_ioctl ( filp , cmd , arg ) ;
}
2021-07-23 00:59:21 -07:00
/*
* Return % true if the given read or write request should use direct I / O , or
* % false if it should use buffered I / O .
*/
static bool f2fs_should_use_dio ( struct inode * inode , struct kiocb * iocb ,
struct iov_iter * iter )
{
unsigned int align ;
if ( ! ( iocb - > ki_flags & IOCB_DIRECT ) )
return false ;
if ( f2fs_force_buffered_io ( inode , iocb , iter ) )
return false ;
/*
* Direct I / O not aligned to the disk ' s logical_block_size will be
* attempted , but will fail with - EINVAL .
*
* f2fs additionally requires that direct I / O be aligned to the
* filesystem block size , which is often a stricter requirement .
* However , f2fs traditionally falls back to buffered I / O on requests
* that are logical_block_size - aligned but not fs - block aligned .
*
* The below logic implements this behavior .
*/
align = iocb - > ki_pos | iov_iter_alignment ( iter ) ;
if ( ! IS_ALIGNED ( align , i_blocksize ( inode ) ) & &
IS_ALIGNED ( align , bdev_logical_block_size ( inode - > i_sb - > s_bdev ) ) )
return false ;
return true ;
}
static int f2fs_dio_read_end_io ( struct kiocb * iocb , ssize_t size , int error ,
unsigned int flags )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( file_inode ( iocb - > ki_filp ) ) ;
dec_page_count ( sbi , F2FS_DIO_READ ) ;
if ( error )
return error ;
f2fs_update_iostat ( sbi , APP_DIRECT_READ_IO , size ) ;
return 0 ;
}
static const struct iomap_dio_ops f2fs_iomap_dio_read_ops = {
. end_io = f2fs_dio_read_end_io ,
} ;
static ssize_t f2fs_dio_read_iter ( struct kiocb * iocb , struct iov_iter * to )
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
{
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
2021-07-23 00:59:21 -07:00
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
const loff_t pos = iocb - > ki_pos ;
const size_t count = iov_iter_count ( to ) ;
struct iomap_dio * dio ;
ssize_t ret ;
if ( count = = 0 )
return 0 ; /* skip atime update */
trace_f2fs_direct_IO_enter ( inode , pos , count , READ ) ;
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
if ( ! down_read_trylock ( & fi - > i_gc_rwsem [ READ ] ) ) {
ret = - EAGAIN ;
goto out ;
}
} else {
down_read ( & fi - > i_gc_rwsem [ READ ] ) ;
}
/*
* We have to use __iomap_dio_rw ( ) and iomap_dio_complete ( ) instead of
* the higher - level function iomap_dio_rw ( ) in order to ensure that the
* F2FS_DIO_READ counter will be decremented correctly in all cases .
*/
inc_page_count ( sbi , F2FS_DIO_READ ) ;
dio = __iomap_dio_rw ( iocb , to , & f2fs_iomap_ops ,
& f2fs_iomap_dio_read_ops , 0 , 0 ) ;
if ( IS_ERR_OR_NULL ( dio ) ) {
ret = PTR_ERR_OR_ZERO ( dio ) ;
if ( ret ! = - EIOCBQUEUED )
dec_page_count ( sbi , F2FS_DIO_READ ) ;
} else {
ret = iomap_dio_complete ( dio ) ;
}
up_read ( & fi - > i_gc_rwsem [ READ ] ) ;
file_accessed ( file ) ;
out :
trace_f2fs_direct_IO_exit ( inode , pos , count , READ , ret ) ;
return ret ;
}
static ssize_t f2fs_file_read_iter ( struct kiocb * iocb , struct iov_iter * to )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
ssize_t ret ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
if ( ! f2fs_is_compress_backend_ready ( inode ) )
return - EOPNOTSUPP ;
2021-07-23 00:59:21 -07:00
if ( f2fs_should_use_dio ( inode , iocb , to ) )
return f2fs_dio_read_iter ( iocb , to ) ;
2020-04-16 18:16:56 +08:00
2021-07-23 00:59:21 -07:00
ret = filemap_read ( iocb , to , 0 ) ;
2020-04-16 18:16:56 +08:00
if ( ret > 0 )
2021-07-23 00:59:21 -07:00
f2fs_update_iostat ( F2FS_I_SB ( inode ) , APP_BUFFERED_READ_IO , ret ) ;
2020-04-16 18:16:56 +08:00
return ret ;
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
}
2021-07-23 00:59:21 -07:00
static ssize_t f2fs_write_checks ( struct kiocb * iocb , struct iov_iter * from )
{
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
ssize_t count ;
int err ;
if ( IS_IMMUTABLE ( inode ) )
return - EPERM ;
if ( is_inode_flag_set ( inode , FI_COMPRESS_RELEASED ) )
return - EPERM ;
count = generic_write_checks ( iocb , from ) ;
if ( count < = 0 )
return count ;
err = file_modified ( file ) ;
if ( err )
return err ;
return count ;
}
2021-07-16 09:39:13 -05:00
/*
* Preallocate blocks for a write request , if it is possible and helpful to do
* so . Returns a positive number if blocks may have been preallocated , 0 if no
* blocks were preallocated , or a negative errno value if something went
* seriously wrong . Also sets FI_PREALLOCATED_ALL on the inode if * all * the
* requested blocks ( not just some of them ) have been allocated .
*/
2021-07-23 00:59:21 -07:00
static int f2fs_preallocate_blocks ( struct kiocb * iocb , struct iov_iter * iter ,
bool dio )
2021-07-16 09:39:13 -05:00
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
const loff_t pos = iocb - > ki_pos ;
const size_t count = iov_iter_count ( iter ) ;
struct f2fs_map_blocks map = { } ;
int flag ;
int ret ;
/* If it will be an out-of-place direct write, don't bother. */
if ( dio & & f2fs_lfs_mode ( sbi ) )
return 0 ;
2021-11-12 14:31:16 -08:00
/*
* Don ' t preallocate holes aligned to DIO_SKIP_HOLES which turns into
* buffered IO , if DIO meets any holes .
*/
if ( dio & & i_size_read ( inode ) & &
( F2FS_BYTES_TO_BLK ( pos ) < F2FS_BLK_ALIGN ( i_size_read ( inode ) ) ) )
return 0 ;
2021-07-16 09:39:13 -05:00
/* No-wait I/O can't allocate blocks. */
if ( iocb - > ki_flags & IOCB_NOWAIT )
return 0 ;
/* If it will be a short write, don't bother. */
if ( fault_in_iov_iter_readable ( iter , count ) )
return 0 ;
if ( f2fs_has_inline_data ( inode ) ) {
/* If the data will fit inline, don't bother. */
if ( pos + count < = MAX_INLINE_DATA ( inode ) )
return 0 ;
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
return ret ;
}
/* Do not preallocate blocks that will be written partially in 4KB. */
map . m_lblk = F2FS_BLK_ALIGN ( pos ) ;
map . m_len = F2FS_BYTES_TO_BLK ( pos + count ) ;
if ( map . m_len > map . m_lblk )
map . m_len - = map . m_lblk ;
else
map . m_len = 0 ;
map . m_may_create = true ;
if ( dio ) {
map . m_seg_type = f2fs_rw_hint_to_seg_type ( inode - > i_write_hint ) ;
flag = F2FS_GET_BLOCK_PRE_DIO ;
} else {
map . m_seg_type = NO_CHECK_TYPE ;
flag = F2FS_GET_BLOCK_PRE_AIO ;
}
ret = f2fs_map_blocks ( inode , & map , 1 , flag ) ;
2021-11-12 14:31:16 -08:00
/* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */
if ( ret < 0 & & ! ( ( ret = = - ENOSPC | | ret = = - EDQUOT ) & & map . m_len > 0 ) )
2021-07-16 09:39:13 -05:00
return ret ;
if ( ret = = 0 )
set_inode_flag ( inode , FI_PREALLOCATED_ALL ) ;
return map . m_len ;
}
2021-07-23 00:59:21 -07:00
static ssize_t f2fs_buffered_write_iter ( struct kiocb * iocb ,
struct iov_iter * from )
2015-04-21 20:39:58 -07:00
{
2016-02-03 13:09:09 -08:00
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
2021-07-23 00:59:21 -07:00
ssize_t ret ;
if ( iocb - > ki_flags & IOCB_NOWAIT )
return - EOPNOTSUPP ;
current - > backing_dev_info = inode_to_bdi ( inode ) ;
ret = generic_perform_write ( file , from , iocb - > ki_pos ) ;
current - > backing_dev_info = NULL ;
if ( ret > 0 ) {
iocb - > ki_pos + = ret ;
f2fs_update_iostat ( F2FS_I_SB ( inode ) , APP_BUFFERED_IO , ret ) ;
}
return ret ;
}
static int f2fs_dio_write_end_io ( struct kiocb * iocb , ssize_t size , int error ,
unsigned int flags )
{
struct f2fs_sb_info * sbi = F2FS_I_SB ( file_inode ( iocb - > ki_filp ) ) ;
dec_page_count ( sbi , F2FS_DIO_WRITE ) ;
if ( error )
return error ;
f2fs_update_iostat ( sbi , APP_DIRECT_IO , size ) ;
return 0 ;
}
static const struct iomap_dio_ops f2fs_iomap_dio_write_ops = {
. end_io = f2fs_dio_write_end_io ,
} ;
static ssize_t f2fs_dio_write_iter ( struct kiocb * iocb , struct iov_iter * from ,
bool * may_need_sync )
{
struct file * file = iocb - > ki_filp ;
struct inode * inode = file_inode ( file ) ;
struct f2fs_inode_info * fi = F2FS_I ( inode ) ;
struct f2fs_sb_info * sbi = F2FS_I_SB ( inode ) ;
const bool do_opu = f2fs_lfs_mode ( sbi ) ;
const int whint_mode = F2FS_OPTION ( sbi ) . whint_mode ;
const loff_t pos = iocb - > ki_pos ;
const ssize_t count = iov_iter_count ( from ) ;
const enum rw_hint hint = iocb - > ki_hint ;
unsigned int dio_flags ;
struct iomap_dio * dio ;
ssize_t ret ;
trace_f2fs_direct_IO_enter ( inode , pos , count , WRITE ) ;
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
/* f2fs_convert_inline_inode() and block allocation can block */
if ( f2fs_has_inline_data ( inode ) | |
! f2fs_overwrite_io ( inode , pos , count ) ) {
ret = - EAGAIN ;
goto out ;
}
if ( ! down_read_trylock ( & fi - > i_gc_rwsem [ WRITE ] ) ) {
ret = - EAGAIN ;
goto out ;
}
if ( do_opu & & ! down_read_trylock ( & fi - > i_gc_rwsem [ READ ] ) ) {
up_read ( & fi - > i_gc_rwsem [ WRITE ] ) ;
ret = - EAGAIN ;
goto out ;
}
} else {
ret = f2fs_convert_inline_inode ( inode ) ;
if ( ret )
goto out ;
down_read ( & fi - > i_gc_rwsem [ WRITE ] ) ;
if ( do_opu )
down_read ( & fi - > i_gc_rwsem [ READ ] ) ;
}
if ( whint_mode = = WHINT_MODE_OFF )
iocb - > ki_hint = WRITE_LIFE_NOT_SET ;
/*
* We have to use __iomap_dio_rw ( ) and iomap_dio_complete ( ) instead of
* the higher - level function iomap_dio_rw ( ) in order to ensure that the
* F2FS_DIO_WRITE counter will be decremented correctly in all cases .
*/
inc_page_count ( sbi , F2FS_DIO_WRITE ) ;
dio_flags = 0 ;
if ( pos + count > inode - > i_size )
dio_flags | = IOMAP_DIO_FORCE_WAIT ;
dio = __iomap_dio_rw ( iocb , from , & f2fs_iomap_ops ,
& f2fs_iomap_dio_write_ops , dio_flags , 0 ) ;
if ( IS_ERR_OR_NULL ( dio ) ) {
ret = PTR_ERR_OR_ZERO ( dio ) ;
if ( ret = = - ENOTBLK )
ret = 0 ;
if ( ret ! = - EIOCBQUEUED )
dec_page_count ( sbi , F2FS_DIO_WRITE ) ;
} else {
ret = iomap_dio_complete ( dio ) ;
}
if ( whint_mode = = WHINT_MODE_OFF )
iocb - > ki_hint = hint ;
if ( do_opu )
up_read ( & fi - > i_gc_rwsem [ READ ] ) ;
up_read ( & fi - > i_gc_rwsem [ WRITE ] ) ;
if ( ret < 0 )
goto out ;
if ( pos + ret > inode - > i_size )
f2fs_i_size_write ( inode , pos + ret ) ;
if ( ! do_opu )
set_inode_flag ( inode , FI_UPDATE_WRITE ) ;
if ( iov_iter_count ( from ) ) {
ssize_t ret2 ;
loff_t bufio_start_pos = iocb - > ki_pos ;
/*
* The direct write was partial , so we need to fall back to a
* buffered write for the remainder .
*/
ret2 = f2fs_buffered_write_iter ( iocb , from ) ;
if ( iov_iter_count ( from ) )
f2fs_write_failed ( inode , iocb - > ki_pos ) ;
if ( ret2 < 0 )
goto out ;
/*
* Ensure that the pagecache pages are written to disk and
* invalidated to preserve the expected O_DIRECT semantics .
*/
if ( ret2 > 0 ) {
loff_t bufio_end_pos = bufio_start_pos + ret2 - 1 ;
ret + = ret2 ;
ret2 = filemap_write_and_wait_range ( file - > f_mapping ,
bufio_start_pos ,
bufio_end_pos ) ;
if ( ret2 < 0 )
goto out ;
invalidate_mapping_pages ( file - > f_mapping ,
bufio_start_pos > > PAGE_SHIFT ,
bufio_end_pos > > PAGE_SHIFT ) ;
}
} else {
/* iomap_dio_rw() already handled the generic_write_sync(). */
* may_need_sync = false ;
}
out :
trace_f2fs_direct_IO_exit ( inode , pos , count , WRITE , ret ) ;
return ret ;
}
static ssize_t f2fs_file_write_iter ( struct kiocb * iocb , struct iov_iter * from )
{
struct inode * inode = file_inode ( iocb - > ki_filp ) ;
2021-07-16 09:39:15 -05:00
const loff_t orig_pos = iocb - > ki_pos ;
const size_t orig_count = iov_iter_count ( from ) ;
2021-07-16 09:39:13 -05:00
loff_t target_size ;
2021-07-23 00:59:21 -07:00
bool dio ;
bool may_need_sync = true ;
2021-07-16 09:39:13 -05:00
int preallocated ;
2016-02-03 13:09:09 -08:00
ssize_t ret ;
2015-04-21 20:39:58 -07:00
2019-04-02 18:52:22 +08:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( inode ) ) ) ) {
ret = - EIO ;
goto out ;
}
2017-10-23 23:48:49 +02:00
2020-02-24 19:20:15 +08:00
if ( ! f2fs_is_compress_backend_ready ( inode ) ) {
ret = - EOPNOTSUPP ;
goto out ;
}
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
2019-09-11 11:45:17 -05:00
if ( iocb - > ki_flags & IOCB_NOWAIT ) {
if ( ! inode_trylock ( inode ) ) {
2019-04-02 18:52:22 +08:00
ret = - EAGAIN ;
goto out ;
}
2019-09-11 11:45:17 -05:00
} else {
2018-03-08 19:34:38 +09:00
inode_lock ( inode ) ;
}
2021-07-23 00:59:21 -07:00
ret = f2fs_write_checks ( iocb , from ) ;
2021-07-16 09:39:14 -05:00
if ( ret < = 0 )
goto out_unlock ;
2021-07-23 00:59:21 -07:00
/* Determine whether we will do a direct write or a buffered write. */
dio = f2fs_should_use_dio ( inode , iocb , from ) ;
2021-07-16 09:39:13 -05:00
2021-07-16 09:39:14 -05:00
/* Possibly preallocate the blocks for the write. */
target_size = iocb - > ki_pos + iov_iter_count ( from ) ;
2021-07-23 00:59:21 -07:00
preallocated = f2fs_preallocate_blocks ( iocb , from , dio ) ;
2021-11-12 14:31:16 -08:00
if ( preallocated < 0 )
2021-07-16 09:39:14 -05:00
ret = preallocated ;
2021-11-12 14:31:16 -08:00
else
2021-07-23 00:59:21 -07:00
/* Do the actual write. */
ret = dio ?
f2fs_dio_write_iter ( iocb , from , & may_need_sync ) :
f2fs_buffered_write_iter ( iocb , from ) ;
2018-03-30 17:58:13 -07:00
2021-07-16 09:39:14 -05:00
/* Don't leave any preallocated blocks around past i_size. */
2021-11-12 14:31:16 -08:00
if ( preallocated & & i_size_read ( inode ) < target_size ) {
2021-07-16 09:39:14 -05:00
down_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
filemap_invalidate_lock ( inode - > i_mapping ) ;
2021-11-12 14:31:16 -08:00
if ( ! f2fs_truncate ( inode ) )
file_dont_truncate ( inode ) ;
2021-07-16 09:39:14 -05:00
filemap_invalidate_unlock ( inode - > i_mapping ) ;
up_write ( & F2FS_I ( inode ) - > i_gc_rwsem [ WRITE ] ) ;
2021-11-12 14:31:16 -08:00
} else {
file_dont_truncate ( inode ) ;
2016-02-03 13:09:09 -08:00
}
2021-07-16 09:39:14 -05:00
clear_inode_flag ( inode , FI_PREALLOCATED_ALL ) ;
2021-07-16 09:39:13 -05:00
out_unlock :
2016-02-03 13:09:09 -08:00
inode_unlock ( inode ) ;
2019-04-02 18:52:22 +08:00
out :
2021-07-16 09:39:15 -05:00
trace_f2fs_file_write_iter ( inode , orig_pos , orig_count , ret ) ;
2021-07-23 00:59:21 -07:00
if ( ret > 0 & & may_need_sync )
2016-04-07 08:52:01 -07:00
ret = generic_write_sync ( iocb , ret ) ;
2016-02-03 13:09:09 -08:00
return ret ;
2015-04-21 20:39:58 -07:00
}
2021-08-02 21:22:45 -07:00
static int f2fs_file_fadvise ( struct file * filp , loff_t offset , loff_t len ,
int advice )
{
struct inode * inode ;
struct address_space * mapping ;
struct backing_dev_info * bdi ;
if ( advice = = POSIX_FADV_SEQUENTIAL ) {
inode = file_inode ( filp ) ;
if ( S_ISFIFO ( inode - > i_mode ) )
return - ESPIPE ;
mapping = filp - > f_mapping ;
if ( ! mapping | | len < 0 )
return - EINVAL ;
bdi = inode_to_bdi ( mapping - > host ) ;
filp - > f_ra . ra_pages = bdi - > ra_pages *
F2FS_I_SB ( inode ) - > seq_file_ra_mul ;
spin_lock ( & filp - > f_lock ) ;
filp - > f_mode & = ~ FMODE_RANDOM ;
spin_unlock ( & filp - > f_lock ) ;
return 0 ;
}
return generic_fadvise ( filp , offset , len , advice ) ;
}
2013-02-04 23:41:41 +09:00
# ifdef CONFIG_COMPAT
2020-11-10 09:24:37 +08:00
struct compat_f2fs_gc_range {
u32 sync ;
compat_u64 start ;
compat_u64 len ;
} ;
# define F2FS_IOC32_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11,\
struct compat_f2fs_gc_range )
static int f2fs_compat_ioc_gc_range ( struct file * file , unsigned long arg )
{
struct compat_f2fs_gc_range __user * urange ;
struct f2fs_gc_range range ;
int err ;
urange = compat_ptr ( arg ) ;
err = get_user ( range . sync , & urange - > sync ) ;
err | = get_user ( range . start , & urange - > start ) ;
err | = get_user ( range . len , & urange - > len ) ;
if ( err )
return - EFAULT ;
return __f2fs_ioc_gc_range ( file , & range ) ;
}
struct compat_f2fs_move_range {
u32 dst_fd ;
compat_u64 pos_in ;
compat_u64 pos_out ;
compat_u64 len ;
} ;
# define F2FS_IOC32_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \
struct compat_f2fs_move_range )
static int f2fs_compat_ioc_move_range ( struct file * file , unsigned long arg )
{
struct compat_f2fs_move_range __user * urange ;
struct f2fs_move_range range ;
int err ;
urange = compat_ptr ( arg ) ;
err = get_user ( range . dst_fd , & urange - > dst_fd ) ;
err | = get_user ( range . pos_in , & urange - > pos_in ) ;
err | = get_user ( range . pos_out , & urange - > pos_out ) ;
err | = get_user ( range . len , & urange - > len ) ;
if ( err )
return - EFAULT ;
return __f2fs_ioc_move_range ( file , & range ) ;
}
2013-02-04 23:41:41 +09:00
long f2fs_compat_ioctl ( struct file * file , unsigned int cmd , unsigned long arg )
{
2020-11-10 09:24:37 +08:00
if ( unlikely ( f2fs_cp_error ( F2FS_I_SB ( file_inode ( file ) ) ) ) )
return - EIO ;
if ( ! f2fs_is_checkpoint_ready ( F2FS_I_SB ( file_inode ( file ) ) ) )
return - ENOSPC ;
2013-02-04 23:41:41 +09:00
switch ( cmd ) {
2020-07-14 15:18:12 -07:00
case FS_IOC32_GETVERSION :
cmd = FS_IOC_GETVERSION ;
2015-11-10 18:44:20 +08:00
break ;
2020-11-10 09:24:37 +08:00
case F2FS_IOC32_GARBAGE_COLLECT_RANGE :
return f2fs_compat_ioc_gc_range ( file , arg ) ;
case F2FS_IOC32_MOVE_RANGE :
return f2fs_compat_ioc_move_range ( file , arg ) ;
2015-11-10 18:44:20 +08:00
case F2FS_IOC_START_ATOMIC_WRITE :
case F2FS_IOC_COMMIT_ATOMIC_WRITE :
case F2FS_IOC_START_VOLATILE_WRITE :
case F2FS_IOC_RELEASE_VOLATILE_WRITE :
case F2FS_IOC_ABORT_VOLATILE_WRITE :
case F2FS_IOC_SHUTDOWN :
2019-06-03 13:51:58 +02:00
case FITRIM :
2020-07-14 15:18:12 -07:00
case FS_IOC_SET_ENCRYPTION_POLICY :
case FS_IOC_GET_ENCRYPTION_PWSALT :
case FS_IOC_GET_ENCRYPTION_POLICY :
2019-08-04 19:35:48 -07:00
case FS_IOC_GET_ENCRYPTION_POLICY_EX :
case FS_IOC_ADD_ENCRYPTION_KEY :
case FS_IOC_REMOVE_ENCRYPTION_KEY :
case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS :
case FS_IOC_GET_ENCRYPTION_KEY_STATUS :
2020-03-14 13:50:51 -07:00
case FS_IOC_GET_ENCRYPTION_NONCE :
2015-11-10 18:44:20 +08:00
case F2FS_IOC_GARBAGE_COLLECT :
case F2FS_IOC_WRITE_CHECKPOINT :
case F2FS_IOC_DEFRAGMENT :
2017-04-13 15:17:00 -07:00
case F2FS_IOC_FLUSH_DEVICE :
2017-07-21 12:58:59 -07:00
case F2FS_IOC_GET_FEATURES :
2017-12-07 16:25:39 -08:00
case F2FS_IOC_GET_PIN_FILE :
case F2FS_IOC_SET_PIN_FILE :
2018-01-11 14:42:30 +08:00
case F2FS_IOC_PRECACHE_EXTENTS :
2019-06-05 11:33:25 +08:00
case F2FS_IOC_RESIZE_FS :
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-22 09:26:24 -07:00
case FS_IOC_ENABLE_VERITY :
case FS_IOC_MEASURE_VERITY :
fs-verity: add FS_IOC_READ_VERITY_METADATA ioctl
Add an ioctl FS_IOC_READ_VERITY_METADATA which will allow reading verity
metadata from a file that has fs-verity enabled, including:
- The Merkle tree
- The fsverity_descriptor (not including the signature if present)
- The built-in signature, if present
This ioctl has similar semantics to pread(). It is passed the type of
metadata to read (one of the above three), and a buffer, offset, and
size. It returns the number of bytes read or an error.
Separate patches will add support for each of the above metadata types.
This patch just adds the ioctl itself.
This ioctl doesn't make any assumption about where the metadata is
stored on-disk. It does assume the metadata is in a stable format, but
that's basically already the case:
- The Merkle tree and fsverity_descriptor are defined by how fs-verity
file digests are computed; see the "File digest computation" section
of Documentation/filesystems/fsverity.rst. Technically, the way in
which the levels of the tree are ordered relative to each other wasn't
previously specified, but it's logical to put the root level first.
- The built-in signature is the value passed to FS_IOC_ENABLE_VERITY.
This ioctl is useful because it allows writing a server program that
takes a verity file and serves it to a client program, such that the
client can do its own fs-verity compatible verification of the file.
This only makes sense if the client doesn't trust the server and if the
server needs to provide the storage for the client.
More concretely, there is interest in using this ability in Android to
export APK files (which are protected by fs-verity) to "protected VMs".
This would use Protected KVM (https://lwn.net/Articles/836693), which
provides an isolated execution environment without having to trust the
traditional "host". A "guest" VM can boot from a signed image and
perform specific tasks in a minimum trusted environment using files that
have fs-verity enabled on the host, without trusting the host or
requiring that the guest has its own trusted storage.
Technically, it would be possible to duplicate the metadata and store it
in separate files for serving. However, that would be less efficient
and would require extra care in userspace to maintain file consistency.
In addition to the above, the ability to read the built-in signatures is
useful because it allows a system that is using the in-kernel signature
verification to migrate to userspace signature verification.
Link: https://lore.kernel.org/r/20210115181819.34732-4-ebiggers@kernel.org
Reviewed-by: Victor Hsieh <victorhsieh@google.com>
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2021-01-15 10:18:16 -08:00
case FS_IOC_READ_VERITY_METADATA :
2020-07-14 15:18:12 -07:00
case FS_IOC_GETFSLABEL :
case FS_IOC_SETFSLABEL :
2020-02-21 18:09:21 +08:00
case F2FS_IOC_GET_COMPRESS_BLOCKS :
2020-03-06 15:36:09 +08:00
case F2FS_IOC_RELEASE_COMPRESS_BLOCKS :
2020-03-06 14:35:33 +08:00
case F2FS_IOC_RESERVE_COMPRESS_BLOCKS :
2020-07-21 12:21:11 +09:00
case F2FS_IOC_SEC_TRIM_FILE :
2020-10-30 13:10:34 +09:00
case F2FS_IOC_GET_COMPRESS_OPTION :
2020-10-30 13:10:35 +09:00
case F2FS_IOC_SET_COMPRESS_OPTION :
2020-12-03 15:56:15 +09:00
case F2FS_IOC_DECOMPRESS_FILE :
case F2FS_IOC_COMPRESS_FILE :
2016-07-08 15:16:47 -07:00
break ;
2013-02-04 23:41:41 +09:00
default :
return - ENOIOCTLCMD ;
}
2020-11-10 09:24:37 +08:00
return __f2fs_ioctl ( file , cmd , ( unsigned long ) compat_ptr ( arg ) ) ;
2013-02-04 23:41:41 +09:00
}
# endif
2012-11-02 17:09:44 +09:00
const struct file_operations f2fs_file_operations = {
2014-04-23 14:10:24 +08:00
. llseek = f2fs_llseek ,
f2fs: support data compression
This patch tries to support compression in f2fs.
- New term named cluster is defined as basic unit of compression, file can
be divided into multiple clusters logically. One cluster includes 4 << n
(n >= 0) logical pages, compression size is also cluster size, each of
cluster can be compressed or not.
- In cluster metadata layout, one special flag is used to indicate cluster
is compressed one or normal one, for compressed cluster, following metadata
maps cluster to [1, 4 << n - 1] physical blocks, in where f2fs stores
data including compress header and compressed data.
- In order to eliminate write amplification during overwrite, F2FS only
support compression on write-once file, data can be compressed only when
all logical blocks in file are valid and cluster compress ratio is lower
than specified threshold.
- To enable compression on regular inode, there are three ways:
* chattr +c file
* chattr +c dir; touch dir/file
* mount w/ -o compress_extension=ext; touch file.ext
Compress metadata layout:
[Dnode Structure]
+-----------------------------------------------+
| cluster 1 | cluster 2 | ......... | cluster N |
+-----------------------------------------------+
. . . .
. . . .
. Compressed Cluster . . Normal Cluster .
+----------+---------+---------+---------+ +---------+---------+---------+---------+
|compr flag| block 1 | block 2 | block 3 | | block 1 | block 2 | block 3 | block 4 |
+----------+---------+---------+---------+ +---------+---------+---------+---------+
. .
. .
. .
+-------------+-------------+----------+----------------------------+
| data length | data chksum | reserved | compressed data |
+-------------+-------------+----------+----------------------------+
Changelog:
20190326:
- fix error handling of read_end_io().
- remove unneeded comments in f2fs_encrypt_one_page().
20190327:
- fix wrong use of f2fs_cluster_is_full() in f2fs_mpage_readpages().
- don't jump into loop directly to avoid uninitialized variables.
- add TODO tag in error path of f2fs_write_cache_pages().
20190328:
- fix wrong merge condition in f2fs_read_multi_pages().
- check compressed file in f2fs_post_read_required().
20190401
- allow overwrite on non-compressed cluster.
- check cluster meta before writing compressed data.
20190402
- don't preallocate blocks for compressed file.
- add lz4 compress algorithm
- process multiple post read works in one workqueue
Now f2fs supports processing post read work in multiple workqueue,
it shows low performance due to schedule overhead of multiple
workqueue executing orderly.
20190921
- compress: support buffered overwrite
C: compress cluster flag
V: valid block address
N: NEW_ADDR
One cluster contain 4 blocks
before overwrite after overwrite
- VVVV -> CVNN
- CVNN -> VVVV
- CVNN -> CVNN
- CVNN -> CVVV
- CVVV -> CVNN
- CVVV -> CVVV
20191029
- add kconfig F2FS_FS_COMPRESSION to isolate compression related
codes, add kconfig F2FS_FS_{LZO,LZ4} to cover backend algorithm.
note that: will remove lzo backend if Jaegeuk agreed that too.
- update codes according to Eric's comments.
20191101
- apply fixes from Jaegeuk
20191113
- apply fixes from Jaegeuk
- split workqueue for fsverity
20191216
- apply fixes from Jaegeuk
20200117
- fix to avoid NULL pointer dereference
[Jaegeuk Kim]
- add tracepoint for f2fs_{,de}compress_pages()
- fix many bugs and add some compression stats
- fix overwrite/mmap bugs
- address 32bit build error, reported by Geert.
- bug fixes when handling errors and i_compressed_blocks
Reported-by: <noreply@ellerman.id.au>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-11-01 18:07:14 +08:00
. read_iter = f2fs_file_read_iter ,
2015-04-21 20:39:58 -07:00
. write_iter = f2fs_file_write_iter ,
. open = f2fs_file_open ,
2014-12-05 14:37:37 -08:00
. release = f2fs_release_file ,
2012-11-02 17:09:44 +09:00
. mmap = f2fs_file_mmap ,
2017-07-24 19:46:29 -07:00
. flush = f2fs_file_flush ,
2012-11-02 17:09:44 +09:00
. fsync = f2fs_sync_file ,
. fallocate = f2fs_fallocate ,
. unlocked_ioctl = f2fs_ioctl ,
2013-02-04 23:41:41 +09:00
# ifdef CONFIG_COMPAT
. compat_ioctl = f2fs_compat_ioctl ,
# endif
2012-11-02 17:09:44 +09:00
. splice_read = generic_file_splice_read ,
2014-04-05 04:27:08 -04:00
. splice_write = iter_file_splice_write ,
2021-08-02 21:22:45 -07:00
. fadvise = f2fs_file_fadvise ,
2012-11-02 17:09:44 +09:00
} ;