// SPDX-License-Identifier: LGPL-2.1
/*
 * Copyright (c) 2012 Taobao.
 * Written by Tao Ma <boyu.mt@taobao.com>
 */

#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
#include <linux/iversion.h>
#include <linux/sched/mm.h>

# include "ext4_jbd2.h"
# include "ext4.h"
# include "xattr.h"
2012-12-10 23:05:51 +04:00
# include "truncate.h"
2012-12-10 23:04:46 +04:00
# define EXT4_XATTR_SYSTEM_DATA "data"
# define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
2013-04-20 01:53:09 +04:00
# define EXT4_INLINE_DOTDOT_OFFSET 2
# define EXT4_INLINE_DOTDOT_SIZE 4
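
/*
 * Inline data is stored in two pieces: the first
 * EXT4_MIN_INLINE_DATA_SIZE bytes live in the inode's i_block array,
 * and anything beyond that spills into the value of the in-inode
 * "system.data" xattr (EXT4_XATTR_SYSTEM_DATA). i_inline_off caches
 * the offset of that xattr entry within the raw inode, and
 * i_inline_size is the total inline capacity of the inode.
 */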
static int ext4_get_inline_size(struct inode *inode)
{
	if (EXT4_I(inode)->i_inline_off)
		return EXT4_I(inode)->i_inline_size;

	return 0;
}

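/*
 * Work out how many bytes of in-inode xattr value space are available
 * for the "system.data" entry, scanning the existing xattr entries to
 * find the lowest value offset in use.
 */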
static int get_max_inline_xattr_value_size(struct inode *inode,
					   struct ext4_iloc *iloc)
{
	struct ext4_xattr_ibody_header *header;
	struct ext4_xattr_entry *entry;
	struct ext4_inode *raw_inode;
	void *end;
	int free, min_offs;

	if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
		return 0;

	min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
			EXT4_GOOD_OLD_INODE_SIZE -
			EXT4_I(inode)->i_extra_isize -
			sizeof(struct ext4_xattr_ibody_header);

	/*
	 * We need to subtract another sizeof(__u32) since an in-inode xattr
	 * needs an empty 4 bytes to indicate the gap between the xattr entry
	 * and the name/value pair.
	 */
	if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
		return EXT4_XATTR_SIZE(min_offs -
			EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
			EXT4_XATTR_ROUND - sizeof(__u32));

	raw_inode = ext4_raw_inode(iloc);
	header = IHDR(inode, raw_inode);
	entry = IFIRST(header);
	end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;

	/* Compute min_offs. */
	while (!IS_LAST_ENTRY(entry)) {
		void *next = EXT4_XATTR_NEXT(entry);

		if (next >= end) {
			EXT4_ERROR_INODE(inode,
					 "corrupt xattr in inline inode");
			return 0;
		}
		if (!entry->e_value_inum && entry->e_value_size) {
			size_t offs = le16_to_cpu(entry->e_value_offs);

			if (offs < min_offs)
				min_offs = offs;
		}
		entry = next;
	}
	free = min_offs -
		((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);

	if (EXT4_I(inode)->i_inline_off) {
		entry = (struct ext4_xattr_entry *)
			((void *)raw_inode + EXT4_I(inode)->i_inline_off);

		free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
		goto out;
	}

	free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));

	if (free > EXT4_XATTR_ROUND)
		free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
	else
		free = 0;

out:
	return free;
}

/*
 * Get the maximum size we now can store in an inode.
 * If we can't find the space for a xattr entry, don't use the space
 * of the extents since we have no space to indicate the inline data.
 */
int ext4_get_max_inline_size(struct inode *inode)
{
	int error, max_inline_size;
	struct ext4_iloc iloc;

	if (EXT4_I(inode)->i_extra_isize == 0)
		return 0;

	error = ext4_get_inode_loc(inode, &iloc);
	if (error) {
		ext4_error_inode_err(inode, __func__, __LINE__, 0, -error,
				     "can't get inode location %lu",
				     inode->i_ino);
		return 0;
	}

	down_read(&EXT4_I(inode)->xattr_sem);
	max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
	up_read(&EXT4_I(inode)->xattr_sem);

	brelse(iloc.bh);

	if (!max_inline_size)
		return 0;

	return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
}

/*
 * this function does not take xattr_sem, which is OK because it is
 * currently only used in a code path coming from ext4_iget, before
 * the new inode has been unlocked
 */
int ext4_find_inline_data_nolock(struct inode *inode)
{
	struct ext4_xattr_ibody_find is = {
		.s = { .not_found = -ENODATA, },
	};
	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = EXT4_XATTR_SYSTEM_DATA,
	};
	int error;

	if (EXT4_I(inode)->i_extra_isize == 0)
		return 0;

	error = ext4_get_inode_loc(inode, &is.iloc);
	if (error)
		return error;

	error = ext4_xattr_ibody_find(inode, &i, &is);
	if (error)
		goto out;

	if (!is.s.not_found) {
		if (is.s.here->e_value_inum) {
			EXT4_ERROR_INODE(inode, "inline data xattr refers "
					 "to an external xattr inode");
			error = -EFSCORRUPTED;
			goto out;
		}
		EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
					(void *)ext4_raw_inode(&is.iloc));
		EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
				le32_to_cpu(is.s.here->e_value_size);
	}
out:
	brelse(is.iloc.bh);
	return error;
}

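/*
 * Copy up to @len bytes of inline data into @buffer: first from the
 * inode's i_block array, then from the "system.data" xattr value.
 * Returns the number of bytes copied.
 */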
static int ext4_read_inline_data(struct inode *inode, void *buffer,
				 unsigned int len,
				 struct ext4_iloc *iloc)
{
	struct ext4_xattr_entry *entry;
	struct ext4_xattr_ibody_header *header;
	int cp_len = 0;
	struct ext4_inode *raw_inode;

	if (!len)
		return 0;

	BUG_ON(len > EXT4_I(inode)->i_inline_size);

	cp_len = min_t(unsigned int, len, EXT4_MIN_INLINE_DATA_SIZE);

	raw_inode = ext4_raw_inode(iloc);
	memcpy(buffer, (void *)(raw_inode->i_block), cp_len);

	len -= cp_len;
	buffer += cp_len;

	if (!len)
		goto out;

	header = IHDR(inode, raw_inode);
	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
					    EXT4_I(inode)->i_inline_off);
	len = min_t(unsigned int, len,
		    (unsigned int)le32_to_cpu(entry->e_value_size));

	memcpy(buffer,
	       (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
	cp_len += len;

out:
	return cp_len;
}

/*
 * write the buffer to the inline inode.
 * If 'create' is set, we don't need to do the extra copy in the
 * xattr value since it is already handled by ext4_xattr_ibody_set.
 * That saves us one memcpy.
 */
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
				   void *buffer, loff_t pos, unsigned int len)
{
	struct ext4_xattr_entry *entry;
	struct ext4_xattr_ibody_header *header;
	struct ext4_inode *raw_inode;
	int cp_len = 0;

	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
		return;

	BUG_ON(!EXT4_I(inode)->i_inline_off);
	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);

	raw_inode = ext4_raw_inode(iloc);
	buffer += pos;

	if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
		cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
			 EXT4_MIN_INLINE_DATA_SIZE - pos : len;
		memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);

		len -= cp_len;
		buffer += cp_len;
		pos += cp_len;
	}

	if (!len)
		return;

	pos -= EXT4_MIN_INLINE_DATA_SIZE;
	header = IHDR(inode, raw_inode);
	entry = (struct ext4_xattr_entry *)((void *)raw_inode +
					    EXT4_I(inode)->i_inline_off);

	memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
	       buffer, len);
}

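/*
 * Create the "system.data" xattr entry for an inode that has no inline
 * data yet. Capacity beyond EXT4_MIN_INLINE_DATA_SIZE is reserved by
 * installing a zeroed xattr value of the remaining length.
 */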
static int ext4_create_inline_data(handle_t *handle,
				   struct inode *inode, unsigned len)
{
	int error;
	void *value = NULL;
	struct ext4_xattr_ibody_find is = {
		.s = { .not_found = -ENODATA, },
	};
	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = EXT4_XATTR_SYSTEM_DATA,
	};

	error = ext4_get_inode_loc(inode, &is.iloc);
	if (error)
		return error;

	BUFFER_TRACE(is.iloc.bh, "get_write_access");
	error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
					      EXT4_JTR_NONE);
	if (error)
		goto out;

	if (len > EXT4_MIN_INLINE_DATA_SIZE) {
		value = EXT4_ZERO_XATTR_VALUE;
		len -= EXT4_MIN_INLINE_DATA_SIZE;
	} else {
		value = "";
		len = 0;
	}

	/* Insert the xattr entry. */
	i.value = value;
	i.value_len = len;

	error = ext4_xattr_ibody_find(inode, &i, &is);
	if (error)
		goto out;

	BUG_ON(!is.s.not_found);

	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
	if (error) {
		if (error == -ENOSPC)
			ext4_clear_inode_state(inode,
					       EXT4_STATE_MAY_INLINE_DATA);
		goto out;
	}

	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
		0, EXT4_MIN_INLINE_DATA_SIZE);

	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
				      (void *)ext4_raw_inode(&is.iloc));
	EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
	ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
	ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
	get_bh(is.iloc.bh);
	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

out:
	brelse(is.iloc.bh);
	return error;
}

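/*
 * Grow the "system.data" xattr value so the inode can hold @len bytes
 * of inline data. The current value is read out and re-inserted at the
 * new length, since ext4_xattr_ibody_set() replaces the whole value.
 */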
static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
				   unsigned int len)
{
	int error;
	void *value = NULL;
	struct ext4_xattr_ibody_find is = {
		.s = { .not_found = -ENODATA, },
	};
	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = EXT4_XATTR_SYSTEM_DATA,
	};

	/* If the old space is ok, write the data directly. */
	if (len <= EXT4_I(inode)->i_inline_size)
		return 0;

	error = ext4_get_inode_loc(inode, &is.iloc);
	if (error)
		return error;

	error = ext4_xattr_ibody_find(inode, &i, &is);
	if (error)
		goto out;

	BUG_ON(is.s.not_found);

	len -= EXT4_MIN_INLINE_DATA_SIZE;
	value = kzalloc(len, GFP_NOFS);
	if (!value) {
		error = -ENOMEM;
		goto out;
	}

	error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
				     value, len);
	if (error < 0)
		goto out;

	BUFFER_TRACE(is.iloc.bh, "get_write_access");
	error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
					      EXT4_JTR_NONE);
	if (error)
		goto out;

	/* Update the xattr entry. */
	i.value = value;
	i.value_len = len;

	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
	if (error)
		goto out;

	EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
				      (void *)ext4_raw_inode(&is.iloc));
	EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
				le32_to_cpu(is.s.here->e_value_size);
	ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	get_bh(is.iloc.bh);
	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

out:
	kfree(value);
	brelse(is.iloc.bh);
	return error;
}

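/*
 * Make sure the inode can hold @len bytes of inline data, creating or
 * growing the "system.data" xattr as needed. Returns -ENOSPC if the
 * data can no longer fit inline.
 */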
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
				    unsigned int len)
{
	int ret, size, no_expand;
	struct ext4_inode_info *ei = EXT4_I(inode);

	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
		return -ENOSPC;

	size = ext4_get_max_inline_size(inode);
	if (size < len)
		return -ENOSPC;

	ext4_write_lock_xattr(inode, &no_expand);

	if (ei->i_inline_off)
		ret = ext4_update_inline_data(handle, inode, len);
	else
		ret = ext4_create_inline_data(handle, inode, len);

	ext4_write_unlock_xattr(inode, &no_expand);
	return ret;
}

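/*
 * Remove the "system.data" xattr entry and clear the inline flags,
 * switching the inode back to extent (or indirect block) storage.
 * Callers take xattr_sem for writing first, hence the _nolock suffix.
 */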
static int ext4_destroy_inline_data_nolock(handle_t *handle,
					   struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_xattr_ibody_find is = {
		.s = { .not_found = 0, },
	};
	struct ext4_xattr_info i = {
		.name_index = EXT4_XATTR_INDEX_SYSTEM,
		.name = EXT4_XATTR_SYSTEM_DATA,
		.value = NULL,
		.value_len = 0,
	};
	int error;

	if (!ei->i_inline_off)
		return 0;

	error = ext4_get_inode_loc(inode, &is.iloc);
	if (error)
		return error;

	error = ext4_xattr_ibody_find(inode, &i, &is);
	if (error)
		goto out;

	BUFFER_TRACE(is.iloc.bh, "get_write_access");
	error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh,
					      EXT4_JTR_NONE);
	if (error)
		goto out;

	error = ext4_xattr_ibody_set(handle, inode, &i, &is);
	if (error)
		goto out;

	memset((void *)ext4_raw_inode(&is.iloc)->i_block,
		0, EXT4_MIN_INLINE_DATA_SIZE);
	memset(ei->i_data, 0, EXT4_MIN_INLINE_DATA_SIZE);

	if (ext4_has_feature_extents(inode->i_sb)) {
		if (S_ISDIR(inode->i_mode) ||
		    S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
			ext4_ext_tree_init(handle, inode);
		}
	}
	ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);

	get_bh(is.iloc.bh);
	error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);

	EXT4_I(inode)->i_inline_off = 0;
	EXT4_I(inode)->i_inline_size = 0;
	ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
out:
	brelse(is.iloc.bh);
	if (error == -ENODATA)
		error = 0;

	return error;
}

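/*
 * Fill the first folio of the file from the inline data and zero the
 * remainder; only folio index 0 can contain inline data.
 */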
static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
{
	void *kaddr;
	int ret = 0;
	size_t len;
	struct ext4_iloc iloc;

	BUG_ON(!folio_test_locked(folio));
	BUG_ON(!ext4_has_inline_data(inode));
	BUG_ON(folio->index);

	if (!EXT4_I(inode)->i_inline_off) {
		ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
			     inode->i_ino);
		goto out;
	}

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		goto out;

	len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
	BUG_ON(len > PAGE_SIZE);
	kaddr = kmap_local_folio(folio, 0);
	ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
	flush_dcache_folio(folio);
	kunmap_local(kaddr);
	folio_zero_segment(folio, len, folio_size(folio));
	folio_mark_uptodate(folio);
	brelse(iloc.bh);

out:
	return ret;
}

int ext4_readpage_inline(struct inode *inode, struct folio *folio)
{
	int ret = 0;

	down_read(&EXT4_I(inode)->xattr_sem);
	if (!ext4_has_inline_data(inode)) {
		up_read(&EXT4_I(inode)->xattr_sem);
		return -EAGAIN;
	}

	/*
	 * Current inline data can only exist in the 1st page,
	 * so for all the other pages, just set them uptodate.
	 */
	if (!folio->index)
		ret = ext4_read_inline_folio(inode, folio);
	else if (!folio_test_uptodate(folio)) {
		folio_zero_segment(folio, 0, folio_size(folio));
		folio_mark_uptodate(folio);
	}

	up_read(&EXT4_I(inode)->xattr_sem);

	folio_unlock(folio);
	return ret >= 0 ? 0 : ret;
}

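/*
 * Move the inline data out into the first folio of the page cache and
 * destroy the "system.data" xattr, so that later writes go through the
 * normal block allocation paths.
 */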
static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
					      struct inode *inode)
{
	int ret, needed_blocks, no_expand;
	handle_t *handle = NULL;
	int retries = 0, sem_held = 0;
	struct folio *folio = NULL;
	unsigned from, to;
	struct ext4_iloc iloc;

	if (!ext4_has_inline_data(inode)) {
		/*
		 * clear the flag so that no new write
		 * will trap here again.
		 */
		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
		return 0;
	}

	needed_blocks = ext4_writepage_trans_blocks(inode);

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

retry:
	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		goto out;
	}

	/* We cannot recurse into the filesystem as the transaction is already
	 * started */
	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
					mapping_gfp_mask(mapping));
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out_nofolio;
	}

	ext4_write_lock_xattr(inode, &no_expand);
	sem_held = 1;
	/* If someone has already done this for us, just exit. */
	if (!ext4_has_inline_data(inode)) {
		ret = 0;
		goto out;
	}

	from = 0;
	to = ext4_get_inline_size(inode);
	if (!folio_test_uptodate(folio)) {
		ret = ext4_read_inline_folio(inode, folio);
		if (ret < 0)
			goto out;
	}

	ret = ext4_destroy_inline_data_nolock(handle, inode);
	if (ret)
		goto out;

	if (ext4_should_dioread_nolock(inode)) {
		ret = __block_write_begin(&folio->page, from, to,
					  ext4_get_block_unwritten);
	} else
		ret = __block_write_begin(&folio->page, from, to,
					  ext4_get_block);

	if (!ret && ext4_should_journal_data(inode)) {
		ret = ext4_walk_page_buffers(handle, inode,
					     folio_buffers(folio), from, to,
					     NULL, do_journal_get_write_access);
	}

	if (ret) {
		folio_unlock(folio);
		folio_put(folio);
		folio = NULL;
		ext4_orphan_add(handle, inode);
		ext4_write_unlock_xattr(inode, &no_expand);
		sem_held = 0;
		ext4_journal_stop(handle);
		handle = NULL;
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might
		 * still be on the orphan list; we need to
		 * make sure the inode is removed from the
		 * orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}

	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;

	if (folio)
		block_commit_write(&folio->page, from, to);
out:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
out_nofolio:
	if (sem_held)
		ext4_write_unlock_xattr(inode, &no_expand);
	if (handle)
		ext4_journal_stop(handle);
	brelse(iloc.bh);
	return ret;
}

/*
 * Try to write data in the inode.
 * If the inode has inline data, check whether the new write can be
 * in the inode also. If not, create the page and the handle, move the
 * data to the page, mark it uptodate, and let the later code create
 * extents for it.
 */
int ext4_try_to_write_inline_data(struct address_space *mapping,
				  struct inode *inode,
				  loff_t pos, unsigned len,
				  struct page **pagep)
{
	int ret;
	handle_t *handle;
	struct folio *folio;
	struct ext4_iloc iloc;

	if (pos + len > ext4_get_max_inline_size(inode))
		goto convert;

	ret = ext4_get_inode_loc(inode, &iloc);
	if (ret)
		return ret;

	/*
	 * The possible write could happen in the inode,
	 * so try to reserve the space in inode first.
	 */
	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		handle = NULL;
		goto out;
	}

	ret = ext4_prepare_inline_data(handle, inode, pos + len);
	if (ret && ret != -ENOSPC)
		goto out;

	/* We don't have space in inline inode, so convert it to extent. */
	if (ret == -ENOSPC) {
		ext4_journal_stop(handle);
		brelse(iloc.bh);
		goto convert;
	}

	ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
					    EXT4_JTR_NONE);
	if (ret)
		goto out;

	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
					mapping_gfp_mask(mapping));
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto out;
	}

	*pagep = &folio->page;
	down_read(&EXT4_I(inode)->xattr_sem);
	if (!ext4_has_inline_data(inode)) {
		ret = 0;
		folio_unlock(folio);
		folio_put(folio);
		goto out_up_read;
	}

	if (!folio_test_uptodate(folio)) {
		ret = ext4_read_inline_folio(inode, folio);
		if (ret < 0) {
			folio_unlock(folio);
			folio_put(folio);
			goto out_up_read;
		}
	}

	ret = 1;
	handle = NULL;
out_up_read:
	up_read(&EXT4_I(inode)->xattr_sem);
out:
	if (handle && (ret != 1))
		ext4_journal_stop(handle);
	brelse(iloc.bh);
	return ret;
convert:
	return ext4_convert_inline_data_to_extent(mapping, inode);
}

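/*
 * Finish a write to an inline-data inode: copy the folio contents back
 * into the inline area, update i_size, and stop the journal handle
 * started when the write began.
 */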
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
			       unsigned copied, struct folio *folio)
{
	handle_t *handle = ext4_journal_current_handle();
	int no_expand;
	void *kaddr;
	struct ext4_iloc iloc;
	int ret = 0, ret2;

	if (unlikely(copied < len) && !folio_test_uptodate(folio))
		copied = 0;

	if (likely(copied)) {
		ret = ext4_get_inode_loc(inode, &iloc);
		if (ret) {
			folio_unlock(folio);
			folio_put(folio);
			ext4_std_error(inode->i_sb, ret);
			goto out;
		}
		ext4_write_lock_xattr(inode, &no_expand);
		BUG_ON(!ext4_has_inline_data(inode));

		/*
		 * ei->i_inline_off may have changed since
		 * ext4_write_begin() called
		 * ext4_try_to_write_inline_data()
		 */
		(void) ext4_find_inline_data_nolock(inode);

		kaddr = kmap_local_folio(folio, 0);
		ext4_write_inline_data(inode, &iloc, kaddr, pos, copied);
		kunmap_local(kaddr);
		folio_mark_uptodate(folio);
		/* clear dirty flag so that writepages wouldn't work for us. */
		folio_clear_dirty(folio);

		ext4_write_unlock_xattr(inode, &no_expand);
		brelse(iloc.bh);

		/*
		 * It's important to update i_size while still holding folio
		 * lock: page writeout could otherwise come in and zero
		 * beyond i_size.
		 */
		ext4_update_inode_size(inode, pos + copied);
	}
	folio_unlock(folio);
	folio_put(folio);

	/*
	 * Don't mark the inode dirty under folio lock. First, it unnecessarily
	 * makes the holding time of folio lock longer. Second, it forces lock
	 * ordering of folio lock and transaction start for journaling
	 * filesystems.
	 */
	if (likely(copied))
		mark_inode_dirty(inode);
out:
	/*
	 * If we didn't copy as much data as expected, we need to trim back
	 * size of xattr containing inline data.
	 */
	if (pos + len > inode->i_size && ext4_can_truncate(inode))
		ext4_orphan_add(handle, inode);

	ret2 = ext4_journal_stop(handle);
	if (!ret)
		ret = ret2;
	if (pos + len > inode->i_size) {
		ext4_truncate_failed_write(inode);
		/*
		 * If truncate failed early the inode might still be
		 * on the orphan list; we need to make sure the inode
		 * is removed from the orphan list in that case.
		 */
		if (inode->i_nlink)
			ext4_orphan_del(NULL, inode);
	}
	return ret ? ret : copied;
}

/*
 * Try to make the page cache and handle ready for the inline data case.
 * We can call this function in 2 cases:
 * 1. The inode is created and the first write exceeds inline size. We can
 *    clear the inode state safely.
 * 2. The inode has inline data, then we need to read the data, make it
 *    uptodate and dirty so that ext4_da_writepages can handle it. We don't
 *    need to start the journal since the file's metadata isn't changed now.
 */
static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
						 struct inode *inode,
						 void **fsdata)
{
	int ret = 0, inline_size;
	struct folio *folio;

	folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN,
					mapping_gfp_mask(mapping));
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
2023-04-28 05:42:02 +03:00
if ( IS_ERR ( folio ) )
return PTR_ERR ( folio ) ;
2012-12-10 23:05:57 +04:00
down_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( inode ) ) {
ext4_clear_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) ;
goto out ;
}
inline_size = ext4_get_inline_size ( inode ) ;
2023-03-24 21:01:12 +03:00
if ( ! folio_test_uptodate ( folio ) ) {
2023-03-24 21:01:14 +03:00
ret = ext4_read_inline_folio ( inode , folio ) ;
2012-12-10 23:05:57 +04:00
if ( ret < 0 )
goto out ;
}
2023-03-24 21:01:12 +03:00
ret = __block_write_begin ( & folio - > page , 0 , inline_size ,
2012-12-10 23:05:57 +04:00
ext4_da_get_block_prep ) ;
if ( ret ) {
2014-12-06 05:37:15 +03:00
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
2023-03-24 21:01:12 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2012-12-10 23:05:57 +04:00
ext4_truncate_failed_write ( inode ) ;
2014-12-06 05:37:15 +03:00
return ret ;
2012-12-10 23:05:57 +04:00
}
2023-03-24 21:01:12 +03:00
folio_mark_dirty ( folio ) ;
folio_mark_uptodate ( folio ) ;
2012-12-10 23:05:57 +04:00
ext4_clear_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) ;
* fsdata = ( void * ) CONVERT_INLINE_DATA ;
out :
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
2023-03-24 21:01:12 +03:00
if ( folio ) {
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2012-12-10 23:05:57 +04:00
}
return ret ;
}
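/*
 * Illustrative sketch ( a hypothetical helper , not part of ext4 ): the
 * CONVERT_INLINE_DATA cookie stored in *fsdata above is how the
 * write_begin side signals the matching write_end path that the inline
 * data has already been converted , so the inline completion path can
 * be skipped . A consumer would test the round-tripped cookie roughly
 * like this :
 */
static inline bool example_inline_was_converted(void *fsdata)
{
	/* fsdata travels from write_begin to write_end unchanged */
	return fsdata == (void *)CONVERT_INLINE_DATA;
}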
/*
* Prepare the write for the inline data .
2020-01-23 09:43:25 +03:00
* If the data can be written into the inode , we just read
2012-12-10 23:05:57 +04:00
* the page and make it uptodate , and start the journal .
* Otherwise read the page , make it dirty so that it can be
* handled in writepages ( the i_disksize update is left to the
* normal ext4_da_write_end ) .
*/
int ext4_da_write_inline_data_begin ( struct address_space * mapping ,
struct inode * inode ,
loff_t pos , unsigned len ,
struct page * * pagep ,
void * * fsdata )
{
2022-01-17 15:11:48 +03:00
int ret ;
2012-12-10 23:05:57 +04:00
handle_t * handle ;
2023-03-24 21:01:13 +03:00
struct folio * folio ;
2012-12-10 23:05:57 +04:00
struct ext4_iloc iloc ;
2018-10-03 04:18:45 +03:00
int retries = 0 ;
2012-12-10 23:05:57 +04:00
ret = ext4_get_inode_loc ( inode , & iloc ) ;
if ( ret )
return ret ;
2014-01-06 23:02:23 +04:00
retry_journal :
2013-02-09 06:59:22 +04:00
handle = ext4_journal_start ( inode , EXT4_HT_INODE , 1 ) ;
2012-12-10 23:05:57 +04:00
if ( IS_ERR ( handle ) ) {
ret = PTR_ERR ( handle ) ;
goto out ;
}
2022-01-17 15:11:48 +03:00
ret = ext4_prepare_inline_data ( handle , inode , pos + len ) ;
if ( ret & & ret ! = - ENOSPC )
goto out_journal ;
2012-12-10 23:05:57 +04:00
if ( ret = = - ENOSPC ) {
2018-06-17 06:41:59 +03:00
ext4_journal_stop ( handle ) ;
2012-12-10 23:05:57 +04:00
ret = ext4_da_convert_inline_data_to_extent ( mapping ,
inode ,
fsdata ) ;
2014-01-06 23:02:23 +04:00
if ( ret = = - ENOSPC & &
ext4_should_retry_alloc ( inode - > i_sb , & retries ) )
goto retry_journal ;
2012-12-10 23:05:57 +04:00
goto out ;
}
2022-02-22 18:36:28 +03:00
/*
* We cannot recurse into the filesystem as the transaction
* is already started .
*/
2023-03-24 21:01:13 +03:00
folio = __filemap_get_folio ( mapping , 0 , FGP_WRITEBEGIN | FGP_NOFS ,
mapping_gfp_mask ( mapping ) ) ;
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
2023-04-28 05:42:02 +03:00
if ( IS_ERR ( folio ) ) {
ret = PTR_ERR ( folio ) ;
2014-01-06 23:03:23 +04:00
goto out_journal ;
2012-12-10 23:05:57 +04:00
}
down_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( inode ) ) {
ret = 0 ;
goto out_release_page ;
}
2023-03-24 21:01:13 +03:00
if ( ! folio_test_uptodate ( folio ) ) {
2023-03-24 21:01:14 +03:00
ret = ext4_read_inline_folio ( inode , folio ) ;
2012-12-10 23:05:57 +04:00
if ( ret < 0 )
goto out_release_page ;
}
2021-08-16 12:57:04 +03:00
ret = ext4_journal_get_write_access ( handle , inode - > i_sb , iloc . bh ,
EXT4_JTR_NONE ) ;
2018-07-10 08:07:43 +03:00
if ( ret )
goto out_release_page ;
2012-12-10 23:05:57 +04:00
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
2023-03-24 21:01:13 +03:00
* pagep = & folio - > page ;
2012-12-10 23:05:57 +04:00
brelse ( iloc . bh ) ;
return 1 ;
out_release_page :
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
2023-03-24 21:01:13 +03:00
folio_unlock ( folio ) ;
folio_put ( folio ) ;
2014-01-06 23:03:23 +04:00
out_journal :
ext4_journal_stop ( handle ) ;
2012-12-10 23:05:57 +04:00
out :
brelse ( iloc . bh ) ;
return ret ;
}
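/*
 * A minimal sketch of the expected caller contract ( the function name
 * and flow below are hypothetical , not the actual ext4_da_write_begin ):
 * a return of 1 means the write was set up against the inline data and
 * *pagep is ready , 0 means fall back to the regular block-based path ,
 * and a negative value is a hard error .
 */
static int example_da_write_begin(struct address_space *mapping,
				  struct inode *inode, loff_t pos,
				  unsigned len, struct page **pagep,
				  void **fsdata)
{
	if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
		int ret = ext4_da_write_inline_data_begin(mapping, inode,
							  pos, len, pagep,
							  fsdata);
		if (ret < 0)
			return ret;	/* hard error */
		if (ret == 1)
			return 0;	/* inline path handled the write */
	}
	/* ... otherwise continue with the normal block-based write_begin ... */
	return 0;
}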
2012-12-10 23:05:59 +04:00
# ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir ( struct inode * dir , struct buffer_head * bh ,
void * inline_start , int inline_size )
{
int offset ;
unsigned short de_len ;
struct ext4_dir_entry_2 * de = inline_start ;
void * dlimit = inline_start + inline_size ;
trace_printk ( " inode %lu \n " , dir - > i_ino ) ;
offset = 0 ;
while ( ( void * ) de < dlimit ) {
de_len = ext4_rec_len_from_disk ( de - > rec_len , inline_size ) ;
2015-04-02 23:42:43 +03:00
trace_printk ( " de: off %u rlen %u name %.*s nlen %u ino %u \n " ,
2012-12-10 23:05:59 +04:00
offset , de_len , de - > name_len , de - > name ,
de - > name_len , le32_to_cpu ( de - > inode ) ) ;
if ( ext4_check_dir_entry ( dir , NULL , de , bh ,
inline_start , inline_size , offset ) )
BUG ( ) ;
offset + = de_len ;
de = ( struct ext4_dir_entry_2 * ) ( ( char * ) de + de_len ) ;
}
}
# else
# define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
# endif
/*
* Add a new entry into an inline dir .
* It will return - ENOSPC if no space is available and - EEXIST
* if the directory entry already exists .
*/
static int ext4_add_dirent_to_inline ( handle_t * handle ,
2015-05-18 20:14:47 +03:00
struct ext4_filename * fname ,
2016-01-09 00:00:31 +03:00
struct inode * dir ,
2012-12-10 23:05:59 +04:00
struct inode * inode ,
struct ext4_iloc * iloc ,
void * inline_start , int inline_size )
{
int err ;
struct ext4_dir_entry_2 * de ;
2015-05-18 20:14:47 +03:00
err = ext4_find_dest_de ( dir , inode , iloc - > bh , inline_start ,
inline_size , fname , & de ) ;
2012-12-10 23:05:59 +04:00
if ( err )
return err ;
2014-05-13 06:06:43 +04:00
BUFFER_TRACE ( iloc - > bh , " get_write_access " ) ;
2021-08-16 12:57:04 +03:00
err = ext4_journal_get_write_access ( handle , dir - > i_sb , iloc - > bh ,
EXT4_JTR_NONE ) ;
2012-12-10 23:05:59 +04:00
if ( err )
return err ;
2021-03-19 10:34:13 +03:00
ext4_insert_dentry ( dir , inode , de , inline_size , fname ) ;
2012-12-10 23:05:59 +04:00
ext4_show_inline_dir ( dir , iloc - > bh , inline_start , inline_size ) ;
/*
* XXX shouldn ' t update any times until successful
* completion of syscall , but too many callers depend
* on this .
*
* XXX similarly , too many callers depend on
* ext4_new_inode ( ) setting the times , but error
* recovery deletes the inode , so the worst that can
* happen is that the times are slightly out of date
* and / or different from the directory change time .
*/
2023-07-05 22:01:07 +03:00
dir - > i_mtime = inode_set_ctime_current ( dir ) ;
2012-12-10 23:05:59 +04:00
ext4_update_dx_flag ( dir ) ;
2018-01-09 16:21:39 +03:00
inode_inc_iversion ( dir ) ;
2012-12-10 23:05:59 +04:00
return 1 ;
}
static void * ext4_get_inline_xattr_pos ( struct inode * inode ,
struct ext4_iloc * iloc )
{
struct ext4_xattr_entry * entry ;
struct ext4_xattr_ibody_header * header ;
BUG_ON ( ! EXT4_I ( inode ) - > i_inline_off ) ;
header = IHDR ( inode , ext4_raw_inode ( iloc ) ) ;
entry = ( struct ext4_xattr_entry * ) ( ( void * ) ext4_raw_inode ( iloc ) +
EXT4_I ( inode ) - > i_inline_off ) ;
return ( void * ) IFIRST ( header ) + le16_to_cpu ( entry - > e_value_offs ) ;
}
/* Set the final de to cover the whole block. */
static void ext4_update_final_de ( void * de_buf , int old_size , int new_size )
{
struct ext4_dir_entry_2 * de , * prev_de ;
void * limit ;
int de_len ;
2022-04-01 11:13:21 +03:00
de = de_buf ;
2012-12-10 23:05:59 +04:00
if ( old_size ) {
limit = de_buf + old_size ;
do {
prev_de = de ;
de_len = ext4_rec_len_from_disk ( de - > rec_len , old_size ) ;
de_buf + = de_len ;
2022-04-01 11:13:21 +03:00
de = de_buf ;
2012-12-10 23:05:59 +04:00
} while ( de_buf < limit ) ;
prev_de - > rec_len = ext4_rec_len_to_disk ( de_len + new_size -
old_size , new_size ) ;
} else {
/* the dir is just created, so create a single empty entry. */
de - > inode = 0 ;
de - > rec_len = ext4_rec_len_to_disk ( new_size , new_size ) ;
}
}
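/*
 * Worked example for ext4_update_final_de() , with numbers chosen for
 * illustration : suppose the inline area grows from old_size = 40 to
 * new_size = 100 bytes and the last entry found has de_len = 16 . Its
 * rec_len is rewritten to 16 + 100 - 40 = 76 , so the final entry once
 * again covers the space up to the new end of the area . If old_size
 * is 0 , a single empty entry ( inode 0 ) spanning all of new_size is
 * created instead .
 */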
static int ext4_update_inline_dir ( handle_t * handle , struct inode * dir ,
struct ext4_iloc * iloc )
{
int ret ;
int old_size = EXT4_I ( dir ) - > i_inline_size - EXT4_MIN_INLINE_DATA_SIZE ;
int new_size = get_max_inline_xattr_value_size ( dir , iloc ) ;
2021-03-19 10:34:13 +03:00
if ( new_size - old_size < = ext4_dir_rec_len ( 1 , NULL ) )
2012-12-10 23:05:59 +04:00
return - ENOSPC ;
ret = ext4_update_inline_data ( handle , dir ,
new_size + EXT4_MIN_INLINE_DATA_SIZE ) ;
if ( ret )
return ret ;
ext4_update_final_de ( ext4_get_inline_xattr_pos ( dir , iloc ) , old_size ,
EXT4_I ( dir ) - > i_inline_size -
EXT4_MIN_INLINE_DATA_SIZE ) ;
dir - > i_size = EXT4_I ( dir ) - > i_disksize = EXT4_I ( dir ) - > i_inline_size ;
return 0 ;
}
static void ext4_restore_inline_data ( handle_t * handle , struct inode * inode ,
struct ext4_iloc * iloc ,
void * buf , int inline_size )
{
2022-01-17 15:11:47 +03:00
int ret ;
ret = ext4_create_inline_data ( handle , inode , inline_size ) ;
if ( ret ) {
ext4_msg ( inode - > i_sb , KERN_EMERG ,
" error restoring inline_data for inode -- potential data loss! (inode %lu, error %d) " ,
inode - > i_ino , ret ) ;
return ;
}
2012-12-10 23:05:59 +04:00
ext4_write_inline_data ( inode , iloc , buf , 0 , inline_size ) ;
ext4_set_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) ;
}
static int ext4_finish_convert_inline_dir ( handle_t * handle ,
struct inode * inode ,
struct buffer_head * dir_block ,
void * buf ,
int inline_size )
{
int err , csum_size = 0 , header_size = 0 ;
struct ext4_dir_entry_2 * de ;
void * target = dir_block - > b_data ;
/*
* First create " . " and " .. " and then copy the dir information
* back to the block .
*/
2022-04-01 11:13:21 +03:00
de = target ;
2012-12-10 23:05:59 +04:00
de = ext4_init_dot_dotdot ( inode , de ,
inode - > i_sb - > s_blocksize , csum_size ,
le32_to_cpu ( ( ( struct ext4_dir_entry_2 * ) buf ) - > inode ) , 1 ) ;
header_size = ( void * ) de - target ;
memcpy ( ( void * ) de , buf + EXT4_INLINE_DOTDOT_SIZE ,
inline_size - EXT4_INLINE_DOTDOT_SIZE ) ;
2014-10-13 11:36:16 +04:00
if ( ext4_has_metadata_csum ( inode - > i_sb ) )
2012-12-10 23:05:59 +04:00
csum_size = sizeof ( struct ext4_dir_entry_tail ) ;
inode - > i_size = inode - > i_sb - > s_blocksize ;
i_size_write ( inode , inode - > i_sb - > s_blocksize ) ;
EXT4_I ( inode ) - > i_disksize = inode - > i_sb - > s_blocksize ;
ext4_update_final_de ( dir_block - > b_data ,
inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size ,
inode - > i_sb - > s_blocksize - csum_size ) ;
2019-06-21 23:31:47 +03:00
if ( csum_size )
ext4_initialize_dirent_tail ( dir_block ,
inode - > i_sb - > s_blocksize ) ;
2012-12-10 23:05:59 +04:00
set_buffer_uptodate ( dir_block ) ;
2023-05-07 04:04:01 +03:00
unlock_buffer ( dir_block ) ;
2019-06-21 22:49:26 +03:00
err = ext4_handle_dirty_dirblock ( handle , inode , dir_block ) ;
2012-12-10 23:05:59 +04:00
if ( err )
2017-03-15 21:52:02 +03:00
return err ;
2012-12-10 23:05:59 +04:00
set_buffer_verified ( dir_block ) ;
2017-03-15 21:52:02 +03:00
return ext4_mark_inode_dirty ( handle , inode ) ;
2012-12-10 23:05:59 +04:00
}
static int ext4_convert_inline_data_nolock ( handle_t * handle ,
struct inode * inode ,
struct ext4_iloc * iloc )
{
int error ;
void * buf = NULL ;
struct buffer_head * data_bh = NULL ;
struct ext4_map_blocks map ;
int inline_size ;
inline_size = ext4_get_inline_size ( inode ) ;
buf = kmalloc ( inline_size , GFP_NOFS ) ;
if ( ! buf ) {
error = - ENOMEM ;
goto out ;
}
error = ext4_read_inline_data ( inode , buf , inline_size , iloc ) ;
if ( error < 0 )
goto out ;
2014-07-28 21:06:26 +04:00
/*
* Make sure the inline directory entries pass checks before we try to
* convert them , so that we avoid touching stuff that needs fsck .
*/
if ( S_ISDIR ( inode - > i_mode ) ) {
error = ext4_check_all_de ( inode , iloc - > bh ,
buf + EXT4_INLINE_DOTDOT_SIZE ,
inline_size - EXT4_INLINE_DOTDOT_SIZE ) ;
if ( error )
goto out ;
}
2012-12-10 23:05:59 +04:00
error = ext4_destroy_inline_data_nolock ( handle , inode ) ;
if ( error )
goto out ;
map . m_lblk = 0 ;
map . m_len = 1 ;
map . m_flags = 0 ;
error = ext4_map_blocks ( handle , inode , & map , EXT4_GET_BLOCKS_CREATE ) ;
if ( error < 0 )
goto out_restore ;
if ( ! ( map . m_flags & EXT4_MAP_MAPPED ) ) {
error = - EIO ;
goto out_restore ;
}
data_bh = sb_getblk ( inode - > i_sb , map . m_pblk ) ;
if ( ! data_bh ) {
2013-01-13 01:19:36 +04:00
error = - ENOMEM ;
2012-12-10 23:05:59 +04:00
goto out_restore ;
}
lock_buffer ( data_bh ) ;
2021-08-16 12:57:04 +03:00
error = ext4_journal_get_create_access ( handle , inode - > i_sb , data_bh ,
EXT4_JTR_NONE ) ;
2012-12-10 23:05:59 +04:00
if ( error ) {
unlock_buffer ( data_bh ) ;
error = - EIO ;
goto out_restore ;
}
memset ( data_bh - > b_data , 0 , inode - > i_sb - > s_blocksize ) ;
if ( ! S_ISDIR ( inode - > i_mode ) ) {
memcpy ( data_bh - > b_data , buf , inline_size ) ;
set_buffer_uptodate ( data_bh ) ;
2023-05-07 04:04:01 +03:00
unlock_buffer ( data_bh ) ;
2012-12-10 23:05:59 +04:00
error = ext4_handle_dirty_metadata ( handle ,
inode , data_bh ) ;
} else {
error = ext4_finish_convert_inline_dir ( handle , inode , data_bh ,
buf , inline_size ) ;
}
out_restore :
if ( error )
ext4_restore_inline_data ( handle , inode , iloc , buf , inline_size ) ;
out :
brelse ( data_bh ) ;
kfree ( buf ) ;
return error ;
}
/*
* Try to add the new entry to the inline data .
* If it succeeds , return 0. If not , extend the inline dir and copy the
* data to the newly created block .
*/
2015-05-18 20:14:47 +03:00
int ext4_try_add_inline_entry ( handle_t * handle , struct ext4_filename * fname ,
2016-01-09 00:00:31 +03:00
struct inode * dir , struct inode * inode )
2012-12-10 23:05:59 +04:00
{
2020-04-27 04:34:37 +03:00
int ret , ret2 , inline_size , no_expand ;
2012-12-10 23:05:59 +04:00
void * inline_start ;
struct ext4_iloc iloc ;
ret = ext4_get_inode_loc ( dir , & iloc ) ;
if ( ret )
return ret ;
2017-01-12 05:50:46 +03:00
ext4_write_lock_xattr ( dir , & no_expand ) ;
2012-12-10 23:05:59 +04:00
if ( ! ext4_has_inline_data ( dir ) )
goto out ;
inline_start = ( void * ) ext4_raw_inode ( & iloc ) - > i_block +
EXT4_INLINE_DOTDOT_SIZE ;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE ;
2016-01-09 00:00:31 +03:00
ret = ext4_add_dirent_to_inline ( handle , fname , dir , inode , & iloc ,
2012-12-10 23:05:59 +04:00
inline_start , inline_size ) ;
if ( ret ! = - ENOSPC )
goto out ;
/* check whether it can be inserted into the inline xattr space. */
inline_size = EXT4_I ( dir ) - > i_inline_size -
EXT4_MIN_INLINE_DATA_SIZE ;
if ( ! inline_size ) {
/* Try to use the xattr space. */
ret = ext4_update_inline_dir ( handle , dir , & iloc ) ;
if ( ret & & ret ! = - ENOSPC )
goto out ;
inline_size = EXT4_I ( dir ) - > i_inline_size -
EXT4_MIN_INLINE_DATA_SIZE ;
}
if ( inline_size ) {
inline_start = ext4_get_inline_xattr_pos ( dir , & iloc ) ;
2016-01-09 00:00:31 +03:00
ret = ext4_add_dirent_to_inline ( handle , fname , dir ,
2015-05-18 20:14:47 +03:00
inode , & iloc , inline_start ,
inline_size ) ;
2012-12-10 23:05:59 +04:00
if ( ret ! = - ENOSPC )
goto out ;
}
/*
* The inline space is filled up , so create a new block for it .
* As the extent tree will be created , we have to save the inline
* dir first .
*/
ret = ext4_convert_inline_data_nolock ( handle , dir , & iloc ) ;
out :
2017-01-12 05:50:46 +03:00
ext4_write_unlock_xattr ( dir , & no_expand ) ;
2020-04-27 04:34:37 +03:00
ret2 = ext4_mark_inode_dirty ( handle , dir ) ;
if ( unlikely ( ret2 & & ! ret ) )
ret = ret2 ;
2012-12-10 23:05:59 +04:00
brelse ( iloc . bh ) ;
return ret ;
}
2013-04-20 01:53:09 +04:00
/*
* This function fills a red - black tree with information from an
* inlined dir . It returns the number of directory entries loaded
* into the tree . If there is an error , it is returned in err .
*/
2019-06-22 04:57:00 +03:00
int ext4_inlinedir_to_tree ( struct file * dir_file ,
struct inode * dir , ext4_lblk_t block ,
struct dx_hash_info * hinfo ,
__u32 start_hash , __u32 start_minor_hash ,
int * has_inline_data )
2013-04-20 01:53:09 +04:00
{
int err = 0 , count = 0 ;
unsigned int parent_ino ;
int pos ;
struct ext4_dir_entry_2 * de ;
struct inode * inode = file_inode ( dir_file ) ;
int ret , inline_size = 0 ;
struct ext4_iloc iloc ;
void * dir_buf = NULL ;
struct ext4_dir_entry_2 fake ;
2016-07-10 21:01:03 +03:00
struct fscrypt_str tmp_str ;
2013-04-20 01:53:09 +04:00
ret = ext4_get_inode_loc ( inode , & iloc ) ;
if ( ret )
return ret ;
down_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( inode ) ) {
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
* has_inline_data = 0 ;
goto out ;
}
inline_size = ext4_get_inline_size ( inode ) ;
dir_buf = kmalloc ( inline_size , GFP_NOFS ) ;
if ( ! dir_buf ) {
ret = - ENOMEM ;
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
goto out ;
}
ret = ext4_read_inline_data ( inode , dir_buf , inline_size , & iloc ) ;
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ret < 0 )
goto out ;
pos = 0 ;
parent_ino = le32_to_cpu ( ( ( struct ext4_dir_entry_2 * ) dir_buf ) - > inode ) ;
while ( pos < inline_size ) {
/*
* As an inlined dir doesn ' t store any information about ' . ' and
* only the inode number of ' . . ' is stored , we have to handle
* them differently .
*/
if ( pos = = 0 ) {
fake . inode = cpu_to_le32 ( inode - > i_ino ) ;
fake . name_len = 1 ;
strcpy ( fake . name , " . " ) ;
fake . rec_len = ext4_rec_len_to_disk (
2021-03-19 10:34:13 +03:00
ext4_dir_rec_len ( fake . name_len , NULL ) ,
inline_size ) ;
2013-04-20 01:53:09 +04:00
ext4_set_de_type ( inode - > i_sb , & fake , S_IFDIR ) ;
de = & fake ;
pos = EXT4_INLINE_DOTDOT_OFFSET ;
} else if ( pos = = EXT4_INLINE_DOTDOT_OFFSET ) {
fake . inode = cpu_to_le32 ( parent_ino ) ;
fake . name_len = 2 ;
strcpy ( fake . name , " .. " ) ;
fake . rec_len = ext4_rec_len_to_disk (
2021-03-19 10:34:13 +03:00
ext4_dir_rec_len ( fake . name_len , NULL ) ,
inline_size ) ;
2013-04-20 01:53:09 +04:00
ext4_set_de_type ( inode - > i_sb , & fake , S_IFDIR ) ;
de = & fake ;
pos = EXT4_INLINE_DOTDOT_SIZE ;
} else {
de = ( struct ext4_dir_entry_2 * ) ( dir_buf + pos ) ;
pos + = ext4_rec_len_from_disk ( de - > rec_len , inline_size ) ;
if ( ext4_check_dir_entry ( inode , dir_file , de ,
iloc . bh , dir_buf ,
inline_size , pos ) ) {
ret = count ;
goto out ;
}
}
2021-03-19 10:34:13 +03:00
if ( ext4_hash_in_dirent ( dir ) ) {
hinfo - > hash = EXT4_DIRENT_HASH ( de ) ;
hinfo - > minor_hash = EXT4_DIRENT_MINOR_HASH ( de ) ;
} else {
ext4fs_dirhash ( dir , de - > name , de - > name_len , hinfo ) ;
}
2013-04-20 01:53:09 +04:00
if ( ( hinfo - > hash < start_hash ) | |
( ( hinfo - > hash = = start_hash ) & &
( hinfo - > minor_hash < start_minor_hash ) ) )
continue ;
if ( de - > inode = = 0 )
continue ;
2015-04-12 07:56:26 +03:00
tmp_str . name = de - > name ;
tmp_str . len = de - > name_len ;
err = ext4_htree_store_dirent ( dir_file , hinfo - > hash ,
hinfo - > minor_hash , de , & tmp_str ) ;
2013-04-20 01:53:09 +04:00
if ( err ) {
2019-08-12 21:29:38 +03:00
ret = err ;
2013-04-20 01:53:09 +04:00
goto out ;
}
count + + ;
}
ret = count ;
out :
kfree ( dir_buf ) ;
brelse ( iloc . bh ) ;
return ret ;
}
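/*
 * Layout recap for the loop above : an inline dir stores no " . " entry
 * at all and only the 4-byte inode number of " .. " , so the buffer is :
 *
 *	bytes 0..3 : le32 inode number of the parent ( " .. " )
 *	bytes 4.. : ordinary struct ext4_dir_entry_2 records
 *
 * pos == 0 and pos == EXT4_INLINE_DOTDOT_OFFSET ( 2 ) are loop-state
 * markers rather than byte offsets ; the fake dirents for " . " and
 * " .. " are synthesized there , and the real records are walked from
 * EXT4_INLINE_DOTDOT_SIZE ( 4 ) onwards .
 */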
2013-04-20 01:55:33 +04:00
/*
* So this function is called when the volume is mkfsed with
* dir_index disabled . In order to keep f_pos persistent
* after we convert from an inlined dir to a block - based one ,
* we just pretend that we are a normal dir and return the
* offset as if ' . ' and ' . . ' were really present .
*
*/
2013-05-18 00:08:53 +04:00
int ext4_read_inline_dir ( struct file * file ,
struct dir_context * ctx ,
2012-12-10 23:05:59 +04:00
int * has_inline_data )
{
unsigned int offset , parent_ino ;
2013-05-18 00:08:53 +04:00
int i ;
2012-12-10 23:05:59 +04:00
struct ext4_dir_entry_2 * de ;
struct super_block * sb ;
2013-05-18 00:08:53 +04:00
struct inode * inode = file_inode ( file ) ;
2012-12-10 23:05:59 +04:00
int ret , inline_size = 0 ;
struct ext4_iloc iloc ;
void * dir_buf = NULL ;
2013-04-20 01:55:33 +04:00
int dotdot_offset , dotdot_size , extra_offset , extra_size ;
2012-12-10 23:05:59 +04:00
ret = ext4_get_inode_loc ( inode , & iloc ) ;
if ( ret )
return ret ;
down_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( inode ) ) {
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
* has_inline_data = 0 ;
goto out ;
}
inline_size = ext4_get_inline_size ( inode ) ;
dir_buf = kmalloc ( inline_size , GFP_NOFS ) ;
if ( ! dir_buf ) {
ret = - ENOMEM ;
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
goto out ;
}
ret = ext4_read_inline_data ( inode , dir_buf , inline_size , & iloc ) ;
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ret < 0 )
goto out ;
2013-10-30 16:07:20 +04:00
ret = 0 ;
2012-12-10 23:05:59 +04:00
sb = inode - > i_sb ;
parent_ino = le32_to_cpu ( ( ( struct ext4_dir_entry_2 * ) dir_buf ) - > inode ) ;
2013-05-18 00:08:53 +04:00
offset = ctx - > pos ;
2013-04-20 01:55:33 +04:00
/*
* dotdot_offset and dotdot_size are the real offset and
* size for " .. " and " . " if the dir is block based , while
* their real size in an inline dir is only EXT4_INLINE_DOTDOT_SIZE .
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration .
*/
2021-03-19 10:34:13 +03:00
dotdot_offset = ext4_dir_rec_len ( 1 , NULL ) ;
dotdot_size = dotdot_offset + ext4_dir_rec_len ( 2 , NULL ) ;
2013-04-20 01:55:33 +04:00
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE ;
extra_size = extra_offset + inline_size ;
2012-12-10 23:05:59 +04:00
2013-05-18 00:08:53 +04:00
/*
* If the version has changed since the last call to
* readdir ( 2 ) , then we might be pointing to an invalid
* dirent right now . Scan from the start of the inline
* dir to make sure .
*/
2018-02-01 16:15:25 +03:00
if ( ! inode_eq_iversion ( inode , file - > f_version ) ) {
2013-05-18 00:08:53 +04:00
for ( i = 0 ; i < extra_size & & i < offset ; ) {
/*
* " . " is with offset 0 and
* " .. " is dotdot_offset .
*/
if ( ! i ) {
i = dotdot_offset ;
continue ;
} else if ( i = = dotdot_offset ) {
i = dotdot_size ;
2013-04-20 01:55:33 +04:00
continue ;
}
2013-05-18 00:08:53 +04:00
/* for other entry, the real offset in
* the buf has to be tuned accordingly .
*/
de = ( struct ext4_dir_entry_2 * )
( dir_buf + i - extra_offset ) ;
/* It's too expensive to do a full
* dirent test each time round this
* loop , but we do have to test at
* least that it is non - zero . A
* failure will be detected in the
* dirent test below . */
if ( ext4_rec_len_from_disk ( de - > rec_len , extra_size )
2021-03-19 10:34:13 +03:00
< ext4_dir_rec_len ( 1 , NULL ) )
2013-05-18 00:08:53 +04:00
break ;
i + = ext4_rec_len_from_disk ( de - > rec_len ,
extra_size ) ;
}
offset = i ;
ctx - > pos = offset ;
2018-01-09 16:21:39 +03:00
file - > f_version = inode_query_iversion ( inode ) ;
2013-05-18 00:08:53 +04:00
}
2012-12-10 23:05:59 +04:00
2013-05-18 00:08:53 +04:00
while ( ctx - > pos < extra_size ) {
if ( ctx - > pos = = 0 ) {
if ( ! dir_emit ( ctx , " . " , 1 , inode - > i_ino , DT_DIR ) )
goto out ;
ctx - > pos = dotdot_offset ;
continue ;
}
2012-12-10 23:05:59 +04:00
2013-05-18 00:08:53 +04:00
if ( ctx - > pos = = dotdot_offset ) {
if ( ! dir_emit ( ctx , " .. " , 2 , parent_ino , DT_DIR ) )
goto out ;
ctx - > pos = dotdot_size ;
continue ;
}
2012-12-10 23:05:59 +04:00
2013-05-18 00:08:53 +04:00
de = ( struct ext4_dir_entry_2 * )
( dir_buf + ctx - > pos - extra_offset ) ;
if ( ext4_check_dir_entry ( inode , file , de , iloc . bh , dir_buf ,
extra_size , ctx - > pos ) )
goto out ;
if ( le32_to_cpu ( de - > inode ) ) {
if ( ! dir_emit ( ctx , de - > name , de - > name_len ,
le32_to_cpu ( de - > inode ) ,
get_dtype ( sb , de - > file_type ) ) )
2012-12-10 23:05:59 +04:00
goto out ;
}
2013-05-18 00:08:53 +04:00
ctx - > pos + = ext4_rec_len_from_disk ( de - > rec_len , extra_size ) ;
2012-12-10 23:05:59 +04:00
}
out :
kfree ( dir_buf ) ;
brelse ( iloc . bh ) ;
return ret ;
}
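/*
 * Worked f_pos arithmetic for ext4_read_inline_dir() , assuming the
 * classic rec_len formula ext4_dir_rec_len(n, NULL) == ALIGN(8 + n, 4)
 * ( no casefold hash ): dotdot_offset = ext4_dir_rec_len(1, NULL) = 12 ,
 * dotdot_size = 12 + ext4_dir_rec_len(2, NULL) = 24 , and extra_offset
 * = 24 - EXT4_INLINE_DOTDOT_SIZE = 20 . So " . " is reported at f_pos 0 ,
 * " .. " at f_pos 12 , and an entry stored at buffer offset b at
 * f_pos b + 20 , exactly as if the dir were block based .
 */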
2022-06-30 12:01:00 +03:00
void * ext4_read_inline_link ( struct inode * inode )
{
struct ext4_iloc iloc ;
int ret , inline_size ;
void * link ;
ret = ext4_get_inode_loc ( inode , & iloc ) ;
if ( ret )
return ERR_PTR ( ret ) ;
ret = - ENOMEM ;
inline_size = ext4_get_inline_size ( inode ) ;
link = kmalloc ( inline_size + 1 , GFP_NOFS ) ;
if ( ! link )
goto out ;
ret = ext4_read_inline_data ( inode , link , inline_size , & iloc ) ;
if ( ret < 0 ) {
kfree ( link ) ;
goto out ;
}
nd_terminate_link ( link , inode - > i_size , ret ) ;
out :
if ( ret < 0 )
link = ERR_PTR ( ret ) ;
brelse ( iloc . bh ) ;
return link ;
}
2012-12-10 23:06:01 +04:00
struct buffer_head * ext4_get_first_inline_block ( struct inode * inode ,
struct ext4_dir_entry_2 * * parent_de ,
int * retval )
{
struct ext4_iloc iloc ;
* retval = ext4_get_inode_loc ( inode , & iloc ) ;
if ( * retval )
return NULL ;
* parent_de = ( struct ext4_dir_entry_2 * ) ext4_raw_inode ( & iloc ) - > i_block ;
return iloc . bh ;
}
2012-12-10 23:05:59 +04:00
/*
* Try to create the inline data for the new dir .
* If it succeeds , return 0 , otherwise return the error .
* In case of ENOSPC , the caller should create the normal disk layout dir .
*/
int ext4_try_create_inline_dir ( handle_t * handle , struct inode * parent ,
struct inode * inode )
{
int ret , inline_size = EXT4_MIN_INLINE_DATA_SIZE ;
struct ext4_iloc iloc ;
struct ext4_dir_entry_2 * de ;
ret = ext4_get_inode_loc ( inode , & iloc ) ;
if ( ret )
return ret ;
ret = ext4_prepare_inline_data ( handle , inode , inline_size ) ;
if ( ret )
goto out ;
/*
* For an inline dir , we only save the inode information for " .. "
* and create a fake dentry to cover the remaining space .
*/
de = ( struct ext4_dir_entry_2 * ) ext4_raw_inode ( & iloc ) - > i_block ;
de - > inode = cpu_to_le32 ( parent - > i_ino ) ;
de = ( struct ext4_dir_entry_2 * ) ( ( void * ) de + EXT4_INLINE_DOTDOT_SIZE ) ;
de - > inode = 0 ;
de - > rec_len = ext4_rec_len_to_disk (
inline_size - EXT4_INLINE_DOTDOT_SIZE ,
inline_size ) ;
set_nlink ( inode , 2 ) ;
inode - > i_size = EXT4_I ( inode ) - > i_disksize = inline_size ;
out :
brelse ( iloc . bh ) ;
return ret ;
}
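/*
 * Resulting layout sketch for a fresh inline dir , assuming the usual
 * EXT4_N_BLOCKS of 15 so that EXT4_MIN_INLINE_DATA_SIZE is 60 bytes :
 *
 *	bytes 0..3 : le32 inode number of the parent ( the " .. " entry )
 *	bytes 4..59 : one fake dirent with inode = 0 and
 *	 rec_len = 60 - EXT4_INLINE_DOTDOT_SIZE = 56
 *
 * i_size and i_disksize are both set to 60 , and nlink to 2 ( " . " is
 * implicit , " .. " is the stored parent reference ) .
 */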
2012-12-10 23:06:00 +04:00
struct buffer_head * ext4_find_inline_entry ( struct inode * dir ,
2015-05-18 20:14:47 +03:00
struct ext4_filename * fname ,
2012-12-10 23:06:00 +04:00
struct ext4_dir_entry_2 * * res_dir ,
int * has_inline_data )
{
int ret ;
struct ext4_iloc iloc ;
void * inline_start ;
int inline_size ;
if ( ext4_get_inode_loc ( dir , & iloc ) )
return NULL ;
down_read ( & EXT4_I ( dir ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( dir ) ) {
* has_inline_data = 0 ;
goto out ;
}
inline_start = ( void * ) ext4_raw_inode ( & iloc ) - > i_block +
EXT4_INLINE_DOTDOT_SIZE ;
inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE ;
2015-05-18 20:14:47 +03:00
ret = ext4_search_dir ( iloc . bh , inline_start , inline_size ,
2017-05-25 01:10:49 +03:00
dir , fname , 0 , res_dir ) ;
2012-12-10 23:06:00 +04:00
if ( ret = = 1 )
goto out_find ;
if ( ret < 0 )
goto out ;
if ( ext4_get_inline_size ( dir ) = = EXT4_MIN_INLINE_DATA_SIZE )
goto out ;
inline_start = ext4_get_inline_xattr_pos ( dir , & iloc ) ;
inline_size = ext4_get_inline_size ( dir ) - EXT4_MIN_INLINE_DATA_SIZE ;
2015-05-18 20:14:47 +03:00
ret = ext4_search_dir ( iloc . bh , inline_start , inline_size ,
2017-05-25 01:10:49 +03:00
dir , fname , 0 , res_dir ) ;
2012-12-10 23:06:00 +04:00
if ( ret = = 1 )
goto out_find ;
out :
brelse ( iloc . bh ) ;
iloc . bh = NULL ;
out_find :
up_read ( & EXT4_I ( dir ) - > xattr_sem ) ;
return iloc . bh ;
}
2012-12-10 23:06:00 +04:00
int ext4_delete_inline_entry ( handle_t * handle ,
struct inode * dir ,
struct ext4_dir_entry_2 * de_del ,
struct buffer_head * bh ,
int * has_inline_data )
{
2017-01-12 05:50:46 +03:00
int err , inline_size , no_expand ;
2012-12-10 23:06:00 +04:00
struct ext4_iloc iloc ;
void * inline_start ;
err = ext4_get_inode_loc ( dir , & iloc ) ;
if ( err )
return err ;
2017-01-12 05:50:46 +03:00
ext4_write_lock_xattr ( dir , & no_expand ) ;
2012-12-10 23:06:00 +04:00
if ( ! ext4_has_inline_data ( dir ) ) {
* has_inline_data = 0 ;
goto out ;
}
if ( ( void * ) de_del - ( ( void * ) ext4_raw_inode ( & iloc ) - > i_block ) <
EXT4_MIN_INLINE_DATA_SIZE ) {
inline_start = ( void * ) ext4_raw_inode ( & iloc ) - > i_block +
EXT4_INLINE_DOTDOT_SIZE ;
inline_size = EXT4_MIN_INLINE_DATA_SIZE -
EXT4_INLINE_DOTDOT_SIZE ;
} else {
inline_start = ext4_get_inline_xattr_pos ( dir , & iloc ) ;
inline_size = ext4_get_inline_size ( dir ) -
EXT4_MIN_INLINE_DATA_SIZE ;
}
2014-05-13 06:06:43 +04:00
BUFFER_TRACE ( bh , " get_write_access " ) ;
2021-08-16 12:57:04 +03:00
err = ext4_journal_get_write_access ( handle , dir - > i_sb , bh ,
EXT4_JTR_NONE ) ;
2012-12-10 23:06:00 +04:00
if ( err )
goto out ;
2020-08-10 11:07:05 +03:00
err = ext4_generic_delete_entry ( dir , de_del , bh ,
2012-12-10 23:06:00 +04:00
inline_start , inline_size , 0 ) ;
if ( err )
goto out ;
ext4_show_inline_dir ( dir , iloc . bh , inline_start , inline_size ) ;
out :
2017-01-12 05:50:46 +03:00
ext4_write_unlock_xattr ( dir , & no_expand ) ;
2017-01-12 06:14:49 +03:00
if ( likely ( err = = 0 ) )
err = ext4_mark_inode_dirty ( handle , dir ) ;
2012-12-10 23:06:00 +04:00
brelse ( iloc . bh ) ;
if ( err ! = - ENOENT )
ext4_std_error ( dir - > i_sb , err ) ;
return err ;
}
2012-12-10 23:06:01 +04:00
/*
* Get the inline dentry at offset .
*/
static inline struct ext4_dir_entry_2 *
ext4_get_inline_entry ( struct inode * inode ,
struct ext4_iloc * iloc ,
unsigned int offset ,
void * * inline_start ,
int * inline_size )
{
void * inline_pos ;
BUG_ON ( offset > ext4_get_inline_size ( inode ) ) ;
if ( offset < EXT4_MIN_INLINE_DATA_SIZE ) {
inline_pos = ( void * ) ext4_raw_inode ( iloc ) - > i_block ;
* inline_size = EXT4_MIN_INLINE_DATA_SIZE ;
} else {
inline_pos = ext4_get_inline_xattr_pos ( inode , iloc ) ;
offset - = EXT4_MIN_INLINE_DATA_SIZE ;
* inline_size = ext4_get_inline_size ( inode ) -
EXT4_MIN_INLINE_DATA_SIZE ;
}
if ( inline_start )
* inline_start = inline_pos ;
return ( struct ext4_dir_entry_2 * ) ( inline_pos + offset ) ;
}
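/*
 * Worked example for ext4_get_inline_entry() , with an assumed total
 * inline size of 120 bytes ( 60 in i_block plus 60 in the xattr area ):
 * offset 50 resolves to i_block + 50 with *inline_size = 60 , while
 * offset 70 resolves to ( xattr value start ) + 10 with
 * *inline_size = 120 - 60 = 60 . Offsets below
 * EXT4_MIN_INLINE_DATA_SIZE always address i_block ; anything beyond
 * it addresses the system.data xattr value .
 */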
2016-07-10 21:01:03 +03:00
bool empty_inline_dir ( struct inode * dir , int * has_inline_data )
2012-12-10 23:06:01 +04:00
{
int err , inline_size ;
struct ext4_iloc iloc ;
2018-08-27 16:22:45 +03:00
size_t inline_len ;
2012-12-10 23:06:01 +04:00
void * inline_pos ;
unsigned int offset ;
struct ext4_dir_entry_2 * de ;
ext4: fix fs corruption when trying to remove a non-empty directory with IO error
We inject an IO error when rmdir'ing a non-empty directory, and get the
following issue:
step1: mkfs.ext4 -F /dev/sda
step2: mount /dev/sda test
step3: cd test
step4: mkdir -p 1/2
step5: rmdir 1
[  110.920551] ext4_empty_dir: inject fault
[  110.921926] EXT4-fs warning (device sda): ext4_rmdir:3113: inode #12:
comm rmdir: empty directory '1' has too many links (3)
step6: cd ..
step7: umount test
step8: fsck.ext4 -f /dev/sda
e2fsck 1.42.9 (28-Dec-2013)
Pass 1: Checking inodes, blocks, and sizes
Pass 2: Checking directory structure
Entry '..' in .../??? (13) has deleted/unused inode 12. Clear<y>? yes
Pass 3: Checking directory connectivity
Unconnected directory inode 13 (...)
Connect to /lost+found<y>? yes
Pass 4: Checking reference counts
Inode 13 ref count is 3, should be 2. Fix<y>? yes
Pass 5: Checking group summary information
/dev/sda: ***** FILE SYSTEM WAS MODIFIED *****
/dev/sda: 12/131072 files (0.0% non-contiguous), 26157/524288 blocks
ext4_rmdir
	if (!ext4_empty_dir(inode))
		goto end_rmdir;
ext4_empty_dir
	bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
	if (IS_ERR(bh))
		return true;
Now, if reading the directory block fails, 'ext4_empty_dir' returns true and
the directory is assumed to be empty, which obviously leads to the issue
above. To solve this, make 'ext4_empty_dir' return false when reading the
directory block fails, so that we avoid making things worse when the file
system is already corrupted.
Signed-off-by: Ye Bin <yebin10@huawei.com>
Cc: stable@kernel.org
Link: https://lore.kernel.org/r/20220228024815.3952506-1-yebin10@huawei.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
2022-02-28 05:48:15 +03:00
bool ret = false ;
2012-12-10 23:06:01 +04:00
err = ext4_get_inode_loc ( dir , & iloc ) ;
if ( err ) {
2020-03-29 02:33:43 +03:00
EXT4_ERROR_INODE_ERR ( dir , - err ,
" error %d getting inode %lu block " ,
err , dir - > i_ino ) ;
ext4: fix fs corruption when trying to remove a non-empty directory with IO error
2022-02-28 05:48:15 +03:00
return false ;
2012-12-10 23:06:01 +04:00
}
down_read ( & EXT4_I ( dir ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( dir ) ) {
* has_inline_data = 0 ;
ext4: fix fs corruption when trying to remove a non-empty directory with IO error
2022-02-28 05:48:15 +03:00
ret = true ;
2012-12-10 23:06:01 +04:00
goto out ;
}
de = ( struct ext4_dir_entry_2 * ) ext4_raw_inode ( & iloc ) - > i_block ;
if ( ! le32_to_cpu ( de - > inode ) ) {
ext4_warning ( dir - > i_sb ,
" bad inline directory (dir #%lu) - no `..' " ,
dir - > i_ino ) ;
goto out ;
}
2018-08-27 16:22:45 +03:00
inline_len = ext4_get_inline_size ( dir ) ;
2012-12-10 23:06:01 +04:00
offset = EXT4_INLINE_DOTDOT_SIZE ;
2018-08-27 16:22:45 +03:00
while ( offset < inline_len ) {
2012-12-10 23:06:01 +04:00
de = ext4_get_inline_entry ( dir , & iloc , offset ,
& inline_pos , & inline_size ) ;
if ( ext4_check_dir_entry ( dir , NULL , de ,
iloc . bh , inline_pos ,
inline_size , offset ) ) {
ext4_warning ( dir - > i_sb ,
" bad inline directory (dir #%lu) - "
" inode %u, rec_len %u, name_len %d "
2016-04-27 08:11:21 +03:00
" inline size %d " ,
2012-12-10 23:06:01 +04:00
dir - > i_ino , le32_to_cpu ( de - > inode ) ,
le16_to_cpu ( de - > rec_len ) , de - > name_len ,
inline_size ) ;
goto out ;
}
if ( le32_to_cpu ( de - > inode ) ) {
goto out ;
}
offset + = ext4_rec_len_from_disk ( de - > rec_len , inline_size ) ;
}
ext4: fix fs corruption when trying to remove a non-empty directory with IO error
2022-02-28 05:48:15 +03:00
ret = true ;
2012-12-10 23:06:01 +04:00
out :
up_read ( & EXT4_I ( dir ) - > xattr_sem ) ;
brelse ( iloc . bh ) ;
return ret ;
}
2012-12-10 23:04:46 +04:00
int ext4_destroy_inline_data ( handle_t * handle , struct inode * inode )
{
2017-01-12 05:50:46 +03:00
int ret , no_expand ;
2012-12-10 23:04:46 +04:00
2017-01-12 05:50:46 +03:00
ext4_write_lock_xattr ( inode , & no_expand ) ;
2012-12-10 23:04:46 +04:00
ret = ext4_destroy_inline_data_nolock ( handle , inode ) ;
2017-01-12 05:50:46 +03:00
ext4_write_unlock_xattr ( inode , & no_expand ) ;
2012-12-10 23:04:46 +04:00
return ret ;
}
2012-12-10 23:06:02 +04:00
2017-10-02 00:57:54 +03:00
int ext4_inline_data_iomap ( struct inode * inode , struct iomap * iomap )
{
__u64 addr ;
int error = - EAGAIN ;
struct ext4_iloc iloc ;
down_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
if ( ! ext4_has_inline_data ( inode ) )
goto out ;
error = ext4_get_inode_loc ( inode , & iloc ) ;
if ( error )
goto out ;
addr = ( __u64 ) iloc . bh - > b_blocknr < < inode - > i_sb - > s_blocksize_bits ;
addr + = ( char * ) ext4_raw_inode ( & iloc ) - iloc . bh - > b_data ;
addr + = offsetof ( struct ext4_inode , i_block ) ;
brelse ( iloc . bh ) ;
iomap - > addr = addr ;
iomap - > offset = 0 ;
iomap - > length = min_t ( loff_t , ext4_get_inline_size ( inode ) ,
i_size_read ( inode ) ) ;
2018-06-01 19:03:06 +03:00
iomap - > type = IOMAP_INLINE ;
iomap - > flags = 0 ;
2017-10-02 00:57:54 +03:00
out :
up_read ( & EXT4_I ( inode ) - > xattr_sem ) ;
return error ;
}
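/*
 * Address arithmetic sketch for ext4_inline_data_iomap() , with assumed
 * numbers : if the iloc buffer head maps disk block 1234 , the block
 * size is 4096 ( s_blocksize_bits = 12 ), the raw inode starts at byte
 * 512 inside that block , and i_block sits at byte 40 of struct
 * ext4_inode ( the usual offset ), then
 *
 *	iomap->addr = 1234 * 4096 + 512 + 40 = 5055016
 *
 * i.e. the byte address of the inline data on disk , while IOMAP_INLINE
 * tells the caller the data lives inside the inode rather than in a
 * data block .
 */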
2017-01-23 03:35:49 +03:00
int ext4_inline_data_truncate ( struct inode * inode , int * has_inline )
2012-12-10 23:06:02 +04:00
{
handle_t * handle ;
2017-01-23 03:35:49 +03:00
int inline_size , value_len , needed_blocks , no_expand , err = 0 ;
2012-12-10 23:06:02 +04:00
size_t i_size ;
void * value = NULL ;
struct ext4_xattr_ibody_find is = {
. s = { . not_found = - ENODATA , } ,
} ;
struct ext4_xattr_info i = {
. name_index = EXT4_XATTR_INDEX_SYSTEM ,
. name = EXT4_XATTR_SYSTEM_DATA ,
} ;
needed_blocks = ext4_writepage_trans_blocks ( inode ) ;
2013-02-09 06:59:22 +04:00
handle = ext4_journal_start ( inode , EXT4_HT_INODE , needed_blocks ) ;
2012-12-10 23:06:02 +04:00
if ( IS_ERR ( handle ) )
2017-01-23 03:35:49 +03:00
return PTR_ERR ( handle ) ;
2012-12-10 23:06:02 +04:00
2017-01-12 05:50:46 +03:00
ext4_write_lock_xattr ( inode , & no_expand ) ;
2012-12-10 23:06:02 +04:00
if ( ! ext4_has_inline_data ( inode ) ) {
2020-11-03 05:29:02 +03:00
ext4_write_unlock_xattr ( inode , & no_expand ) ;
2012-12-10 23:06:02 +04:00
* has_inline = 0 ;
ext4_journal_stop ( handle ) ;
2017-01-23 03:35:49 +03:00
return 0 ;
2012-12-10 23:06:02 +04:00
}
2017-01-23 03:35:49 +03:00
if ( ( err = ext4_orphan_add ( handle , inode ) ) ! = 0 )
2012-12-10 23:06:02 +04:00
goto out ;
2017-01-23 03:35:49 +03:00
if ( ( err = ext4_get_inode_loc ( inode , & is . iloc ) ) ! = 0 )
2012-12-10 23:06:02 +04:00
goto out ;
down_write ( & EXT4_I ( inode ) - > i_data_sem ) ;
i_size = inode - > i_size ;
inline_size = ext4_get_inline_size ( inode ) ;
EXT4_I ( inode ) - > i_disksize = i_size ;
if ( i_size < inline_size ) {
2021-08-19 17:49:26 +03:00
/*
* if there ' s inline data to truncate and this file was
* converted to extents after that inline data was written ,
* the extent status cache must be cleared to avoid leaving
* behind stale delayed allocated extent entries
*/
2023-04-24 06:38:43 +03:00
if ( ! ext4_test_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) )
ext4_es_remove_extent ( inode , 0 , EXT_MAX_BLOCKS ) ;
2021-08-19 17:49:26 +03:00
2012-12-10 23:06:02 +04:00
/* Clear the content in the xattr space. */
if ( inline_size > EXT4_MIN_INLINE_DATA_SIZE ) {
2017-01-23 03:35:49 +03:00
if ( ( err = ext4_xattr_ibody_find ( inode , & i , & is ) ) ! = 0 )
2012-12-10 23:06:02 +04:00
goto out_error ;
BUG_ON ( is . s . not_found ) ;
value_len = le32_to_cpu ( is . s . here - > e_value_size ) ;
value = kmalloc ( value_len , GFP_NOFS ) ;
2017-01-23 03:35:49 +03:00
if ( ! value ) {
err = - ENOMEM ;
2012-12-10 23:06:02 +04:00
goto out_error ;
2017-01-23 03:35:49 +03:00
}
2012-12-10 23:06:02 +04:00
2017-01-23 03:35:49 +03:00
err = ext4_xattr_ibody_get ( inode , i . name_index ,
i . name , value , value_len ) ;
if ( err < = 0 )
2012-12-10 23:06:02 +04:00
goto out_error ;
i . value = value ;
i . value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0 ;
2021-06-03 05:03:02 +03:00
err = ext4_xattr_ibody_set ( handle , inode , & i , & is ) ;
2017-01-23 03:35:49 +03:00
if ( err )
2012-12-10 23:06:02 +04:00
goto out_error ;
}
/* Clear the content within i_blocks. */
2014-01-07 21:58:19 +04:00
if ( i_size < EXT4_MIN_INLINE_DATA_SIZE ) {
void * p = ( void * ) ext4_raw_inode ( & is . iloc ) - > i_block ;
memset ( p + i_size , 0 ,
EXT4_MIN_INLINE_DATA_SIZE - i_size ) ;
}
2012-12-10 23:06:02 +04:00
EXT4_I ( inode ) - > i_inline_size = i_size <
EXT4_MIN_INLINE_DATA_SIZE ?
EXT4_MIN_INLINE_DATA_SIZE : i_size ;
}
out_error :
up_write ( & EXT4_I ( inode ) - > i_data_sem ) ;
out :
brelse ( is . iloc . bh ) ;
2017-01-12 05:50:46 +03:00
ext4_write_unlock_xattr ( inode , & no_expand ) ;
2012-12-10 23:06:02 +04:00
kfree ( value ) ;
if ( inode - > i_nlink )
ext4_orphan_del ( handle , inode ) ;
2017-01-23 03:35:49 +03:00
if ( err = = 0 ) {
2023-07-05 22:01:07 +03:00
inode - > i_mtime = inode_set_ctime_current ( inode ) ;
2017-01-23 03:35:49 +03:00
err = ext4_mark_inode_dirty ( handle , inode ) ;
if ( IS_SYNC ( inode ) )
ext4_handle_sync ( handle ) ;
}
2012-12-10 23:06:02 +04:00
ext4_journal_stop ( handle ) ;
2017-01-23 03:35:49 +03:00
return err ;
2012-12-10 23:06:02 +04:00
}
2012-12-10 23:06:03 +04:00
int ext4_convert_inline_data ( struct inode * inode )
{
2017-01-12 05:50:46 +03:00
int error , needed_blocks , no_expand ;
2012-12-10 23:06:03 +04:00
handle_t * handle ;
struct ext4_iloc iloc ;
if ( ! ext4_has_inline_data ( inode ) ) {
ext4_clear_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) ;
return 0 ;
2022-05-16 15:26:34 +03:00
} else if ( ! ext4_test_inode_state ( inode , EXT4_STATE_MAY_INLINE_DATA ) ) {
/*
* Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
* cleared . This means we are in the middle of moving the
* inline data to a delay - allocated block . Just force writeout
* here to finish conversion .
*/
error = filemap_flush ( inode - > i_mapping ) ;
if ( error )
return error ;
if ( ! ext4_has_inline_data ( inode ) )
return 0 ;
2012-12-10 23:06:03 +04:00
}
needed_blocks = ext4_writepage_trans_blocks ( inode ) ;
iloc . bh = NULL ;
error = ext4_get_inode_loc ( inode , & iloc ) ;
if ( error )
return error ;
2013-02-09 06:59:22 +04:00
handle = ext4_journal_start ( inode , EXT4_HT_WRITE_PAGE , needed_blocks ) ;
2012-12-10 23:06:03 +04:00
if ( IS_ERR ( handle ) ) {
error = PTR_ERR ( handle ) ;
goto out_free ;
}
2017-01-12 05:50:46 +03:00
ext4_write_lock_xattr ( inode , & no_expand ) ;
if ( ext4_has_inline_data ( inode ) )
error = ext4_convert_inline_data_nolock ( handle , inode , & iloc ) ;
ext4_write_unlock_xattr ( inode , & no_expand ) ;
2012-12-10 23:06:03 +04:00
ext4_journal_stop ( handle ) ;
out_free :
brelse ( iloc . bh ) ;
return error ;
}
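/*
 * Minimal usage sketch ( hypothetical caller ): paths that cannot
 * operate on inline data are expected to force the conversion up
 * front , along these lines :
 */
static int example_prepare_block_based_op(struct inode *inode)
{
	/* harmless no-op if the inode never had inline data */
	int err = ext4_convert_inline_data(inode);

	if (err)
		return err;
	/* ... proceed with the block-based operation ... */
	return 0;
}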