From c79c3c34f75d72a066e292b10aa50fc758c97c89 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 24 Feb 2021 12:00:12 -0800 Subject: [PATCH 001/172] hexagon: remove CONFIG_EXPERIMENTAL from defconfigs Since CONFIG_EXPERIMENTAL was removed in 2013, go ahead and drop it from any defconfig files. Link: https://lkml.kernel.org/r/20210115010011.29483-1-rdunlap@infradead.org Fixes: 3d374d09f16f ("final removal of CONFIG_EXPERIMENTAL") Signed-off-by: Randy Dunlap Cc: Kees Cook Cc: Greg Kroah-Hartman Cc: Brian Cain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/configs/comet_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/hexagon/configs/comet_defconfig b/arch/hexagon/configs/comet_defconfig index e324f65f41e7..f19ae2ab0aaa 100644 --- a/arch/hexagon/configs/comet_defconfig +++ b/arch/hexagon/configs/comet_defconfig @@ -1,7 +1,6 @@ CONFIG_SMP=y CONFIG_DEFAULT_MMAP_MIN_ADDR=0 CONFIG_HZ_100=y -CONFIG_EXPERIMENTAL=y CONFIG_CROSS_COMPILE="hexagon-" CONFIG_LOCALVERSION="-smp" # CONFIG_LOCALVERSION_AUTO is not set From 6b294bf6b4f6cc4a2cf2029dff31010ab4addffc Mon Sep 17 00:00:00 2001 From: tangchunyou Date: Wed, 24 Feb 2021 12:00:15 -0800 Subject: [PATCH 002/172] scripts/spelling.txt: increase error-prone spell checking Add the "maping" misspelling (of "mapping") to the spell-check list. Link: https://lkml.kernel.org/r/20210121092125.2663-1-tangchunyou@163.com Signed-off-by: WenZhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 953f4a2de1e5..ebcb27bda1f5 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -875,6 +875,7 @@ manger||manager manoeuvering||maneuvering manufaucturing||manufacturing mappping||mapping +maping||mapping matchs||matches mathimatical||mathematical mathimatic||mathematic From 02bbbc4b554ed2d971c5f49950244d8f3d0b112e Mon Sep 17 00:00:00 2001 From: zuoqilin Date: Wed, 24 Feb 2021 12:00:18 -0800 Subject: [PATCH 003/172] scripts/spelling.txt: check for "exeeds" Add the "exeeds" misspelling (of "exceeds") to the spell-check list. Link: https://lkml.kernel.org/r/20210127060049.915-1-zuoqilin1@163.com Signed-off-by: zuoqilin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index ebcb27bda1f5..334f932e0b1b 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -566,6 +566,7 @@ estbalishment||establishment etsablishment||establishment etsbalishment||establishment evalution||evaluation +exeeds||exceeds excecutable||executable exceded||exceeded exceds||exceeds From 4945192325708bb5cc5cb5b07f288e118f5f65bd Mon Sep 17 00:00:00 2001 From: dingsenjie Date: Wed, 24 Feb 2021 12:00:21 -0800 Subject: [PATCH 004/172] scripts/spelling.txt: add "allocted" and "exeeds" typo Add the "allocted" and "exeeds" misspellings to the spell-check list.
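The three scripts/spelling.txt patches above all add entries in that file's one-pair-per-line format, "misspelling||correction", as the surrounding hunks show, for example:

	maping||mapping
	exeeds||exceeds

scripts/checkpatch.pl consults this list when scanning patches, so each added pair is one more typo the checker can flag automatically.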
Link: https://lkml.kernel.org/r/20210127081919.1928-1-dingsenjie@163.com Signed-off-by: dingsenjie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index 334f932e0b1b..a47006de990e 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -103,6 +103,7 @@ alloated||allocated allocatote||allocate allocatrd||allocated allocte||allocate +allocted||allocated allpication||application alocate||allocate alogirhtms||algorithms @@ -575,6 +576,7 @@ excellant||excellent execeeded||exceeded execeeds||exceeds exeed||exceed +exeeds||exceeds exeuction||execution existance||existence existant||existent From 30cdbd53921ff8c39c7c2e7a6318d964a0ae154c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 24 Feb 2021 12:00:24 -0800 Subject: [PATCH 005/172] scripts/spelling.txt: add more spellings to spelling.txt Here are some of the more common spelling mistakes and typos that I've found while fixing up spelling mistakes in the kernel since September 2020 Link: https://lkml.kernel.org/r/20210210124318.55082-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spelling.txt | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/spelling.txt b/scripts/spelling.txt index a47006de990e..2e3ba91a5072 100644 --- a/scripts/spelling.txt +++ b/scripts/spelling.txt @@ -340,6 +340,7 @@ comppatible||compatible compres||compress compresion||compression comression||compression +comunicate||communicate comunication||communication conbination||combination conditionaly||conditionally @@ -467,6 +468,7 @@ developpment||development deveolpment||development devided||divided deviece||device +devision||division diable||disable dicline||decline dictionnary||dictionary @@ -480,6 +482,7 @@ difinition||definition digial||digital dimention||dimension dimesions||dimensions +diconnected||disconnected disgest||digest dispalying||displaying diplay||display @@ -519,6 +522,7 @@ downlads||downloads droped||dropped droput||dropout druing||during +dyanmic||dynamic dynmaic||dynamic eanable||enable eanble||enable @@ -543,6 +547,7 @@ encrupted||encrypted encrypiton||encryption encryptio||encryption endianess||endianness +enpoint||endpoint enhaced||enhanced enlightnment||enlightenment enqueing||enqueuing @@ -644,6 +649,7 @@ forwardig||forwarding frambuffer||framebuffer framming||framing framwork||framework +frequence||frequency frequncy||frequency frequancy||frequency frome||from @@ -686,10 +692,12 @@ handfull||handful hanlde||handle hanled||handled happend||happened +hardare||hardware harware||hardware havind||having heirarchically||hierarchically helpfull||helpful +heterogenous||heterogeneous hexdecimal||hexadecimal hybernate||hibernate hierachy||hierarchy @@ -734,6 +742,7 @@ inconsistant||inconsistent increas||increase incremeted||incremented incrment||increment +incuding||including inculde||include indendation||indentation indended||intended @@ -744,6 +753,7 @@ indiate||indicate indicat||indicate inexpect||inexpected inferface||interface +infinit||infinite infomation||information informatiom||information informations||information @@ -774,6 +784,7 @@ instace||instance instal||install instanciate||instantiate instanciated||instantiated +instuments||instruments insufficent||insufficient inteface||interface integreated||integrated @@ -872,6 +883,7 @@ mailformed||malformed malplaced||misplaced malplace||misplace 
managable||manageable +managament||management managment||management mangement||management manger||manager @@ -890,6 +902,7 @@ meetign||meeting memeory||memory memmber||member memoery||memory +memroy||memory ment||meant mergable||mergeable mesage||message @@ -1003,6 +1016,7 @@ overlaping||overlapping overide||override overrided||overridden overriden||overridden +overrrun||overrun overun||overrun overwritting||overwriting overwriten||overwritten @@ -1039,6 +1053,7 @@ peforming||performing peice||piece pendantic||pedantic peprocessor||preprocessor +perfomance||performance perfoming||performing perfomring||performing periperal||peripheral @@ -1104,6 +1119,7 @@ prodecure||procedure progamming||programming progams||programs progess||progress +programable||programmable programers||programmers programm||program programms||programs @@ -1148,6 +1164,7 @@ recieved||received recieve||receive reciever||receiver recieves||receives +recieving||receiving recogniced||recognised recognizeable||recognizable recommanded||recommended @@ -1251,6 +1268,7 @@ searchs||searches secquence||sequence secund||second segement||segment +seleted||selected semaphone||semaphore senario||scenario senarios||scenarios @@ -1267,6 +1285,7 @@ seqeunce||sequence seqeuncer||sequencer seqeuencer||sequencer sequece||sequence +sequemce||sequence sequencial||sequential serivce||service serveral||several @@ -1337,6 +1356,7 @@ suble||subtle substract||subtract submited||submitted submition||submission +succeded||succeeded suceed||succeed succesfully||successfully succesful||successful @@ -1357,6 +1377,7 @@ supportin||supporting suppoted||supported suppported||supported suppport||support +supprot||support supress||suppress surpressed||suppressed surpresses||suppresses @@ -1405,6 +1426,7 @@ thresold||threshold throught||through trackling||tracking troughput||throughput +trys||tries thses||these tiggers||triggers tiggered||triggered @@ -1418,7 +1440,9 @@ traking||tracking tramsmitted||transmitted tramsmit||transmit tranasction||transaction +tranceiver||transceiver tranfer||transfer +tranmission||transmission transcevier||transceiver transciever||transceiver transferd||transferred @@ -1472,6 +1496,7 @@ unnecesary||unnecessary unneedingly||unnecessarily unnsupported||unsupported unmached||unmatched +unprecise||imprecise unregester||unregister unresgister||unregister unrgesiter||unregister @@ -1507,6 +1532,7 @@ varient||variant vaule||value verbse||verbose veify||verify +veriosn||version verisons||versions verison||version verson||version From 6bbf29010fa90a7ff22ff14e2875b4e6dea8d576 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 24 Feb 2021 12:00:27 -0800 Subject: [PATCH 006/172] ntfs: layout.h: delete duplicated words Drop the repeated words "the" and "in" in comments. Link: https://lkml.kernel.org/r/20210125194937.24627-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/layout.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 85422761ff43..5d4bf7a3259f 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -703,7 +703,7 @@ typedef struct { /* 14*/ le16 instance; /* The instance of this attribute record. This number is unique within this mft record (see MFT_RECORD/next_attribute_instance notes in - in mft.h for more details). */ + mft.h for more details). */ /* 16*/ union { /* Resident attributes. 
*/ struct { @@ -1838,7 +1838,7 @@ typedef struct { * Also, each security descriptor is stored twice in the $SDS stream with a * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * the first copy of the security descriptor will be at offset 0x51d0 in the + * first copy of the security descriptor will be at offset 0x51d0 in the * $SDS data stream and the second copy will be at offset 0x451d0. */ typedef struct { From 4dfe6bd94959222e18d512bdf15f6bf9edb9c27c Mon Sep 17 00:00:00 2001 From: Rustam Kovhaev Date: Wed, 24 Feb 2021 12:00:30 -0800 Subject: [PATCH 007/172] ntfs: check for valid standard information attribute Mounting a corrupted filesystem with NTFS resulted in a kernel crash. We should check for a valid STANDARD_INFORMATION attribute offset and length before trying to access it. Link: https://lkml.kernel.org/r/20210217155930.1506815-1-rkovhaev@gmail.com Link: https://syzkaller.appspot.com/bug?extid=c584225dabdea2f71969 Signed-off-by: Rustam Kovhaev Reported-by: syzbot+c584225dabdea2f71969@syzkaller.appspotmail.com Tested-by: syzbot+c584225dabdea2f71969@syzkaller.appspotmail.com Acked-by: Anton Altaparmakov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 4435dbbc0b63..f5c058b3192c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -629,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi) } a = ctx->attr; /* Get the standard information attribute value. */ + if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu(a->data.resident.value_length) > + (u8 *)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); + goto unm_err_out; + } si = (STANDARD_INFORMATION*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); From 6efb59499aff080e6a9f1485ff968918c30c5b0c Mon Sep 17 00:00:00 2001 From: Yi Li Date: Wed, 24 Feb 2021 12:00:34 -0800 Subject: [PATCH 008/172] ocfs2: remove redundant conditional before iput iput handles NULL pointers gracefully, so there's no need to check the pointer before the call. Link: https://lkml.kernel.org/r/20201231040535.4091761-1-yili@winhong.com Signed-off-by: Yi Li Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2febc76e9de7..079f8826993e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -973,8 +973,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) * quota files */ dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (!inode) - continue; iput(inode); } } From 95e126d650391696f7ba8d318634cc018df10ef9 Mon Sep 17 00:00:00 2001 From: guozh Date: Wed, 24 Feb 2021 12:00:38 -0800 Subject: [PATCH 009/172] ocfs2: clean up some definitions which are not used any more There are some definitions in the OCFS2 module which are not used any more, so remove them.
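The bounds check added by "ntfs: check for valid standard information attribute" above, restated outside the diff context as a sketch (same identifiers as the fs/ntfs/inode.c hunk; the rec_end/val temporaries are introduced here purely for readability):

	/* The on-disk offset/length are untrusted: keep them inside the MFT record. */
	u8 *rec_end = (u8 *)ctx->mrec + vol->mft_record_size;
	u8 *val = (u8 *)a + le16_to_cpu(a->data.resident.value_offset);

	if (val + le32_to_cpu(a->data.resident.value_length) > rec_end) {
		ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
		goto unm_err_out;	/* bail out before dereferencing the value */
	}
	si = (STANDARD_INFORMATION *)val;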
Link: https://lkml.kernel.org/r/2021011916182284700534@chinatelecom.cn Signed-off-by: Guozhonghua Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmast.c | 10 ---------- fs/ocfs2/dlm/dlmcommon.h | 4 ---- 2 files changed, 14 deletions(-) diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 6abaded3ff6b..70a10764f249 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -165,16 +165,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) spin_unlock(&lock->spinlock); } -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) -{ - BUG_ON(!dlm); - BUG_ON(!lock); - - spin_lock(&dlm->ast_lock); - __dlm_queue_bast(dlm, lock); - spin_unlock(&dlm->ast_lock); -} - static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index c8a444622faa..58d57e25d384 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -17,10 +17,7 @@ #define DLM_LOCKID_NAME_MAX 32 -#define DLM_DOMAIN_NAME_MAX_LEN 255 #define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES -#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes -#define DLM_THREAD_MS 200 // flush at least every 200 ms #define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE @@ -902,7 +899,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, From c57d117f2b2f2a19b570c36f2819ef8d8210af20 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 24 Feb 2021 12:00:41 -0800 Subject: [PATCH 010/172] ocfs2: fix a use after free on error The error handling in this function frees "reg" but it is still on the "o2hb_all_regions" list, so it will lead to a use-after-free.
Joseph Qi points out that we need to clear the bit in the "o2hb_region_bitmap" as well. Link: https://lkml.kernel.org/r/YBk4M6HUG8jB/jc7@mwanda Fixes: 1cf257f51191 ("ocfs2: fix memory leak") Signed-off-by: Dan Carpenter Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0179a73a3fa2..12a7590601dd 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2042,7 +2042,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g o2hb_nego_timeout_handler, reg, NULL, &reg->hr_handler_list); if (ret) - goto free; + goto remove_item; ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, sizeof(struct o2hb_nego_msg), @@ -2057,6 +2057,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g unregister_handler: o2net_unregister_handler_list(&reg->hr_handler_list); +remove_item: + spin_lock(&o2hb_live_lock); + list_del(&reg->hr_all_item); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + spin_unlock(&o2hb_live_lock); free: kfree(reg); return ERR_PTR(ret); From 7c908aec34733408baa755613141a08b960d8eec Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 24 Feb 2021 12:00:45 -0800 Subject: [PATCH 011/172] ocfs2: simplify the calculation of variables Fix the following coccicheck warnings: fs/ocfs2/refcounttree.c:981:16-18: WARNING !A || A && B is equivalent to !A || B. Link: https://lkml.kernel.org/r/1612235424-80367-1-git-send-email-jiapeng.chong@linux.alibaba.com Signed-off-by: Jiapeng Chong Reported-by: Abaci Robot Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index c26937824be1..c19a463fac55 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -978,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, return 0; } - if (!eb || (eb && !eb->h_next_leaf_blk)) { + if (!eb || !eb->h_next_leaf_blk) { /* * We are the last extent rec, so any high cpos should * be stored in this leaf refcount block. From 3d742d4b6ebb3348e1d478047cfb18b9b337b8df Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 24 Feb 2021 12:00:48 -0800 Subject: [PATCH 012/172] fs: delete repeated words in comments Delete duplicate words in fs/*.c. The doubled words that are being dropped are: that, be, the, in, and, for. Link: https://lkml.kernel.org/r/20201224052810.25315-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Matthew Wilcox (Oracle) Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- fs/dcache.c | 4 ++-- fs/direct-io.c | 4 ++-- fs/exec.c | 4 ++-- fs/fhandle.c | 2 +- fs/pipe.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index ec26179c8062..91ff9bfd7c1a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1270,7 +1270,7 @@ rescan: return ret; } /* - * Only exported for for loop and dasd for historic reasons. Don't use in new + * Only exported for loop and dasd for historic reasons.
Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); diff --git a/fs/dcache.c b/fs/dcache.c index 799d9e4f0bcd..7d24ff7eb206 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2176,8 +2176,8 @@ EXPORT_SYMBOL(d_obtain_root); * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * - * For a case-insensitive lookup match and if the the case-exact dentry - * already exists in in the dcache, use it and return it. + * For a case-insensitive lookup match and if the case-exact dentry + * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. diff --git a/fs/direct-io.c b/fs/direct-io.c index aa1083ecd623..0957e1bb8eb2 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -462,7 +462,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) * Wait for the next BIO to complete. Remove it and return it. NULL is * returned once all BIOs have been completed. This must only be called once * all bios have been issued so that dio->refcount can only decrease. This - * requires that that the caller hold a reference on the dio. + * requires that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { @@ -1279,7 +1279,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (retval == -ENOTBLK) { /* * The remaining part of the request will be - * be handled by buffered I/O when we return + * handled by buffered I/O when we return */ retval = 0; } diff --git a/fs/exec.c b/fs/exec.c index 6f3c02066ce3..18594f11c31f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1455,7 +1455,7 @@ EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. - * Or, if exec fails before, free_bprm() should release ->cred and + * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) @@ -1841,7 +1841,7 @@ static int bprm_execve(struct linux_binprm *bprm, out: /* - * If past the point of no return ensure the the code never + * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0..ec6feeccc276 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -173,7 +173,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, /* * With handle we don't look at the execute bit on the - * the directory. Ideally we would like CAP_DAC_SEARCH. + * directory. Ideally we would like CAP_DAC_SEARCH. * But we don't have that */ if (!capable(CAP_DAC_READ_SEARCH)) { diff --git a/fs/pipe.c b/fs/pipe.c index 39c96845a72f..bfd946a9ad01 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); * * Description: * This function grabs an extra reference to @buf. It's used in - * in the tee() system call, when we duplicate the buffers in one + * the tee() system call, when we duplicate the buffers in one * pipe into another. 
*/ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) From 93da400397445f1110b394caab5558d13971378e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 24 Feb 2021 12:00:51 -0800 Subject: [PATCH 013/172] ramfs: support O_TMPFILE [akpm@linux-foundation.org: update inode_operations.tmpfile] Link: http://lkml.kernel.org/r/20190206073349.GA15311@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Christian Brauner Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 3c2658c8fde0..9ebd17d7befb 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -151,6 +151,18 @@ static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, return error; } +static int ramfs_tmpfile(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOSPC; + d_tmpfile(dentry, inode); + return 0; +} + static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, @@ -161,6 +173,7 @@ static const struct inode_operations ramfs_dir_inode_operations = { .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, + .tmpfile = ramfs_tmpfile, }; /* From 3544de8ee6e4817278b15fe08658de49abf58954 Mon Sep 17 00:00:00 2001 From: Jacob Wen Date: Wed, 24 Feb 2021 12:00:55 -0800 Subject: [PATCH 014/172] mm, tracing: record slab name for kmem_cache_free() Currently, a trace record generated by the RCU core is as below. ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f3b49a66 It doesn't tell us what the RCU core has freed. This patch adds the slab name to trace_kmem_cache_free(). The new format is as follows. ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000037f79c8d name=dentry ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=00000000f78cb7b5 name=sock_inode_cache ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=0000000018768985 name=pool_workqueue ... kmem_cache_free: call_site=rcu_core+0x1fd/0x610 ptr=000000006a6cb484 name=radix_tree_node We can use it to understand what the RCU core is going to free. For example, some users may be interested in when the RCU core starts freeing reclaimable slabs like dentry to reduce memory pressure. Link: https://lkml.kernel.org/r/20201216072804.8838-1-jian.w.wen@oracle.com Signed-off-by: Jacob Wen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Steven Rostedt Cc: "Paul E.
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 24 ++++++++++++++++-------- mm/slab.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index f65b1f6db22d..40845b0d5dad 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -115,7 +115,7 @@ DEFINE_EVENT(kmem_alloc_node, kmem_cache_alloc_node, TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node) ); -DECLARE_EVENT_CLASS(kmem_free, +TRACE_EVENT(kfree, TP_PROTO(unsigned long call_site, const void *ptr), @@ -135,18 +135,26 @@ DECLARE_EVENT_CLASS(kmem_free, (void *)__entry->call_site, __entry->ptr) ); -DEFINE_EVENT(kmem_free, kfree, +TRACE_EVENT(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), + TP_PROTO(unsigned long call_site, const void *ptr, const char *name), - TP_ARGS(call_site, ptr) -); + TP_ARGS(call_site, ptr, name), -DEFINE_EVENT(kmem_free, kmem_cache_free, + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( const char *, name ) + ), - TP_PROTO(unsigned long call_site, const void *ptr), + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->name = name; + ), - TP_ARGS(call_site, ptr) + TP_printk("call_site=%pS ptr=%p name=%s", + (void *)__entry->call_site, __entry->ptr, __entry->name) ); TRACE_EVENT(mm_page_free, diff --git a/mm/slab.c b/mm/slab.c index dcc55e78f353..2ed27849d1f2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3717,7 +3717,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); - trace_kmem_cache_free(_RET_IP_, objp); + trace_kmem_cache_free(_RET_IP_, objp, cachep->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slob.c b/mm/slob.c index ef87ada8705d..0578429b991b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -673,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) __kmem_cache_free(b, c->size); } - trace_kmem_cache_free(_RET_IP_, b); + trace_kmem_cache_free(_RET_IP_, b, c->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index f5baf429654f..f4dc70228861 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3157,7 +3157,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); - trace_kmem_cache_free(_RET_IP_, x); + trace_kmem_cache_free(_RET_IP_, x, s->name); } EXPORT_SYMBOL(kmem_cache_free); From 3754000872188e3e4713d9d847fe3c615a47c220 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 24 Feb 2021 12:00:58 -0800 Subject: [PATCH 015/172] mm/sl?b.c: remove ctor argument from kmem_cache_flags This argument hasn't been used since e153362a50a3 ("slub: Remove objsize check in kmem_cache_flags()") so simply remove it. 
Link: https://lkml.kernel.org/r/20210126095733.974665-1-nborisov@suse.com Signed-off-by: Nikolay Borisov Reviewed-by: Miaohe Lin Reviewed-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 3 +-- mm/slab.h | 6 ++---- mm/slab_common.c | 2 +- mm/slub.c | 9 +++------ 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 2ed27849d1f2..9982ff8ff175 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1790,8 +1790,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } diff --git a/mm/slab.h b/mm/slab.h index ecad9b57bc44..48dc466a3791 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -110,8 +110,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)); + slab_flags_t flags, const char *name); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, { return NULL; } static inline slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } diff --git a/mm/slab_common.c b/mm/slab_common.c index adbace4256ef..580124b7fed8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -197,7 +197,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); + flags = kmem_cache_flags(size, flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; diff --git a/mm/slub.c b/mm/slub.c index f4dc70228861..5d8d5d21e5d3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1400,7 +1400,6 @@ __setup("slub_debug", setup_slub_debug); * @object_size: the size of an object without meta data * @flags: flags to set * @name: name of the cache - * @ctor: constructor function * * Debug option(s) are applied to @flags. In addition to the debug * option(s), if a slab name (or multiple) is specified i.e. * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ... * then only the select slabs will receive the debug option(s).
*/ slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { char *iter; size_t len; @@ -1474,8 +1472,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } @@ -3797,7 +3794,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { - s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif From 0b41163407e2f3f44d6ed455ebfb1534df23f4a6 Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Wed, 24 Feb 2021 12:01:01 -0800 Subject: [PATCH 016/172] mm/slab: minor coding style tweaks Fix some coding style issues, improve code reading. Adds whitespace to clearly separate the parameters. Link: https://lkml.kernel.org/r/1612841499-32166-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 9982ff8ff175..cdad64b2836d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -272,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if ((x)->num_active > (x)->high_mark) \ @@ -296,7 +296,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) +#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -332,7 +332,7 @@ static int obj_offset(struct kmem_cache *cachep) static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long long*) (objp + obj_offset(cachep) - + return (unsigned long long *) (objp + obj_offset(cachep) - sizeof(unsigned long long)); } @@ -580,7 +580,7 @@ static int transfer_objects(struct array_cache *to, if (!nr) return 0; - memcpy(to->entry + to->avail, from->entry + from->avail -nr, + memcpy(to->entry + to->avail, from->entry + from->avail - nr, sizeof(void *) *nr); from->avail -= nr; @@ -2737,7 +2737,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #else #define kfree_debugcheck(x) do { } while(0) -#define cache_free_debugcheck(x,objp,z) (objp) +#define cache_free_debugcheck(x, objp, z) (objp) #endif static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, @@ -3024,7 +3024,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; } #else -#define 
cache_alloc_debugcheck_after(a, b, objp, d) (objp) #endif static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) From ca220593208d8c433a761738461c31b1bf0be1f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 24 Feb 2021 12:01:04 -0800 Subject: [PATCH 017/172] mm/slub: disable user tracing for kmemleak caches by default If kmemleak is enabled, it uses a kmem cache for its own objects. These objects are used to hold information kmemleak uses, including a stack trace. If slub_debug is also turned on, each of them has *another* stack trace, so the overhead adds up, and on my tests (on ARCH=um, admittedly) 2/3rds of the allocations end up doing the stack tracing. Turn off SLAB_STORE_USER if SLAB_NOLEAKTRACE was given, to avoid storing essentially the same data twice. Link: https://lkml.kernel.org/r/20210113215114.d94efa13ba30.I117b6764e725b3192318bbcf4269b13b709539ae@changeid Signed-off-by: Johannes Berg Acked-by: David Rientjes Acked-by: Catalin Marinas Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 5d8d5d21e5d3..a7c453bf7156 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1413,6 +1413,15 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, size_t len; char *next_block; slab_flags_t block_flags; + slab_flags_t slub_debug_local = slub_debug; + + /* + * If the slab cache is for debugging (e.g. kmemleak) then + * don't store user (stack trace) information by default, + * but let the user enable it via the command line below. + */ + if (flags & SLAB_NOLEAKTRACE) + slub_debug_local &= ~SLAB_STORE_USER; len = strlen(name); next_block = slub_debug_string; @@ -1447,7 +1456,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, } } - return flags | slub_debug; + return flags | slub_debug_local; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, From 666716fd267df0007dfbb6480cd79dd5b05da4cc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:01:08 -0800 Subject: [PATCH 018/172] mm, slub: stop freeing kmem_cache_node structures on node offline Patch series "mm, slab, slub: remove cpu and memory hotplug locks". Some related work caused me to look at how we use get/put_online_mems() and get/put_online_cpus() during kmem cache creation/destruction/shrinking, and realize that it should be actually safe to remove all of that with rather small effort (as e.g. Michal Hocko suspected in some of the past discussions already). This has the benefit of avoiding rather heavy locks that have caused locking order issues already in the past. So this is the result. Patches 2 and 3 remove memory hotplug and cpu hotplug locking, respectively. Patch 1 is due to the realization that in fact some races exist despite the locks (even if not removed), but the most sane solution is not to introduce more of them, but rather accept some wasted memory in scenarios that should be rare anyway (full memory hot remove), as we do the same in other contexts already. This patch (of 3): Commit e4f8e513c3d3 ("mm/slub: fix a deadlock in show_slab_objects()") has fixed a problematic locking order by removing the memory hotplug lock get/put_online_mems() from show_slab_objects().
During the discussion, it was argued [1] that this is OK, because existing slabs on the node would prevent a hotremove from proceeding. That's true, but per-node kmem_cache_node structures are not necessarily allocated on the same node and may exist even without actual slab pages on the same node. Any path that uses get_node() directly or via for_each_kmem_cache_node() (such as show_slab_objects()) can race with freeing of kmem_cache_node even with the !NULL check, resulting in use-after-free. To that end, commit e4f8e513c3d3 argues in a comment that: * We don't really need mem_hotplug_lock (to hold off * slab_mem_going_offline_callback) here because slab's memory hot * unplug code doesn't destroy the kmem_cache->node[] data. While it's true that slab_mem_going_offline_callback() doesn't free the kmem_cache_node, the later callback slab_mem_offline_callback() actually does, so the race and use-after-free exist. Not just for show_slab_objects() after commit e4f8e513c3d3, but also in many other places that are not under slab_mutex. And adding slab_mutex locking or other synchronization to SLUB paths such as get_any_partial() would be bad for performance and error-prone. The easiest solution is therefore to make the abovementioned comment true and stop freeing the kmem_cache_node structures, accepting some wasted memory in the full memory node removal scenario. Analogously, we also don't free the hotremoved pgdat as mentioned in [1], nor the similar per-node structures in SLAB. Importantly, this approach will not block the hotremove, as generally such nodes should be movable for hotremove to succeed in the first place, and thus the GFP_KERNEL allocated kmem_cache_node will come from elsewhere. [1] https://lore.kernel.org/linux-mm/20190924151147.GB23050@dhcp22.suse.cz/ Link: https://lkml.kernel.org/r/20210113131634.3671-1-vbabka@suse.cz Link: https://lkml.kernel.org/r/20210113131634.3671-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Cc: Qian Cai Cc: David Hildenbrand Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a7c453bf7156..1ecd8a7bd25c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4273,8 +4273,6 @@ static int slab_mem_going_offline_callback(void *arg) static void slab_mem_offline_callback(void *arg) { - struct kmem_cache_node *n; - struct kmem_cache *s; struct memory_notify *marg = arg; int offline_node; @@ -4288,21 +4286,11 @@ static void slab_mem_offline_callback(void *arg) return; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - n = get_node(s, offline_node); - if (n) { - /* - * if n->nr_slabs > 0, slabs still exist on the node - * that is going down. We were unable to free them, - * and offline_pages() function shouldn't call this - * callback. So, we must fail. - */ - BUG_ON(slabs_node(s, offline_node)); - - s->node[offline_node] = NULL; - kmem_cache_free(kmem_cache_node, n); - } - } + /* + * We no longer free kmem_cache_node structures here, as it would be + * racy with all get_node() users, and infeasible to protect them with + * slab_mutex.
+ */ mutex_unlock(&slab_mutex); } @@ -4328,6 +4316,12 @@ static int slab_mem_going_online_callback(void *arg) */ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { + /* + * The structure may already exist if the node was previously + * onlined and offlined. + */ + if (get_node(s, nid)) + continue; /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that From 7e1fa93deff44677a94dfc323ff629bbf5cf9360 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:01:12 -0800 Subject: [PATCH 019/172] mm, slab, slub: stop taking memory hotplug lock Since commit 03afc0e25f7f ("slab: get_online_mems for kmem_cache_{create,destroy,shrink}") we are taking the memory hotplug lock for SLAB and SLUB when creating, destroying or shrinking a cache. It is quite a heavy lock and it's best to avoid it if possible, as we had several issues with lockdep complaining about ordering in the past, see e.g. e4f8e513c3d3 ("mm/slub: fix a deadlock in show_slab_objects()"). The problem scenario in 03afc0e25f7f (solved by the memory hotplug lock) can be summarized as follows: while there's slab_mutex synchronizing new kmem cache creation and SLUB's MEM_GOING_ONLINE callback slab_mem_going_online_callback(), we may miss creation of kmem_cache_node for the hotplugged node in the new kmem cache, because the hotplug callback doesn't yet see the new cache, and cache creation in init_kmem_cache_nodes() only inits kmem_cache_node for nodes in the N_NORMAL_MEMORY nodemask, which however may not yet include the new node, as that happens only later after the MEM_GOING_ONLINE callback. Instead of using get/put_online_mems(), the problem can be solved by SLUB maintaining its own nodemask of nodes for which it has allocated the per-node kmem_cache_node structures. This nodemask would generally mirror the N_NORMAL_MEMORY nodemask, but would be updated only under SLUB's control, in its memory hotplug callbacks under the slab_mutex. This patch adds such a nodemask and its handling. Commit 03afc0e25f7f mentions "issues like [the one above]", but there don't appear to be further issues. All the paths (shared for SLAB and SLUB) taking the memory hotplug locks are also taking the slab_mutex, except kmem_cache_shrink() where 03afc0e25f7f replaced slab_mutex with get/put_online_mems(). We however cannot simply restore slab_mutex in kmem_cache_shrink(), as SLUB can enter the function from a write to the sysfs 'shrink' file, thus holding the kernfs lock, and in kmem_cache_create() the kernfs lock is nested within slab_mutex. But on closer inspection we don't actually need to protect kmem_cache_shrink() from hotplug callbacks: while SLUB's __kmem_cache_shrink() does for_each_kmem_cache_node(), missing a new node added in parallel hotplug is not fatal, and parallel hotremove does not free kmem_cache_node's anymore after the previous patch, so use-after-free cannot happen. The per-node shrinking itself is protected by n->list_lock. The same is true for SLAB, and SLOB is a no-op. SLAB also doesn't need the memory hotplug locking, which it only gained by 03afc0e25f7f through the shared paths in slab_common.c. Its memory hotplug callbacks are also protected by slab_mutex against races with these paths. The problem of SLUB relying on N_NORMAL_MEMORY doesn't apply to SLAB, as its setup_kmem_cache_nodes() relies on N_ONLINE, and the new node is already set there during the MEM_GOING_ONLINE callback, so no special care is needed for SLAB.
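Condensed from the mm/slub.c hunks of this patch (a reading aid only; the identifiers are those used in the diff), the new nodemask and its three touch points are roughly:

	static nodemask_t slab_nodes;	/* nodes with kmem_cache_node structures allocated */

	/* hotplug callbacks, both under slab_mutex: */
	node_set(nid, slab_nodes);		/* slab_mem_going_online_callback() */
	node_clear(offline_node, slab_nodes);	/* slab_mem_offline_callback() */

	/* the allocation slow path falls back when a node has no structures yet: */
	if (unlikely(node != NUMA_NO_NODE && !node_isset(node, slab_nodes)))
		node = NUMA_NO_NODE;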
As such, this patch removes all get/put_online_mems() usage by the slab subsystem. Link: https://lkml.kernel.org/r/20210113131634.3671-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Pekka Enberg Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 8 ++------ mm/slub.c | 28 +++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 580124b7fed8..d73bea53f759 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -310,7 +310,6 @@ kmem_cache_create_usercopy(const char *name, int err; get_online_cpus(); - get_online_mems(); mutex_lock(&slab_mutex); @@ -360,7 +359,6 @@ kmem_cache_create_usercopy(const char *name, out_unlock: mutex_unlock(&slab_mutex); - put_online_mems(); put_online_cpus(); if (err) { @@ -487,7 +485,6 @@ void kmem_cache_destroy(struct kmem_cache *s) return; get_online_cpus(); - get_online_mems(); mutex_lock(&slab_mutex); @@ -504,7 +501,6 @@ void kmem_cache_destroy(struct kmem_cache *s) out_unlock: mutex_unlock(&slab_mutex); - put_online_mems(); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -523,10 +519,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep) int ret; get_online_cpus(); - get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep); - put_online_mems(); + put_online_cpus(); return ret; } diff --git a/mm/slub.c b/mm/slub.c index 1ecd8a7bd25c..175cd905b58a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -235,6 +235,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +/* + * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. + * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily + * differ during memory hotplug/hotremove operations. + * Protected by slab_mutex. + */ +static nodemask_t slab_nodes; + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -2678,7 +2686,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * ignore the node constraint */ if (unlikely(node != NUMA_NO_NODE && - !node_state(node, N_NORMAL_MEMORY))) + !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; goto new_slab; } @@ -2689,7 +2697,7 @@ redo: * same as above but node_match() being false already * implies node != NUMA_NO_NODE */ - if (!node_state(node, N_NORMAL_MEMORY)) { + if (!node_isset(node, slab_nodes)) { node = NUMA_NO_NODE; goto redo; } else { @@ -3592,7 +3600,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; if (slab_state == DOWN) { @@ -4286,6 +4294,7 @@ static void slab_mem_offline_callback(void *arg) return; mutex_lock(&slab_mutex); + node_clear(offline_node, slab_nodes); /* * We no longer free kmem_cache_node structures here, as it would be * racy with all get_node() users, and infeasible to protect them with @@ -4335,6 +4344,11 @@ static int slab_mem_going_online_callback(void *arg) init_kmem_cache_node(n); s->node[nid] = n; } + /* + * Any cache created after this point will also have kmem_cache_node + * initialized for the new node. 
+ */ + node_set(nid, slab_nodes); out: mutex_unlock(&slab_mutex); return ret; @@ -4415,6 +4429,7 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; + int node; if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4422,6 +4437,13 @@ void __init kmem_cache_init(void) kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; + /* + * Initialize the nodemask for which we will allocate per node + * structures. Here we don't need taking slab_mutex yet. + */ + for_each_node_state(node, N_NORMAL_MEMORY) + node_set(node, slab_nodes); + create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); From 59450bbc12bee1c4e5dd25e6aa5d6a45a7bd6e81 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:01:15 -0800 Subject: [PATCH 020/172] mm, slab, slub: stop taking cpu hotplug lock SLAB has been using get/put_online_cpus() around creating, destroying and shrinking kmem caches since 95402b382901 ("cpu-hotplug: replace per-subsystem mutexes with get_online_cpus()") in 2008, which is supposed to be replacing a private mutex (cache_chain_mutex, called slab_mutex today) with system-wide mechanism, but in case of SLAB it's in fact used in addition to the existing mutex, without explanation why. SLUB appears to have avoided the cpu hotplug lock initially, but gained it due to common code unification, such as 20cea9683ecc ("mm, sl[aou]b: Move kmem_cache_create mutex handling to common code"). Regardless of the history, checking if the hotplug lock is actually needed today suggests that it's not, and therefore it's better to avoid this system-wide lock and the ordering this imposes wrt other locks (such as slab_mutex). Specifically, in SLAB we have for_each_online_cpu() in do_tune_cpucache() protected by slab_mutex, and cpu hotplug callbacks that also take the slab_mutex, which is also taken by the common slab function that currently also take the hotplug lock. Thus the slab_mutex protection should be sufficient. Also per-cpu array caches are allocated for each possible cpu, so not affected by their online/offline state. In SLUB we have for_each_online_cpu() in functions that show statistics and are already unprotected today, as racing with hotplug is not harmful. Otherwise SLUB relies on percpu allocator. The slub_cpu_dead() hotplug callback takes the slab_mutex. To sum up, this patch removes get/put_online_cpus() calls from slab as it should be safe without further adjustments. Link: https://lkml.kernel.org/r/20210113131634.3671-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Christoph Lameter Cc: David Hildenbrand Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Cc: Pekka Enberg Cc: Qian Cai Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index d73bea53f759..9633fa1bbbfd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -309,8 +309,6 @@ kmem_cache_create_usercopy(const char *name, const char *cache_name; int err; - get_online_cpus(); - mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); @@ -359,8 +357,6 @@ kmem_cache_create_usercopy(const char *name, out_unlock: mutex_unlock(&slab_mutex); - put_online_cpus(); - if (err) { if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n", @@ -484,8 +480,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; - get_online_cpus(); - mutex_lock(&slab_mutex); s->refcount--; @@ -500,8 +494,6 @@ void kmem_cache_destroy(struct kmem_cache *s) } out_unlock: mutex_unlock(&slab_mutex); - - put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -518,12 +510,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep) { int ret; - get_online_cpus(); kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep); - put_online_cpus(); return ret; } EXPORT_SYMBOL(kmem_cache_shrink); From d930ff03c4d12621443f2d1c56d2f80745469021 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:01:19 -0800 Subject: [PATCH 021/172] mm, slub: splice cpu and page freelists in deactivate_slab() In deactivate_slab() we currently move all but one objects on the cpu freelist to the page freelist one by one using the costly cmpxchg_double() operation. Then we unfreeze the page while moving the last object on page freelist, with a final cmpxchg_double(). This can be optimized to avoid the cmpxchg_double() per object. Just count the objects on cpu freelist (to adjust page->inuse properly) and also remember the last object in the chain. Then splice page->freelist to the last object and effectively add the whole cpu freelist to page->freelist while unfreezing the page, with a single cmpxchg_double(). Link: https://lkml.kernel.org/r/20210115183543.15097-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Jann Horn Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 59 ++++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 175cd905b58a..13e5821ce3e2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2167,9 +2167,9 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - int lock = 0; + int lock = 0, free_delta = 0; enum slab_modes l = M_NONE, m = M_NONE; - void *nextfree; + void *nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; @@ -2180,45 +2180,34 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, } /* - * Stage one: Free all available per cpu objects back - * to the page freelist while it is still frozen. Leave the - * last one. - * - * There is no need to take the list->lock because the page - * is still frozen. + * Stage one: Count the objects on cpu's freelist as free_delta and + * remember the last object in freelist_tail for later splicing. */ - while (freelist && (nextfree = get_freepointer(s, freelist))) { - void *prior; - unsigned long counters; + freelist_tail = NULL; + freelist_iter = freelist; + while (freelist_iter) { + nextfree = get_freepointer(s, freelist_iter); /* * If 'nextfree' is invalid, it is possible that the object at - * 'freelist' is already corrupted. So isolate all objects - * starting at 'freelist'. + * 'freelist_iter' is already corrupted. So isolate all objects + * starting at 'freelist_iter' by skipping them. 
*/ - if (freelist_corrupted(s, page, &freelist, nextfree)) + if (freelist_corrupted(s, page, &freelist_iter, nextfree)) break; - do { - prior = page->freelist; - counters = page->counters; - set_freepointer(s, freelist, prior); - new.counters = counters; - new.inuse--; - VM_BUG_ON(!new.frozen); + freelist_tail = freelist_iter; + free_delta++; - } while (!__cmpxchg_double_slab(s, page, - prior, counters, - freelist, new.counters, - "drain percpu freelist")); - - freelist = nextfree; + freelist_iter = nextfree; } /* - * Stage two: Ensure that the page is unfrozen while the - * list presence reflects the actual number of objects - * during unfreeze. + * Stage two: Unfreeze the page while splicing the per-cpu + * freelist to the head of page's freelist. + * + * Ensure that the page is unfrozen while the list presence + * reflects the actual number of objects during unfreeze. * * We setup the list membership and then perform a cmpxchg * with the count. If there is a mismatch then the page @@ -2231,15 +2220,15 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, */ redo: - old.freelist = page->freelist; - old.counters = page->counters; + old.freelist = READ_ONCE(page->freelist); + old.counters = READ_ONCE(page->counters); VM_BUG_ON(!old.frozen); /* Determine target state of the slab */ new.counters = old.counters; - if (freelist) { - new.inuse--; - set_freepointer(s, freelist, old.freelist); + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); new.freelist = freelist; } else new.freelist = old.freelist; From fe2cce15d6821aea1766708a1cf031071cec815f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:01:22 -0800 Subject: [PATCH 022/172] mm, slub: remove slub_memcg_sysfs boot param and CONFIG_SLUB_MEMCG_SYSFS_ON The boot param and config determine the value of memcg_sysfs_enabled, which is unused since commit 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for all allocations") as there are no per-memcg kmem caches anymore. Link: https://lkml.kernel.org/r/20210127124745.7928-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: David Hildenbrand Acked-by: Roman Gushchin Acked-by: David Rientjes Reviewed-by: Miaohe Lin Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 8 -------- init/Kconfig | 14 -------------- mm/slub.c | 16 ---------------- 3 files changed, 38 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0ac883777318..19cfa8b9eb60 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4899,14 +4899,6 @@ last alloc / free. For more information see Documentation/vm/slub.rst. - slub_memcg_sysfs= [MM, SLUB] - Determines whether to enable sysfs directories for - memory cgroup sub-caches. 1 to enable, 0 to disable. - The default is determined by CONFIG_SLUB_MEMCG_SYSFS_ON. - Enabling this can lead to a very high number of debug - directories and files being created under - /sys/kernel/slub. - slub_max_order= [MM, SLUB] Determines the maximum allowed order for slabs. A high setting may cause OOMs due to memory diff --git a/init/Kconfig b/init/Kconfig index ba8bd5256980..8f3a6c4fc0b4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1861,20 +1861,6 @@ config SLUB_DEBUG SLUB sysfs support. 
/sys/slab will not exist and there will be no support for cache validation etc. -config SLUB_MEMCG_SYSFS_ON - default n - bool "Enable memcg SLUB sysfs support by default" if EXPERT - depends on SLUB && SYSFS && MEMCG - help - SLUB creates a directory under /sys/kernel/slab for each - allocation cache to host info and debug files. If memory - cgroup is enabled, each cache can have per memory cgroup - caches. SLUB can create the same sysfs directories for these - caches under /sys/kernel/slab/CACHE/cgroup but it can lead - to a very high number of debug files being created. This is - controlled by slub_memcg_sysfs boot parameter and this - config option determines the parameter's default value. - config COMPAT_BRK bool "Disable heap randomization" default y diff --git a/mm/slub.c b/mm/slub.c index 13e5821ce3e2..49d10e0afb76 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4943,22 +4943,6 @@ enum slab_stat_type { #define SO_OBJECTS (1 << SL_OBJECTS) #define SO_TOTAL (1 << SL_TOTAL) -#ifdef CONFIG_MEMCG -static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); - -static int __init setup_slub_memcg_sysfs(char *str) -{ - int v; - - if (get_option(&str, &v) > 0) - memcg_sysfs_enabled = v; - - return 1; -} - -__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); -#endif - static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { From 457c82c3516d56fc52b6b7518b0bce14b7809a3b Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Wed, 24 Feb 2021 12:01:26 -0800 Subject: [PATCH 023/172] mm/slub: minor coding style tweaks Add whitespace to fix coding style issues and improve code readability. Link: https://lkml.kernel.org/r/1612847403-5594-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 49d10e0afb76..e0fef8f8d4bf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3269,7 +3269,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.page) continue; - slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); From 91f5345afbc6b58d79b5c5d0bc915fa83e9d238e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:01:29 -0800 Subject: [PATCH 024/172] mm/debug: improve memcg debugging The memcg_data is only valid on the head page, not the tail pages. Change the format and location of the printout within the dump to match the other parts of struct page better.
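As a minimal sketch of the new layout (using the same names as the hunk below; head is the compound_head() of the page being dumped):

	struct page *head = compound_head(page);	/* tail pages carry no memcg_data */
#ifdef CONFIG_MEMCG
	if (head->memcg_data)
		pr_warn("memcg:%lx\n", head->memcg_data);
#endif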
Link: https://lkml.kernel.org/r/20210114190200.1894484-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index 8a40b3fefbeb..0bdda8407f71 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,6 +110,11 @@ void __dump_page(struct page *page, const char *reason) head_compound_mapcount(head)); } } + +#ifdef CONFIG_MEMCG + if (head->memcg_data) + pr_warn("memcg:%lx\n", head->memcg_data); +#endif if (PageKsm(page)) type = "ksm "; else if (PageAnon(page)) @@ -180,11 +185,6 @@ hex_only: if (reason) pr_warn("page dumped because: %s\n", reason); - -#ifdef CONFIG_MEMCG - if (!page_poisoned && page->memcg_data) - pr_warn("pages's memcg:%lx\n", page->memcg_data); -#endif } void dump_page(struct page *page, const char *reason) From bb5c47ced46797409f4791d0380db3116d93134c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Feb 2021 12:01:32 -0800 Subject: [PATCH 025/172] mm/debug_vm_pgtable/basic: add validation for dirtiness after write protect Patch series "mm/debug_vm_pgtable: Some minor updates", v3. This series contains some cleanups and new test suggestions from Catalin from an earlier discussion. https://lore.kernel.org/linux-mm/20201123142237.GF17833@gaia/ This patch (of 2): This adds validation tests for dirtiness after write protect conversion for each page table level. There are two new separate test types involved here. The first test ensures that a given page table entry does not become dirty after pxx_wrprotect(). This is important for platforms like arm64, which transfer and drop the hardware dirty bit (!PTE_RDONLY) to the software dirty bit while making the entry a write protected one. This test ensures that no fresh page table entry can be created with the hardware dirty bit set. The second test ensures that a given page table entry always preserves the dirty information across pxx_wrprotect(). This also adds two previously missing PUD level basic tests and, while here, fixes pxx_wrprotect() related typos in the documentation file.
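Sketched at the PTE level, the two new checks look like this (the same pattern repeats for PMD and PUD in the hunks below):

	/* wrprotect must not manufacture a dirty entry from a clean one */
	WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
	/* wrprotect must preserve an existing dirty bit */
	WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));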
Link: https://lkml.kernel.org/r/1611137241-26220-1-git-send-email-anshuman.khandual@arm.com Link: https://lkml.kernel.org/r/1611137241-26220-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Catalin Marinas Tested-by: Gerald Schaefer [s390] Cc: Christophe Leroy Cc: Gerald Schaefer Cc: Vineet Gupta Cc: Paul Walmsley Cc: Steven Price Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 8 ++--- mm/debug_vm_pgtable.c | 39 +++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index f3591ee3aaa8..552567d863b8 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -50,7 +50,7 @@ PTE Page Table Helpers +---------------------------+--------------------------------------------------+ | pte_mkwrite | Creates a writable PTE | +---------------------------+--------------------------------------------------+ -| pte_mkwrprotect | Creates a write protected PTE | +| pte_wrprotect | Creates a write protected PTE | +---------------------------+--------------------------------------------------+ | pte_mkspecial | Creates a special PTE | +---------------------------+--------------------------------------------------+ @@ -120,7 +120,7 @@ PMD Page Table Helpers +---------------------------+--------------------------------------------------+ | pmd_mkwrite | Creates a writable PMD | +---------------------------+--------------------------------------------------+ -| pmd_mkwrprotect | Creates a write protected PMD | +| pmd_wrprotect | Creates a write protected PMD | +---------------------------+--------------------------------------------------+ | pmd_mkspecial | Creates a special PMD | +---------------------------+--------------------------------------------------+ @@ -186,7 +186,7 @@ PUD Page Table Helpers +---------------------------+--------------------------------------------------+ | pud_mkwrite | Creates a writable PUD | +---------------------------+--------------------------------------------------+ -| pud_mkwrprotect | Creates a write protected PUD | +| pud_wrprotect | Creates a write protected PUD | +---------------------------+--------------------------------------------------+ | pud_mkdevmap | Creates a ZONE_DEVICE mapped PUD | +---------------------------+--------------------------------------------------+ @@ -224,7 +224,7 @@ HugeTLB Page Table Helpers +---------------------------+--------------------------------------------------+ | huge_pte_mkwrite | Creates a writable HugeTLB | +---------------------------+--------------------------------------------------+ -| huge_pte_mkwrprotect | Creates a write protected HugeTLB | +| huge_pte_wrprotect | Creates a write protected HugeTLB | +---------------------------+--------------------------------------------------+ | huge_ptep_get_and_clear | Clears a HugeTLB | +---------------------------+--------------------------------------------------+ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index c05d9dcf7891..1842d97522bb 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -63,6 +63,16 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) pte_t pte = pfn_pte(pfn, prot); pr_debug("Validating PTE basic\n"); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pte() to make sure that protection_map[idx] + * does not 
have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pte_dirty(pte_wrprotect(pte))); + WARN_ON(!pte_same(pte, pte)); WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); @@ -70,6 +80,8 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); + WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); + WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } static void __init pte_advanced_tests(struct mm_struct *mm, @@ -137,6 +149,17 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PMD basic\n"); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pmd() to make sure that protection_map[idx] + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pmd_dirty(pmd_wrprotect(pmd))); + + WARN_ON(!pmd_same(pmd, pmd)); WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); @@ -144,6 +167,8 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); + WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); /* * A huge page does not point to next level page table * entry. Hence this must qualify as pmd_bad(). @@ -257,11 +282,25 @@ static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) return; pr_debug("Validating PUD basic\n"); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pud() to make sure that protection_map[idx] + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pud_dirty(pud_wrprotect(pud))); + WARN_ON(!pud_same(pud, pud)); WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud)))); + WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud)))); WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); + WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); if (mm_pmd_folded(mm)) return; From 2e326c07bbe1eabeece4047ab5972ef34b15679b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Feb 2021 12:01:36 -0800 Subject: [PATCH 026/172] mm/debug_vm_pgtable/basic: iterate over entire protection_map[] Currently the basic tests just validate various page table transformations after starting with vm_get_page_prot(VM_READ|VM_WRITE|VM_EXEC) protection. Instead scan over the entire protection_map[] for better coverage. It also makes sure that all these basic page table transformation checks hold true irrespective of the starting protection value for the page table entry. There is also a slight change in the debug print format for basic tests to capture the protection value it is being tested with.
The modified output looks something like [pte_basic_tests ]: Validating PTE basic () [pte_basic_tests ]: Validating PTE basic (read) [pte_basic_tests ]: Validating PTE basic (write) [pte_basic_tests ]: Validating PTE basic (read|write) [pte_basic_tests ]: Validating PTE basic (exec) [pte_basic_tests ]: Validating PTE basic (read|exec) [pte_basic_tests ]: Validating PTE basic (write|exec) [pte_basic_tests ]: Validating PTE basic (read|write|exec) [pte_basic_tests ]: Validating PTE basic (shared) [pte_basic_tests ]: Validating PTE basic (read|shared) [pte_basic_tests ]: Validating PTE basic (write|shared) [pte_basic_tests ]: Validating PTE basic (read|write|shared) [pte_basic_tests ]: Validating PTE basic (exec|shared) [pte_basic_tests ]: Validating PTE basic (read|exec|shared) [pte_basic_tests ]: Validating PTE basic (write|exec|shared) [pte_basic_tests ]: Validating PTE basic (read|write|exec|shared) This also adds a missing 'struct mm_struct *' argument to pud_basic_tests(). This was never exposed before, as PUD based THP is available only on the x86 platform, where the mm_pmd_folded(mm) call is macro-replaced without requiring the mm_struct, i.e. __is_defined(__PAGETABLE_PMD_FOLDED). Link: https://lkml.kernel.org/r/1611137241-26220-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Tested-by: Gerald Schaefer [s390] Reviewed-by: Steven Price Suggested-by: Catalin Marinas Cc: Christophe Leroy Cc: Gerald Schaefer Cc: Paul Walmsley Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug_vm_pgtable.c | 47 ++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 1842d97522bb..a9bd6ce1ba02 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -58,11 +58,13 @@ #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) -static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_basic_tests(unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pte_t pte = pfn_pte(pfn, prot); + unsigned long val = idx, *ptr = &val; - pr_debug("Validating PTE basic\n"); + pr_debug("Validating PTE basic (%pGv)\n", ptr); /* * This test needs to be executed after the given page table entry @@ -141,14 +143,16 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_basic_tests(unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pmd_t pmd = pfn_pmd(pfn, prot); + unsigned long val = idx, *ptr = &val; if (!has_transparent_hugepage()) return; - pr_debug("Validating PMD basic\n"); + pr_debug("Validating PMD basic (%pGv)\n", ptr); /* * This test needs to be executed after the given page table entry @@ -274,14 +278,16 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pud_t pud = pfn_pud(pfn, prot); + unsigned long val = idx, *ptr = &val; if (!has_transparent_hugepage()) return; - pr_debug("Validating PUD basic\n"); + pr_debug("Validating PUD basic (%pGv)\n", ptr); /* * This test needs to be executed after the given page table entry @@
-398,7 +404,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) #endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -411,8 +417,8 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_basic_tests(unsigned long pfn, int idx) { } +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, @@ -938,6 +944,7 @@ static int __init debug_vm_pgtable(void) unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned, p4d_aligned, pgd_aligned; spinlock_t *ptl = NULL; + int idx; pr_info("Validating architecture page table helpers\n"); prot = vm_get_page_prot(VMFLAGS); @@ -1002,9 +1009,25 @@ static int __init debug_vm_pgtable(void) saved_pmdp = pmd_offset(pudp, 0UL); saved_ptep = pmd_pgtable(pmd); - pte_basic_tests(pte_aligned, prot); - pmd_basic_tests(pmd_aligned, prot); - pud_basic_tests(pud_aligned, prot); + /* + * Iterate over the protection_map[] to make sure that all + * the basic page table transformation validations just hold + * true irrespective of the starting protection value for a + * given page table entry. + */ + for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { + pte_basic_tests(pte_aligned, idx); + pmd_basic_tests(pmd_aligned, idx); + pud_basic_tests(mm, pud_aligned, idx); + } + + /* + * Both P4D and PGD level tests are very basic which do not + * involve creating page table entries from the protection + * value and the given pfn. Hence just keep them out from + * the above iteration for now to save some test execution + * time. + */ p4d_basic_tests(p4d_aligned, prot); pgd_basic_tests(pgd_aligned, prot); From 1d2cae8ea1cf082df8258fcb5ab35de29821c450 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:01:39 -0800 Subject: [PATCH 027/172] mm/page_owner: use helper function zone_end_pfn() to get end_pfn Commit 108bcc96ef70 ("mm: add & use zone_end_pfn() and zone_spans_pfn()") introduced the helper zone_end_pfn() to calculate the zone end pfn. But pagetypeinfo_showmixedcount_print() forgot to use it. Also, the initialization of the local variable pfn is duplicated; remove one.
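For reference, the helper (as defined in include/linux/mmzone.h) computes exactly the value the open-coded arithmetic produced:

	static inline unsigned long zone_end_pfn(const struct zone *zone)
	{
		return zone->zone_start_pfn + zone->spanned_pages;
	}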
Link: https://lkml.kernel.org/r/20210123070538.5861-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index af464bb7fbe7..d15c7c4994f5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -263,8 +263,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn, block_end_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; int pageblock_mt, page_mt; int i; From 1f7ef657740344541645349a8bece90cbff898f5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Feb 2021 12:01:42 -0800 Subject: [PATCH 028/172] mm/filemap: remove unused parameter and change to void type for replace_page_cache_page() Since commit 74d609585d8b ("page cache: Add and replace pages using the XArray") was merged, replace_page_cache_page() cannot fail and always returns 0, so we can remove the redundant return value and make the function void. Moreover, remove the unused gfp_mask parameter. Link: https://lkml.kernel.org/r/609c30e5274ba15d8b90c872fd0d8ac437a9b2bb.1610071401.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Matthew Wilcox Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 +----- include/linux/pagemap.h | 2 +- mm/filemap.c | 7 +------ 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 588f8d1240aa..c6636b4c4ccf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); - if (err) { - unlock_page(newpage); - goto out_put_old; - } + replace_page_cache_page(oldpage, newpage); get_page(newpage); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d5570deff400..74e466e5a2ba 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -757,7 +757,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); -int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); diff --git a/mm/filemap.c b/mm/filemap.c index 6ff2a3fb0dc7..7dfed3454a2e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range); * replace_page_cache_page - replace a pagecache page with a new one * @old: page to be replaced * @new: page to replace with - * @gfp_mask: allocation mode * * This function replaces a page in the pagecache with a new one. On * success it acquires the pagecache reference for the new page and @@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); * caller must do that. * * The remove + add is atomic. This function cannot fail.
- * - * Return: %0 */ -int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +void replace_page_cache_page(struct page *old, struct page *new) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *) = mapping->a_ops->freepage; @@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (freepage) freepage(old); put_page(old); - - return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); From ab2125df921d991a3c8a4fdcfe617ef6cad6b484 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 24 Feb 2021 12:01:45 -0800 Subject: [PATCH 029/172] mm/filemap: don't revert iter on -EIOCBQUEUED Currently, if I/O is enqueued for async execution, the direct paths of generic_file_{read,write}_iter() will always revert the iter. There are no users expecting that, and that is also costly. Leave iterators as is on -EIOCBQUEUED. Link: https://lkml.kernel.org/r/f5247b60e7abbd2ff850cd108491f53a2e0c501a.1610207781.git.asml.silence@gmail.com Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 7dfed3454a2e..774a869da106 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2615,7 +2615,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += retval; count -= retval; } - iov_iter_revert(iter, count - iov_iter_count(iter)); + if (retval != -EIOCBQUEUED) + iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter @@ -3426,7 +3427,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } iocb->ki_pos = pos; } - iov_iter_revert(from, write_len - iov_iter_count(from)); + if (written != -EIOCBQUEUED) + iov_iter_revert(from, write_len - iov_iter_count(from)); out: return written; } From 3a6bae48390d25a9937978a6c09ccc400b6efcbd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:01:49 -0800 Subject: [PATCH 030/172] mm/filemap: rename generic_file_buffered_read subfunctions Patch series "Refactor generic_file_buffered_read", v5. This is a combination of Christoph's work to refactor generic_file_buffered_read() and some of my large-page support which was disrupted by Kent's refactoring of generic_file_buffered_read. This patch (of 18): The recent split of generic_file_buffered_read() created some very long function names which are hard to distinguish from each other.
Rename as follows: generic_file_buffered_read_readpage -> filemap_read_page generic_file_buffered_read_pagenotuptodate -> filemap_update_page generic_file_buffered_read_no_cached_page -> filemap_create_page generic_file_buffered_read_get_pages -> filemap_get_pages Link: https://lkml.kernel.org/r/20210122160140.223228-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210122160140.223228-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 774a869da106..fbfea449f417 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2178,11 +2178,8 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) return lock_page_killable(page); } -static struct page * -generic_file_buffered_read_readpage(struct kiocb *iocb, - struct file *filp, - struct address_space *mapping, - struct page *page) +static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp, + struct address_space *mapping, struct page *page) { struct file_ra_state *ra = &filp->f_ra; int error; @@ -2233,12 +2230,9 @@ generic_file_buffered_read_readpage(struct kiocb *iocb, return page; } -static struct page * -generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, - struct file *filp, - struct iov_iter *iter, - struct page *page, - loff_t pos, loff_t count) +static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp, + struct iov_iter *iter, struct page *page, loff_t pos, + loff_t count) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -2301,12 +2295,11 @@ page_not_up_to_date_locked: return page; } - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); + return filemap_read_page(iocb, filp, mapping, page); } -static struct page * -generic_file_buffered_read_no_cached_page(struct kiocb *iocb, - struct iov_iter *iter) +static struct page *filemap_create_page(struct kiocb *iocb, + struct iov_iter *iter) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; @@ -2317,10 +2310,6 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, if (iocb->ki_flags & IOCB_NOIO) return ERR_PTR(-EAGAIN); - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ page = page_cache_alloc(mapping); if (!page) return ERR_PTR(-ENOMEM); @@ -2332,13 +2321,11 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb, return error != -EEXIST ? 
ERR_PTR(error) : NULL; } - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); + return filemap_read_page(iocb, filp, mapping, page); } -static int generic_file_buffered_read_get_pages(struct kiocb *iocb, - struct iov_iter *iter, - struct page **pages, - unsigned int nr) +static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, + struct page **pages, unsigned int nr) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; @@ -2365,7 +2352,7 @@ find_page: if (nr_got) goto got_pages; - pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); + pages[0] = filemap_create_page(iocb, iter); err = PTR_ERR_OR_ZERO(pages[0]); if (!IS_ERR_OR_NULL(pages[0])) nr_got = 1; @@ -2399,8 +2386,8 @@ got_pages: break; } - page = generic_file_buffered_read_pagenotuptodate(iocb, - filp, iter, page, pg_pos, pg_count); + page = filemap_update_page(iocb, filp, iter, page, + pg_pos, pg_count); if (IS_ERR_OR_NULL(page)) { for (j = i + 1; j < nr_got; j++) put_page(pages[j]); @@ -2479,8 +2466,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, iocb->ki_flags |= IOCB_NOWAIT; i = 0; - pg_nr = generic_file_buffered_read_get_pages(iocb, iter, - pages, nr_pages); + pg_nr = filemap_get_pages(iocb, iter, pages, nr_pages); if (pg_nr < 0) { error = pg_nr; break; From 0c7c575df56b957390206deb018c41acbb412159 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:01:52 -0800 Subject: [PATCH 031/172] mm/filemap: remove dynamically allocated array from filemap_read Increasing the batch size runs into diminishing returns. It's probably better to make, eg, three calls to filemap_get_pages() than it is to call into kmalloc(). Link: https://lkml.kernel.org/r/20210122160140.223228-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Miaohe Lin Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index fbfea449f417..f621a7de5833 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2431,8 +2431,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; - unsigned int nr_pages = min_t(unsigned int, 512, + struct page *pages[PAGEVEC_SIZE]; + unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE, ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - (iocb->ki_pos >> PAGE_SHIFT)); int i, pg_nr, error = 0; @@ -2446,14 +2446,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - if (nr_pages > ARRAY_SIZE(pages_onstack)) - pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - - if (!pages) { - pages = pages_onstack; - nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); - } - do { cond_resched(); @@ -2538,9 +2530,6 @@ put_pages: file_accessed(filp); - if (pages != pages_onstack) - kfree(pages); - return written ? 
written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); From ff993ba130009b1b8afb06206887e1e1f5b34591 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:01:55 -0800 Subject: [PATCH 032/172] mm/filemap: convert filemap_get_pages to take a pagevec Using a pagevec lets us keep the pages and the number of pages together which simplifies a lot of the calling conventions. Link: https://lkml.kernel.org/r/20210122160140.223228-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Cc: Kent Overstreet Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 82 ++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f621a7de5833..aadf9bd340a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2325,22 +2325,22 @@ static struct page *filemap_create_page(struct kiocb *iocb, } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, - struct page **pages, unsigned int nr) + struct pagevec *pvec) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - int i, j, nr_got, err = 0; + unsigned int nr = min_t(unsigned long, last_index - index, PAGEVEC_SIZE); + int i, j, err = 0; - nr = min_t(unsigned long, last_index - index, nr); find_page: if (fatal_signal_pending(current)) return -EINTR; - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) + pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages); + if (pvec->nr) goto got_pages; if (iocb->ki_flags & IOCB_NOIO) @@ -2348,17 +2348,17 @@ find_page: page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) + pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages); + if (pvec->nr) goto got_pages; - pages[0] = filemap_create_page(iocb, iter); - err = PTR_ERR_OR_ZERO(pages[0]); - if (!IS_ERR_OR_NULL(pages[0])) - nr_got = 1; + pvec->pages[0] = filemap_create_page(iocb, iter); + err = PTR_ERR_OR_ZERO(pvec->pages[0]); + if (!IS_ERR_OR_NULL(pvec->pages[0])) + pvec->nr = 1; got_pages: - for (i = 0; i < nr_got; i++) { - struct page *page = pages[i]; + for (i = 0; i < pvec->nr; i++) { + struct page *page = pvec->pages[i]; pgoff_t pg_index = index + i; loff_t pg_pos = max(iocb->ki_pos, (loff_t) pg_index << PAGE_SHIFT); @@ -2366,9 +2366,9 @@ got_pages: if (PageReadahead(page)) { if (iocb->ki_flags & IOCB_NOIO) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; + for (j = i; j < pvec->nr; j++) + put_page(pvec->pages[j]); + pvec->nr = i; err = -EAGAIN; break; } @@ -2379,9 +2379,9 @@ got_pages: if (!PageUptodate(page)) { if ((iocb->ki_flags & IOCB_NOWAIT) || ((iocb->ki_flags & IOCB_WAITQ) && i)) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; + for (j = i; j < pvec->nr; j++) + put_page(pvec->pages[j]); + pvec->nr = i; err = -EAGAIN; break; } @@ -2389,17 +2389,17 @@ got_pages: page = filemap_update_page(iocb, filp, iter, page, pg_pos, pg_count); if (IS_ERR_OR_NULL(page)) { - for (j = i + 1; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; + for (j = i + 1; j < pvec->nr; j++) + put_page(pvec->pages[j]); + pvec->nr = i; err = PTR_ERR_OR_ZERO(page); break; } } } - if (likely(nr_got)) - return nr_got; 
+ if (likely(pvec->nr)) + return 0; if (err) return err; /* @@ -2431,11 +2431,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct page *pages[PAGEVEC_SIZE]; - unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE, - ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - - (iocb->ki_pos >> PAGE_SHIFT)); - int i, pg_nr, error = 0; + struct pagevec pvec; + int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2457,12 +2454,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, if ((iocb->ki_flags & IOCB_WAITQ) && written) iocb->ki_flags |= IOCB_NOWAIT; - i = 0; - pg_nr = filemap_get_pages(iocb, iter, pages, nr_pages); - if (pg_nr < 0) { - error = pg_nr; + error = filemap_get_pages(iocb, iter, &pvec); + if (error < 0) break; - } /* * i_size must be checked after we know the pages are Uptodate. @@ -2478,9 +2472,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > + while ((iocb->ki_pos >> PAGE_SHIFT) + pvec.nr > (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) - put_page(pages[--pg_nr]); + put_page(pvec.pages[--pvec.nr]); /* * Once we start copying data, we don't want to be touching any @@ -2494,11 +2488,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pages[0]); - for (i = 1; i < pg_nr; i++) - mark_page_accessed(pages[i]); + mark_page_accessed(pvec.pages[0]); + for (i = 1; i < pagevec_count(&pvec); i++) + mark_page_accessed(pvec.pages[i]); - for (i = 0; i < pg_nr; i++) { + for (i = 0; i < pagevec_count(&pvec); i++) { unsigned int offset = iocb->ki_pos & ~PAGE_MASK; unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); @@ -2510,9 +2504,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, * before reading the page on the kernel side. */ if (writably_mapped) - flush_dcache_page(pages[i]); + flush_dcache_page(pvec.pages[i]); - copied = copy_page_to_iter(pages[i], offset, bytes, iter); + copied = copy_page_to_iter(pvec.pages[i], offset, bytes, iter); written += copied; iocb->ki_pos += copied; @@ -2524,8 +2518,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pagevec_count(&pvec); i++) + put_page(pvec.pages[i]); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); From cbd59c48ae2bcadc4a7599c29cf32fd3f9b78251 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:01:59 -0800 Subject: [PATCH 033/172] mm/filemap: use head pages in generic_file_buffered_read Add filemap_get_read_batch() which returns the head pages which represent a contiguous array of bytes in the file. It also stops when encountering a page marked as Readahead or !Uptodate (but does return that page) so it can be handled appropriately by filemap_get_pages(). That lets us remove the loop in filemap_get_pages() and check only the last page. 
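As a worked example with hypothetical numbers: for a 2MB THP (512 subpages) whose head sits at index 512, the batch returns just the head page and the walk resumes past the whole compound page:

	/* index arithmetic from the new walk: 512 + 512 - 1 = 1023 */
	xas.xa_index = head->index + thp_nr_pages(head) - 1;
	xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;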
Link: https://lkml.kernel.org/r/20210122160140.223228-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 122 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 37 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index aadf9bd340a4..4fce9735e172 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2178,6 +2178,51 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) return lock_page_killable(page); } +/* + * filemap_get_read_batch - Get a batch of pages for read + * + * Get a batch of pages which represent a contiguous range of bytes + * in the file. No tail pages will be returned. If @index is in the + * middle of a THP, the entire THP will be returned. The last page in + * the batch may have Readahead set or be not Uptodate so that the + * caller can take the appropriate action. + */ +static void filemap_get_read_batch(struct address_space *mapping, + pgoff_t index, pgoff_t max, struct pagevec *pvec) +{ + XA_STATE(xas, &mapping->i_pages, index); + struct page *head; + + rcu_read_lock(); + for (head = xas_load(&xas); head; head = xas_next(&xas)) { + if (xas_retry(&xas, head)) + continue; + if (xas.xa_index > max || xa_is_value(head)) + break; + if (!page_cache_get_speculative(head)) + goto retry; + + /* Has the page moved or been split? */ + if (unlikely(head != xas_reload(&xas))) + goto put_page; + + if (!pagevec_add(pvec, head)) + break; + if (!PageUptodate(head)) + break; + if (PageReadahead(head)) + break; + xas.xa_index = head->index + thp_nr_pages(head) - 1; + xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); + } + rcu_read_unlock(); +} + static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp, struct address_space *mapping, struct page *page) { @@ -2331,15 +2376,15 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; - pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - unsigned int nr = min_t(unsigned long, last_index - index, PAGEVEC_SIZE); - int i, j, err = 0; + pgoff_t last_index; + int err = 0; + last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); find_page: if (fatal_signal_pending(current)) return -EINTR; - pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages); + filemap_get_read_batch(mapping, index, last_index, pvec); if (pvec->nr) goto got_pages; @@ -2348,29 +2393,30 @@ find_page: page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages); + filemap_get_read_batch(mapping, index, last_index, pvec); if (pvec->nr) goto got_pages; pvec->pages[0] = filemap_create_page(iocb, iter); err = PTR_ERR_OR_ZERO(pvec->pages[0]); - if (!IS_ERR_OR_NULL(pvec->pages[0])) - pvec->nr = 1; + if (IS_ERR_OR_NULL(pvec->pages[0])) + goto err; + pvec->nr = 1; + return 0; got_pages: - for (i = 0; i < pvec->nr; i++) { - struct page *page = pvec->pages[i]; - pgoff_t pg_index = index + i; + { + struct page *page = pvec->pages[pvec->nr - 1]; + pgoff_t pg_index = page->index; loff_t pg_pos = max(iocb->ki_pos, (loff_t) pg_index << PAGE_SHIFT); loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; if 
(PageReadahead(page)) { if (iocb->ki_flags & IOCB_NOIO) { - for (j = i; j < pvec->nr; j++) - put_page(pvec->pages[j]); - pvec->nr = i; + put_page(page); + pvec->nr--; err = -EAGAIN; - break; + goto err; } page_cache_async_readahead(mapping, ra, filp, page, pg_index, last_index - pg_index); @@ -2378,26 +2424,23 @@ got_pages: if (!PageUptodate(page)) { if ((iocb->ki_flags & IOCB_NOWAIT) || - ((iocb->ki_flags & IOCB_WAITQ) && i)) { - for (j = i; j < pvec->nr; j++) - put_page(pvec->pages[j]); - pvec->nr = i; + ((iocb->ki_flags & IOCB_WAITQ) && pvec->nr > 1)) { + put_page(page); + pvec->nr--; err = -EAGAIN; - break; + goto err; } page = filemap_update_page(iocb, filp, iter, page, pg_pos, pg_count); if (IS_ERR_OR_NULL(page)) { - for (j = i + 1; j < pvec->nr; j++) - put_page(pvec->pages[j]); - pvec->nr = i; + pvec->nr--; err = PTR_ERR_OR_ZERO(page); - break; } } } +err: if (likely(pvec->nr)) return 0; if (err) @@ -2442,6 +2485,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + pagevec_init(&pvec); do { cond_resched(); @@ -2469,13 +2513,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_pages; - end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - while ((iocb->ki_pos >> PAGE_SHIFT) + pvec.nr > - (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) - put_page(pvec.pages[--pvec.nr]); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: @@ -2489,24 +2528,32 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) mark_page_accessed(pvec.pages[0]); - for (i = 1; i < pagevec_count(&pvec); i++) - mark_page_accessed(pvec.pages[i]); for (i = 0; i < pagevec_count(&pvec); i++) { - unsigned int offset = iocb->ki_pos & ~PAGE_MASK; - unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, - PAGE_SIZE - offset); - unsigned int copied; + struct page *page = pvec.pages[i]; + size_t page_size = thp_size(page); + size_t offset = iocb->ki_pos & (page_size - 1); + size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, + page_size - offset); + size_t copied; + if (end_offset < page_offset(page)) + break; + if (i > 0) + mark_page_accessed(page); /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (writably_mapped) - flush_dcache_page(pvec.pages[i]); + if (writably_mapped) { + int j; - copied = copy_page_to_iter(pvec.pages[i], offset, bytes, iter); + for (j = 0; j < thp_nr_pages(page); j++) + flush_dcache_page(page + j); + } + + copied = copy_page_to_iter(page, offset, bytes, iter); written += copied; iocb->ki_pos += copied; @@ -2520,6 +2567,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, put_pages: for (i = 0; i < pagevec_count(&pvec); i++) put_page(pvec.pages[i]); + pagevec_reinit(&pvec); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); From 4805462598113f350838d612d0895db2dbb3992b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:02 -0800 Subject: [PATCH 034/172] mm/filemap: pass a sleep state to put_and_wait_on_page_locked This is prep work for the next patch, but I think at least one of the current callers would prefer a killable sleep to an uninterruptible one. 
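A usage sketch for such a caller (hypothetical; the hunks below keep TASK_UNINTERRUPTIBLE for now):

	if (!get_page_unless_zero(page))
		goto out;
	spin_unlock(ptl);
	/* drops our reference, then sleeps until the page is unlocked */
	if (put_and_wait_on_page_locked(page, TASK_KILLABLE))
		return -EINTR;	/* woken early by a fatal signal */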
Link: https://lkml.kernel.org/r/20210122160140.223228-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 +-- mm/filemap.c | 7 +++++-- mm/huge_memory.c | 4 ++-- mm/migrate.c | 4 ++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 74e466e5a2ba..bd629d676a27 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page) return wait_on_page_bit_killable(compound_head(page), PG_locked); } -extern void put_and_wait_on_page_locked(struct page *page); - +int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); diff --git a/mm/filemap.c b/mm/filemap.c index 4fce9735e172..389a20eec9b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1384,20 +1384,23 @@ static int wait_on_page_locked_async(struct page *page, /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. + * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @page. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the page to * come unlocked. After this function returns, the caller should not * dereference @page. + * + * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. */ -void put_and_wait_on_page_locked(struct page *page) +int put_and_wait_on_page_locked(struct page *page, int state) { wait_queue_head_t *q; page = compound_head(page); q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); + return wait_on_page_bit_common(q, page, PG_locked, state, DROP); } /** diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 91ca9b103ee5..f655137536eb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1439,7 +1439,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } @@ -1475,7 +1475,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 20ca887ea769..4ec727adfaa2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); From bd8a1f3655a704b9a1924fb3feffa3ecd6e5f8ae Mon Sep 17 00:00:00 2001 From: 
"Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:05 -0800 Subject: [PATCH 035/172] mm/filemap: support readpage splitting a page For page splitting to succeed, the thread asking to split the page has to be the only one with a reference to the page. Calling wait_on_page_locked() while holding a reference to the page will effectively prevent this from happening with sufficient threads waiting on the same page. Use put_and_wait_on_page_locked() to sleep without holding a reference to the page, then retry the page lookup after the page is unlocked. Since we now get the page lock a little earlier in filemap_update_page(), we can eliminate a number of duplicate checks. The original intent (commit ebded02788b5 ("avoid unnecessary calls to lock_page when waiting for IO to complete during a read")) behind getting the page lock later was to avoid re-locking the page after it has been brought uptodate by another thread. We still avoid that because we go through the normal lookup path again after the winning thread has brought the page uptodate. Link: https://lkml.kernel.org/r/20210122160140.223228-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 76 ++++++++++++++++------------------------------------ 1 file changed, 23 insertions(+), 53 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 389a20eec9b8..f62b589cbfe4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1373,14 +1373,6 @@ static int __wait_on_page_locked_async(struct page *page, return ret; } -static int wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait) -{ - if (!PageLocked(page)) - return 0; - return __wait_on_page_locked_async(compound_head(page), wait, false); -} - /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -2286,64 +2278,42 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp, struct inode *inode = mapping->host; int error; - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ if (iocb->ki_flags & IOCB_WAITQ) { - error = wait_on_page_locked_async(page, - iocb->ki_waitq); + error = lock_page_async(page, iocb->ki_waitq); + if (error) { + put_page(page); + return ERR_PTR(error); + } } else { - error = wait_on_page_locked_killable(page); + if (!trylock_page(page)) { + put_and_wait_on_page_locked(page, TASK_KILLABLE); + return NULL; + } } - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } - if (PageUptodate(page)) - return page; + if (!page->mapping) + goto truncated; + if (PageUptodate(page)) + goto uptodate; if (inode->i_blkbits == PAGE_SHIFT || !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; + goto readpage; /* pipes can't handle partially uptodate pages */ if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ - if (!page->mapping) - goto page_not_up_to_date_locked; + goto readpage; if (!mapping->a_ops->is_partially_uptodate(page, - pos & ~PAGE_MASK, count)) - goto page_not_up_to_date_locked; + pos & (thp_size(page) - 1), count)) + goto readpage; +uptodate: unlock_page(page); return page; -page_not_up_to_date: - /* Get exclusive access to the page ... 
*/ - error = lock_page_for_iocb(iocb, page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - return NULL; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - return page; - } - +readpage: return filemap_read_page(iocb, filp, mapping, page); +truncated: + unlock_page(page); + put_page(page); + return NULL; } static struct page *filemap_create_page(struct kiocb *iocb, From f32b5dd721fb8861f3c1b8e7c06ac978236d0236 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:09 -0800 Subject: [PATCH 036/172] mm/filemap: inline __wait_on_page_locked_async into caller The previous patch removed wait_on_page_locked_async(), so inline __wait_on_page_locked_async into __lock_page_async(). Link: https://lkml.kernel.org/r/20210122160140.223228-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 53 ++++++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f62b589cbfe4..43448d4d66f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1343,36 +1343,6 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); -static int __wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait, bool set) -{ - struct wait_queue_head *q = page_waitqueue(page); - int ret = 0; - - wait->page = page; - wait->bit_nr = PG_locked; - - spin_lock_irq(&q->lock); - __add_wait_queue_entry_tail(q, &wait->wait); - SetPageWaiters(page); - if (set) - ret = !trylock_page(page); - else - ret = PageLocked(page); - /* - * If we were successful now, we know we're still on the - * waitqueue as we're still under the lock. This means it's - * safe to remove and return success, we know the callback - * isn't going to trigger. - */ - if (!ret) - __remove_wait_queue(q, &wait->wait); - else - ret = -EIOCBQUEUED; - spin_unlock_irq(&q->lock); - return ret; -} - /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. @@ -1548,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_async(struct page *page, struct wait_page_queue *wait) { - return __wait_on_page_locked_async(page, wait, true); + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + ret = !trylock_page(page); + /* + * If we were successful now, we know we're still on the + * waitqueue as we're still under the lock. This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; } /* From 33a0f5c6b34f58e632f1855ff29228d49bc23bcc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:12 -0800 Subject: [PATCH 037/172] mm/filemap: don't call ->readpage if IOCB_WAITQ is set The readpage operation can block in many (most?) filesystems, so we should punt to a work queue instead of calling it. 
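Concretely, the flag check guarding filemap_read_page() grows IOCB_WAITQ (sketch of the hunk below), so every non-blocking flavour of read bails out with -EAGAIN before ->readpage can block:

	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
		unlock_page(page);
		put_page(page);
		return ERR_PTR(-EAGAIN);	/* retry from a context that may sleep */
	}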
This was the last caller of lock_page_for_iocb(), so remove it. Link: https://lkml.kernel.org/r/20210122160140.223228-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 43448d4d66f3..8bd1bb375d07 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2154,16 +2154,6 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } -static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) -{ - if (iocb->ki_flags & IOCB_WAITQ) - return lock_page_async(page, iocb->ki_waitq); - else if (iocb->ki_flags & IOCB_NOWAIT) - return trylock_page(page) ? 0 : -EAGAIN; - else - return lock_page_killable(page); -} - /* * filemap_get_read_batch - Get a batch of pages for read * @@ -2215,7 +2205,7 @@ static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp, struct file_ra_state *ra = &filp->f_ra; int error; - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) { unlock_page(page); put_page(page); return ERR_PTR(-EAGAIN); @@ -2236,7 +2226,7 @@ static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp, } if (!PageUptodate(page)) { - error = lock_page_for_iocb(iocb, page); + error = lock_page_killable(page); if (unlikely(error)) { put_page(page); return ERR_PTR(error); From 68430303c84e1fd457a05f424b02ea8393708552 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:15 -0800 Subject: [PATCH 038/172] mm/filemap: change filemap_read_page calling conventions Make this function more generic by passing the file instead of the iocb. Check in the callers whether we should call readpage or not. Also make it return an errno / 0 / AOP_TRUNCATED_PAGE, and make calling put_page() the caller's responsibility. Link: https://lkml.kernel.org/r/20210122160140.223228-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 89 +++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 47 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 8bd1bb375d07..0b7310971678 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2199,56 +2199,38 @@ retry: rcu_read_unlock(); } -static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp, - struct address_space *mapping, struct page *page) +static int filemap_read_page(struct file *file, struct address_space *mapping, + struct page *page) { - struct file_ra_state *ra = &filp->f_ra; int error; - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. + * A previous I/O error may have been due to temporary failures, + * eg. multipath errors. PG_error will be set again if readpage + * fails. */ ClearPageError(page); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); - - if (unlikely(error)) { - put_page(page); - return error != AOP_TRUNCATED_PAGE ? 
ERR_PTR(error) : NULL; - } + error = mapping->a_ops->readpage(file, page); + if (error) + return error; + if (PageUptodate(page)) + return 0; + error = lock_page_killable(page); + if (error) + return error; if (!PageUptodate(page)) { - error = lock_page_killable(page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); + if (page->mapping == NULL) { + /* page truncated */ + error = AOP_TRUNCATED_PAGE; + } else { + shrink_readahead_size_eio(&file->f_ra); + error = -EIO; } - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - return NULL; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - put_page(page); - return ERR_PTR(-EIO); - } - unlock_page(page); } - - return page; + unlock_page(page); + return error; } static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp, @@ -2290,7 +2272,18 @@ uptodate: return page; readpage: - return filemap_read_page(iocb, filp, mapping, page); + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EAGAIN); + } + error = filemap_read_page(iocb->ki_filp, mapping, page); + if (!error) + return page; + put_page(page); + if (error == AOP_TRUNCATED_PAGE) + return NULL; + return ERR_PTR(error); truncated: unlock_page(page); put_page(page); @@ -2306,7 +2299,7 @@ static struct page *filemap_create_page(struct kiocb *iocb, struct page *page; int error; - if (iocb->ki_flags & IOCB_NOIO) + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) return ERR_PTR(-EAGAIN); page = page_cache_alloc(mapping); @@ -2315,12 +2308,14 @@ static struct page *filemap_create_page(struct kiocb *iocb, error = add_to_page_cache_lru(page, mapping, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - return error != -EEXIST ? ERR_PTR(error) : NULL; - } - - return filemap_read_page(iocb, filp, mapping, page); + if (!error) + error = filemap_read_page(iocb->ki_filp, mapping, page); + if (!error) + return page; + put_page(page); + if (error == -EEXIST || error == AOP_TRUNCATED_PAGE) + return NULL; + return ERR_PTR(error); } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, From f253e1854ce8441eefe98f193def2c477a017d81 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:18 -0800 Subject: [PATCH 039/172] mm/filemap: change filemap_create_page calling conventions By moving the iocb flag checks to the caller, we can pass the file and the page index instead of the iocb. It never needed the iter. By passing the pagevec, we can return an errno (or AOP_TRUNCATED_PAGE) instead of an ERR_PTR. 
Link: https://lkml.kernel.org/r/20210122160140.223228-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 53 ++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b7310971678..c0e5aff274f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2290,32 +2290,33 @@ truncated: return NULL; } -static struct page *filemap_create_page(struct kiocb *iocb, - struct iov_iter *iter) +static int filemap_create_page(struct file *file, + struct address_space *mapping, pgoff_t index, + struct pagevec *pvec) { - struct file *filp = iocb->ki_filp; - struct address_space *mapping = filp->f_mapping; - pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; struct page *page; int error; - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) - return ERR_PTR(-EAGAIN); - page = page_cache_alloc(mapping); if (!page) - return ERR_PTR(-ENOMEM); + return -ENOMEM; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (!error) - error = filemap_read_page(iocb->ki_filp, mapping, page); - if (!error) - return page; + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error == -EEXIST) + error = AOP_TRUNCATED_PAGE; + if (error) + goto error; + + error = filemap_read_page(file, mapping, page); + if (error) + goto error; + + pagevec_add(pvec, page); + return 0; +error: put_page(page); - if (error == -EEXIST || error == AOP_TRUNCATED_PAGE) - return NULL; - return ERR_PTR(error); + return error; } static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, @@ -2343,15 +2344,15 @@ find_page: page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); filemap_get_read_batch(mapping, index, last_index, pvec); - if (pvec->nr) - goto got_pages; - - pvec->pages[0] = filemap_create_page(iocb, iter); - err = PTR_ERR_OR_ZERO(pvec->pages[0]); - if (IS_ERR_OR_NULL(pvec->pages[0])) - goto err; - pvec->nr = 1; - return 0; + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + err = filemap_create_page(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, pvec); + if (err == AOP_TRUNCATED_PAGE) + goto find_page; + return err; + } got_pages: { struct page *page = pvec->pages[pvec->nr - 1]; From 4612aeef09ec492ca5877e06f0dbac5383da5e88 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:22 -0800 Subject: [PATCH 040/172] mm/filemap: convert filemap_update_page to return an errno Use AOP_TRUNCATED_PAGE to indicate that no error occurred, but the page we looked up is no longer valid. In this case, the reference to the page will have been removed; if we hit any other error, the caller will release the reference. 
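To illustrate the caller-side contract (an illustrative sketch distilled from the diff below; AOP_TRUNCATED_PAGE is a positive value, so err < 0 identifies a real error):

	err = filemap_update_page(iocb, mapping, iter, page, pg_pos, pg_count);
	if (err) {
		if (err < 0)	/* real error: we still hold a page reference */
			put_page(page);
		pvec->nr--;	/* on AOP_TRUNCATED_PAGE the reference is already gone */
	}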
Link: https://lkml.kernel.org/r/20210122160140.223228-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c0e5aff274f3..67c4df82aa41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2233,24 +2233,21 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, return error; } -static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp, - struct iov_iter *iter, struct page *page, loff_t pos, - loff_t count) +static int filemap_update_page(struct kiocb *iocb, + struct address_space *mapping, struct iov_iter *iter, + struct page *page, loff_t pos, loff_t count) { - struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; int error; if (iocb->ki_flags & IOCB_WAITQ) { error = lock_page_async(page, iocb->ki_waitq); - if (error) { - put_page(page); - return ERR_PTR(error); - } + if (error) + return error; } else { if (!trylock_page(page)) { put_and_wait_on_page_locked(page, TASK_KILLABLE); - return NULL; + return AOP_TRUNCATED_PAGE; } } @@ -2269,25 +2266,21 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp, goto readpage; uptodate: unlock_page(page); - return page; + return 0; readpage: if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) { unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); + return -EAGAIN; } error = filemap_read_page(iocb->ki_filp, mapping, page); - if (!error) - return page; - put_page(page); if (error == AOP_TRUNCATED_PAGE) - return NULL; - return ERR_PTR(error); + put_page(page); + return error; truncated: unlock_page(page); put_page(page); - return NULL; + return AOP_TRUNCATED_PAGE; } static int filemap_create_page(struct file *file, @@ -2381,11 +2374,12 @@ got_pages: goto err; } - page = filemap_update_page(iocb, filp, iter, page, + err = filemap_update_page(iocb, mapping, iter, page, pg_pos, pg_count); - if (IS_ERR_OR_NULL(page)) { + if (err) { + if (err < 0) + put_page(page); pvec->nr--; - err = PTR_ERR_OR_ZERO(page); } } } @@ -2393,6 +2387,8 @@ got_pages: err: if (likely(pvec->nr)) return 0; + if (err == AOP_TRUNCATED_PAGE) + goto find_page; if (err) return err; /* From 87d1d7b688319ae1580f057faa460d7f0b381430 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:25 -0800 Subject: [PATCH 041/172] mm/filemap: move the iocb checks into filemap_update_page We don't need to give up when a non-blocking request sees a !Uptodate page. We may be able to satisfy the read from a partially-uptodate page. 
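As an illustrative example (assuming a filesystem block size of 1 KiB on a 4 KiB page): if only the first block of a !Uptodate page has been brought uptodate, ->is_partially_uptodate() can let a non-blocking read of just those bytes complete instead of failing with -EAGAIN or waiting for the rest of the page.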
Link: https://lkml.kernel.org/r/20210122160140.223228-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 67c4df82aa41..cd863c4ff9ae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2240,15 +2240,16 @@ static int filemap_update_page(struct kiocb *iocb, struct inode *inode = mapping->host; int error; - if (iocb->ki_flags & IOCB_WAITQ) { - error = lock_page_async(page, iocb->ki_waitq); - if (error) - return error; - } else { - if (!trylock_page(page)) { + if (!trylock_page(page)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) + return -EAGAIN; + if (!(iocb->ki_flags & IOCB_WAITQ)) { put_and_wait_on_page_locked(page, TASK_KILLABLE); return AOP_TRUNCATED_PAGE; } + error = __lock_page_async(page, iocb->ki_waitq); + if (error) + return error; } if (!page->mapping) @@ -2366,14 +2367,9 @@ got_pages: } if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_NOWAIT) || - ((iocb->ki_flags & IOCB_WAITQ) && pvec->nr > 1)) { - put_page(page); - pvec->nr--; - err = -EAGAIN; - goto err; - } - + if ((iocb->ki_flags & IOCB_WAITQ) && + pagevec_count(pvec) > 1) + iocb->ki_flags |= IOCB_NOWAIT; err = filemap_update_page(iocb, mapping, iter, page, pg_pos, pg_count); if (err) { From fce70da3a80fcd0a9c0192dedd6bf86a43845ac9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:28 -0800 Subject: [PATCH 042/172] mm/filemap: add filemap_range_uptodate Move the complicated condition and the calculations out of filemap_update_page() into its own function. [willy@infradead.org: unlock page before dropping its refcount] Link: https://lkml.kernel.org/r/20210201125229.GO308988@casper.infradead.org Link: https://lkml.kernel.org/r/20210122160140.223228-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 65 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index cd863c4ff9ae..1eaafc9402e0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2233,11 +2233,36 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, return error; } +static bool filemap_range_uptodate(struct address_space *mapping, + loff_t pos, struct iov_iter *iter, struct page *page) +{ + int count; + + if (PageUptodate(page)) + return true; + /* pipes can't handle partially uptodate pages */ + if (iov_iter_is_pipe(iter)) + return false; + if (!mapping->a_ops->is_partially_uptodate) + return false; + if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + return false; + + count = iter->count; + if (page_offset(page) > pos) { + count -= page_offset(page) - pos; + pos = 0; + } else { + pos -= page_offset(page); + } + + return mapping->a_ops->is_partially_uptodate(page, pos, count); +} + static int filemap_update_page(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *iter, - struct page *page, loff_t pos, loff_t count) + struct page *page) { - struct inode *inode = mapping->host; int error; if (!trylock_page(page)) { @@ -2254,26 +2279,15 @@ static int filemap_update_page(struct kiocb *iocb, if (!page->mapping) goto truncated; - if 
(PageUptodate(page)) - goto uptodate; - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto readpage; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto readpage; - if (!mapping->a_ops->is_partially_uptodate(page, - pos & (thp_size(page) - 1), count)) - goto readpage; -uptodate: - unlock_page(page); - return 0; -readpage: - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) { - unlock_page(page); - return -EAGAIN; - } + error = 0; + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) + goto unlock; + + error = -EAGAIN; + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) + goto unlock; + error = filemap_read_page(iocb->ki_filp, mapping, page); if (error == AOP_TRUNCATED_PAGE) put_page(page); @@ -2282,6 +2296,9 @@ truncated: unlock_page(page); put_page(page); return AOP_TRUNCATED_PAGE; +unlock: + unlock_page(page); + return error; } static int filemap_create_page(struct file *file, @@ -2351,9 +2368,6 @@ got_pages: { struct page *page = pvec->pages[pvec->nr - 1]; pgoff_t pg_index = page->index; - loff_t pg_pos = max(iocb->ki_pos, - (loff_t) pg_index << PAGE_SHIFT); - loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; if (PageReadahead(page)) { if (iocb->ki_flags & IOCB_NOIO) { @@ -2370,8 +2384,7 @@ got_pages: if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page, - pg_pos, pg_count); + err = filemap_update_page(iocb, mapping, iter, page); if (err) { if (err < 0) put_page(page); From 5963fe031638bb812c49ddf5adcdc783a57430f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:32 -0800 Subject: [PATCH 043/172] mm/filemap: split filemap_readahead out of filemap_get_pages This simplifies the error handling. Link: https://lkml.kernel.org/r/20210122160140.223228-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 1eaafc9402e0..f8181beded38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2330,6 +2330,17 @@ error: return error; } +static int filemap_readahead(struct kiocb *iocb, struct file *file, + struct address_space *mapping, struct page *page, + pgoff_t last_index) +{ + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_async_readahead(mapping, &file->f_ra, file, page, + page->index, last_index - page->index); + return 0; +} + static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct pagevec *pvec) { @@ -2367,17 +2378,15 @@ find_page: got_pages: { struct page *page = pvec->pages[pvec->nr - 1]; - pgoff_t pg_index = page->index; if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { + err = filemap_readahead(iocb, filp, mapping, page, + last_index); + if (err) { put_page(page); pvec->nr--; - err = -EAGAIN; goto err; } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); } if (!PageUptodate(page)) { From 2642fca647257210bf6127297748d472c22702cd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:35 -0800 Subject: [PATCH 044/172] mm/filemap: restructure filemap_get_pages Remove the got_pages label, remove indentation, rename find_page to retry, simplify error handling. 
Link: https://lkml.kernel.org/r/20210122160140.223228-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 71 +++++++++++++++++++++------------------------------- 1 file changed, 28 insertions(+), 43 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index f8181beded38..3311b2af9061 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2349,70 +2349,55 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; + struct page *page; int err = 0; last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); -find_page: +retry: if (fatal_signal_pending(current)) return -EINTR; filemap_get_read_batch(mapping, index, last_index, pvec); - if (pvec->nr) - goto got_pages; - - if (iocb->ki_flags & IOCB_NOIO) - return -EAGAIN; - - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - - filemap_get_read_batch(mapping, index, last_index, pvec); + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_sync_readahead(mapping, ra, filp, index, + last_index - index); + filemap_get_read_batch(mapping, index, last_index, pvec); + } if (!pagevec_count(pvec)) { if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) return -EAGAIN; err = filemap_create_page(filp, mapping, iocb->ki_pos >> PAGE_SHIFT, pvec); if (err == AOP_TRUNCATED_PAGE) - goto find_page; + goto retry; return err; } -got_pages: - { - struct page *page = pvec->pages[pvec->nr - 1]; - if (PageReadahead(page)) { - err = filemap_readahead(iocb, filp, mapping, page, - last_index); - if (err) { - put_page(page); - pvec->nr--; - goto err; - } - } - - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_WAITQ) && - pagevec_count(pvec) > 1) - iocb->ki_flags |= IOCB_NOWAIT; - err = filemap_update_page(iocb, mapping, iter, page); - if (err) { - if (err < 0) - put_page(page); - pvec->nr--; - } - } + page = pvec->pages[pagevec_count(pvec) - 1]; + if (PageReadahead(page)) { + err = filemap_readahead(iocb, filp, mapping, page, last_index); + if (err) + goto err; + } + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + iocb->ki_flags |= IOCB_NOWAIT; + err = filemap_update_page(iocb, mapping, iter, page); + if (err) + goto err; } + return 0; err: - if (likely(pvec->nr)) + if (err < 0) + put_page(page); + if (likely(--pvec->nr)) return 0; if (err == AOP_TRUNCATED_PAGE) - goto find_page; - if (err) - return err; - /* - * No pages and no error means we raced and should retry: - */ - goto find_page; + goto retry; + return err; } /** From aa1ec2f69780c5b9590143162101b6dc3dc1de5f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:02:38 -0800 Subject: [PATCH 045/172] mm/filemap: don't relock the page after calling readpage We don't need to get the page lock again; we just need to wait for the I/O to finish, so use wait_on_page_locked_killable() like the other callers of ->readpage. 
Link: https://lkml.kernel.org/r/20210122160140.223228-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Reviewed-by: Christoph Hellwig Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 3311b2af9061..1358cb061fd6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2214,23 +2214,16 @@ static int filemap_read_page(struct file *file, struct address_space *mapping, error = mapping->a_ops->readpage(file, page); if (error) return error; - if (PageUptodate(page)) - return 0; - error = lock_page_killable(page); + error = wait_on_page_locked_killable(page); if (error) return error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* page truncated */ - error = AOP_TRUNCATED_PAGE; - } else { - shrink_readahead_size_eio(&file->f_ra); - error = -EIO; - } - } - unlock_page(page); - return error; + if (PageUptodate(page)) + return 0; + if (!page->mapping) /* page truncated */ + return AOP_TRUNCATED_PAGE; + shrink_readahead_size_eio(&file->f_ra); + return -EIO; } static bool filemap_range_uptodate(struct address_space *mapping, From 87fa0f3eb267eed966ee194907bc15376c1b758f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 12:02:42 -0800 Subject: [PATCH 046/172] mm/filemap: rename generic_file_buffered_read to filemap_read Rename generic_file_buffered_read to match the naming of filemap_fault. Also update the written parameter to a more descriptive name and improve the kerneldoc comment. Link: https://lkml.kernel.org/r/20210122160140.223228-18-willy@infradead.org Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- include/linux/fs.h | 4 ++-- mm/filemap.c | 35 ++++++++++++++++------------------- 3 files changed, 19 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be9e3900cce8..bf2c51a9607a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); + return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { diff --git a/include/linux/fs.h b/include/linux/fs.h index 418b772d6eca..ec8f3ddf4a6a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3080,8 +3080,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); -extern ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *to, ssize_t already_read); +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, + ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); diff --git a/mm/filemap.c b/mm/filemap.c index 1358cb061fd6..28c290da22c1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2394,23 +2394,20 @@ err: } /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read - * @iter: data destination - * @written: already 
copied + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and readpage address_space operations to fetch it. * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. - * - * Return: - * * total number of bytes copied, including those the were already @written - * * negative error code if nothing was copied + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. */ -ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; @@ -2437,7 +2434,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ - if ((iocb->ki_flags & IOCB_WAITQ) && written) + if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; error = filemap_get_pages(iocb, iter, &pvec); @@ -2497,7 +2494,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, copied = copy_page_to_iter(page, offset, bytes, iter); - written += copied; + already_read += copied; iocb->ki_pos += copied; ra->prev_pos = iocb->ki_pos; @@ -2514,9 +2511,9 @@ put_pages: file_accessed(filp); - return written ? written : error; + return already_read ? already_read : error; } -EXPORT_SYMBOL_GPL(generic_file_buffered_read); +EXPORT_SYMBOL_GPL(filemap_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2591,7 +2588,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - retval = generic_file_buffered_read(iocb, iter, retval); + retval = filemap_read(iocb, iter, retval); out: return retval; } From 826ea860bc4d119731026655c383c7773c9d2dad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 24 Feb 2021 12:02:45 -0800 Subject: [PATCH 047/172] mm/filemap: simplify generic_file_read_iter Avoid the pointless goto out just for returning retval. 
Link: https://lkml.kernel.org/r/20210122160140.223228-19-willy@infradead.org Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Kent Overstreet Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 28c290da22c1..820d81901eae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2543,7 +2543,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; if (!count) - goto out; /* skip atime */ + return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; @@ -2561,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (retval < 0) - goto out; + return retval; } file_accessed(file); @@ -2585,12 +2585,10 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) */ if (retval < 0 || !count || iocb->ki_pos >= size || IS_DAX(inode)) - goto out; + return retval; } - retval = filemap_read(iocb, iter, retval); -out: - return retval; + return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); From 4ebd3aec3842662300979dacd6fb38e3e8edf7f4 Mon Sep 17 00:00:00 2001 From: Yang Guo Date: Wed, 24 Feb 2021 12:02:48 -0800 Subject: [PATCH 048/172] fs/buffer.c: add checking buffer head stat before clear clear_buffer_new() is used to clear a buffer head's "new" state. When PAGE_SIZE is 64K, most buffer heads in the list do not need to be cleared. clear_buffer_new() performs an expensive atomic modification operation, so let's check the buffer head state before clearing it, as __block_write_begin_int() does; this is good for performance. Link: https://lkml.kernel.org/r/1612332890-57918-1-git-send-email-zhangshaokun@hisilicon.com Signed-off-by: Yang Guo Signed-off-by: Shaokun Zhang Cc: Alexander Viro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 32647d2011df..f1c3a5b27a90 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, struct page *page, set_buffer_uptodate(bh); mark_buffer_dirty(bh); } - clear_buffer_new(bh); + if (buffer_new(bh)) + clear_buffer_new(bh); block_start = block_end; bh = bh->b_this_page; From 6986c3e2b19505e9b2112fc2e548e9f99fa3021f Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Feb 2021 12:02:52 -0800 Subject: [PATCH 049/172] mm: backing-dev: Remove duplicated macro definition Move the K() macro earlier in the file so that the duplicated definition can be removed.
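For reference, K() converts a page count to KiB: K(pages) == pages << (PAGE_SHIFT - 10). With 4 KiB pages (PAGE_SHIFT == 12), for example, K(x) == x << 2, i.e. x * 4.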
Link: https://lkml.kernel.org/r/d1ccdf2d3116dce9814f2bcc1f0415ecb4c76ea5.1612862230.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e33797579338..eca555f658d9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -32,6 +32,8 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; +#define K(x) ((x) << (PAGE_SHIFT - 10)) + #ifdef CONFIG_DEBUG_FS #include #include @@ -69,7 +71,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) global_dirty_limits(&background_thresh, &dirty_thresh); wb_thresh = wb_calc_thresh(wb, dirty_thresh); -#define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" @@ -98,7 +99,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_more_io, nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); -#undef K return 0; } @@ -146,8 +146,6 @@ static ssize_t read_ahead_kb_store(struct device *dev, return count; } -#define K(pages) ((pages) << (PAGE_SHIFT - 10)) - #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ From 191a7221b70d7fa7005404f508e1802f6556ba78 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 24 Feb 2021 12:02:55 -0800 Subject: [PATCH 050/172] mm/swap_slots.c: remove redundant NULL check Fix the warning below, reported by coccicheck: mm/swap_slots.c:197:3-9: WARNING: NULL check before some freeing functions is not needed. Link: https://lkml.kernel.org/r/1611214229-11225-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe70645..be9de6d5b516 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -193,8 +193,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, cache->slots_ret = NULL; } spin_unlock_irq(&cache->free_lock); - if (slots) - kvfree(slots); + kvfree(slots); } } From cf532faa41c55ad39fcff211132c58b0acf35c62 Mon Sep 17 00:00:00 2001 From: Stephen Zhang Date: Wed, 24 Feb 2021 12:02:58 -0800 Subject: [PATCH 051/172] mm/swapfile.c: fix debugging information problem If the function is ever renamed, it is easy to forget to update the function name that is hard-coded in these error messages, so print __func__ instead. 
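As an illustration, __func__ is the standard C predefined identifier that expands to the name of the enclosing function, so a line such as pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); (taken from the diff below) prints the correct function name even if __swap_info_get() is later renamed.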
Link: https://lkml.kernel.org/r/1611369120-2276-1-git-send-email-stephenzhangzsd@gmail.com Signed-off-by: Stephen Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 96799a2f6957..f039745989d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1157,13 +1157,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) return p; bad_offset: - pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); + pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: - pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); + pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: - pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } @@ -1180,7 +1180,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) return p; bad_free: - pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); out: return NULL; } From 25eaab438dd58092c5f0c62118d933bf8b2fcc76 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 24 Feb 2021 12:03:01 -0800 Subject: [PATCH 052/172] mm/page_io: use pr_alert_ratelimited for swap read/write errors If there are errors during swap read or write, they can easily fill the log buffer and remove any previous messages that might be useful for debugging, especially on systems that rely only on the kernel ring buffer for logging. For example, on a system using zram as swap, we are more likely to see any page allocation errors preceding the swap write errors if the alerts are ratelimited. Link: https://lkml.kernel.org/r/20210201142055.29068-1-georgi.djakov@linaro.org Signed-off-by: Georgi Djakov Acked-by: Minchan Kim Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 92f7941c6d01..485fa5cca4a2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -41,9 +41,9 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - pr_alert("Write-error on swap-device (%u:%u:%llu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@ -106,9 +106,9 @@ static void end_swap_bio_read(struct bio *bio) if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); - pr_alert("Read-error on swap-device (%u:%u:%llu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); goto out; } From e48333b660d57898ad6240570084ffa734f64368 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Wed, 24 Feb 2021 12:03:05 -0800 Subject: [PATCH 053/172] mm/swap_state: constify static struct attribute_group The only usage of swap_attr_group is to pass its address to sysfs_create_group() which takes a pointer to const attribute_group. 
Make it const to allow the compiler to put it in read-only memory. Link: https://lkml.kernel.org/r/20210201233254.91809-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Reviewed-by: Amy Parker Acked-by: "Huang, Ying" Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 751c1ef2fe0e..3507c82b2ef1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -927,7 +927,7 @@ static struct attribute *swap_attrs[] = { NULL, }; -static struct attribute_group swap_attr_group = { +static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; From cad8320b4b395702e49578580c70026c8271ea88 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:03:08 -0800 Subject: [PATCH 054/172] mm/swap: don't SetPageWorkingset unconditionally during swapin We are capable of SetPageWorkingset based on refault distances after commit aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU"). This is done by workingset_refault(), which is right above the unconditional SetPageWorkingset deleted by this patch. The unconditional SetPageWorkingset miscategorizes pages that are read ahead or never belonged to the working set (e.g., tmpfs pages accessed only once by fd). When those pages are swapped in (after they were swapped out) for the first time, they skew PSI (when using async swap). When this happens again, depending on their refault distances, they might skew workingset_restore_anon counter in addition to PSI because their shadows indicate they were part of the working set. Historically, SetPageWorkingset was added as part of the PSI series, and Johannes said: "It was meant to mark incoming pages under IO with SetPageWorkingset when waiting for them constituted a memory stall. On the page cache side, because we HAVE workingset detection, this was specific to recently evicted pages that had been active in their previous life. On the anon side, the aging algorithm had no distinction between workingset and sporadically used pages. Given the choice between a) no swapin stalls are pressure and b) all swapin stalls are pressure, I went with the latter in order to detect swap storms. The false positive case - high rate of swapin without severe memory pressure - was relatively unlikely, because we tried to avoid swapping until everything was completely on fire in the first place." 
Link: https://lkml.kernel.org/r/20201209012400.1771150-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20201214231253.62313-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Joonsoo Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 3507c82b2ef1..807b00eae969 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -537,7 +537,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, workingset_refault(page, shadow); /* Caller will initiate read into locked page */ - SetPageWorkingset(page); lru_cache_add(page); *new_page_allocated = true; return page; From 2e9bd483159939ed2c0704b914294653c8341d25 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 24 Feb 2021 12:03:11 -0800 Subject: [PATCH 055/172] mm: memcg/slab: pre-allocate obj_cgroups for slab caches with SLAB_ACCOUNT In general it's unknown in advance if a slab page will contain accounted objects or not. In order to avoid memory waste, an obj_cgroup vector is allocated dynamically when a need to account a new object arises. Such an approach is memory efficient, but requires an expensive cmpxchg() to set up the memcg/objcgs pointer, because an allocation can race with a different allocation on another cpu. But in some common cases it's known for sure that a slab page will contain accounted objects: if the page belongs to a slab cache with a SLAB_ACCOUNT flag set. It includes such popular objects as vm_area_struct, anon_vma, task_struct, etc. In such cases we can pre-allocate the objcgs vector and simply assign it to the page without any atomic operations, because at this early stage the page is not visible to anyone else. A very simplistic benchmark (allocating 10000000 64-byte objects in a row) shows a ~15% win. In real life it seems that most workloads are not very sensitive to the speed of (accounted) slab allocations. [guro@fb.com: open-code set_page_objcgs() and add some comments, by Johannes] Link: https://lkml.kernel.org/r/20201113001926.GA2934489@carbon.dhcp.thefacebook.com [akpm@linux-foundation.org: fix it for mm-slub-call-account_slab_page-after-slab-page-initialization-fix.patch] Link: https://lkml.kernel.org/r/20201110195753.530157-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Cc: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 19 ------------------- mm/memcontrol.c | 23 +++++++++++++++++++---- mm/slab.c | 2 +- mm/slab.h | 14 ++++++++++---- mm/slub.c | 2 +- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eeb0b52203e9..7a4dd1cb19fe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -475,19 +475,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); } -/* - * set_page_objcgs - associate a page with a object cgroups vector - * @page: a pointer to the page struct - * @objcgs: a pointer to the object cgroups vector - * - * Atomically associates a page with a vector of object cgroups. 
- */ -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return !cmpxchg(&page->memcg_data, 0, (unsigned long)objcgs | - MEMCG_DATA_OBJCGS); -} #else static inline struct obj_cgroup **page_objcgs(struct page *page) { @@ -498,12 +485,6 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page) { return NULL; } - -static inline bool set_page_objcgs(struct page *page, - struct obj_cgroup **objcgs) -{ - return true; -} #endif static __always_inline bool memcg_stat_item_in_bytes(int idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b9bd354e97e..60ce452e42e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2935,9 +2935,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp) + gfp_t gfp, bool new_page) { unsigned int objects = objs_per_slab_page(s, page); + unsigned long memcg_data; void *vec; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, @@ -2945,11 +2946,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (!set_page_objcgs(page, vec)) + memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; + if (new_page) { + /* + * If the slab page is brand new and nobody can yet access + * it's memcg_data, no synchronization is required and + * memcg_data can be simply assigned. + */ + page->memcg_data = memcg_data; + } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + /* + * If the slab page is already in use, somebody can allocate + * and assign obj_cgroups in parallel. In this case the existing + * objcg vector should be reused. + */ kfree(vec); - else - kmemleak_not_leak(vec); + return 0; + } + kmemleak_not_leak(vec); return 0; } diff --git a/mm/slab.c b/mm/slab.c index cdad64b2836d..9c9e0c255c4b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - account_slab_page(page, cachep->gfporder, cachep); + account_slab_page(page, cachep->gfporder, cachep, flags); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) diff --git a/mm/slab.h b/mm/slab.h index 48dc466a3791..076582f58f68 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -238,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp); + gfp_t gfp, bool new_page); static inline void memcg_free_page_obj_cgroups(struct page *page) { @@ -315,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, page = virt_to_head_page(p[i]); if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags)) { + memcg_alloc_page_obj_cgroups(page, s, flags, + false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } @@ -379,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) } static inline int memcg_alloc_page_obj_cgroups(struct page *page, - struct kmem_cache *s, gfp_t gfp) + struct kmem_cache *s, gfp_t gfp, + bool new_page) { return 0; } @@ -420,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) } static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s) + struct kmem_cache *s, + gfp_t gfp) { + if (memcg_kmem_enabled() && (s->flags & 
SLAB_ACCOUNT)) + memcg_alloc_page_obj_cgroups(page, s, gfp, true); + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), PAGE_SIZE << order); } diff --git a/mm/slub.c b/mm/slub.c index e0fef8f8d4bf..ca566361561e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1785,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - account_slab_page(page, oo_order(oo), s); + account_slab_page(page, oo_order(oo), s, flags); page->slab_cache = s; __SetPageSlab(page); From f3344adf38bdb3107d40483dd9501215ad40edce Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:15 -0800 Subject: [PATCH 056/172] mm: memcontrol: optimize per-lruvec stats counter memory usage The vmstat threshold is 32 (MEMCG_CHARGE_BATCH); in fact the threshold can be as big as MEMCG_CHARGE_BATCH * PAGE_SIZE, which still fits into an s32. So introduce struct batched_lruvec_stat to optimize memory usage. The size of struct lruvec_stat is 304 bytes on 64-bit systems, and it is a per-cpu structure. So with this patch we can save 304 / 2 * ncpu bytes per memcg per node, where ncpu is the number of possible CPUs. If there are c memory cgroups (including dying ones) and n NUMA nodes in the system, we can save (152 * ncpu * c * n) bytes in total. (For example, with ncpu = 96, c = 1000 and n = 2, that is 152 * 96 * 1000 * 2 bytes, roughly 28 MB.) [akpm@linux-foundation.org: fix typo in comment] Link: https://lkml.kernel.org/r/20201210042121.39665-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Shakeel Butt Cc: Roman Gushchin Cc: Stephen Rothwell Cc: Chris Down Cc: Yafang Shao Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 14 ++++++++++++-- mm/memcontrol.c | 10 +++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7a4dd1cb19fe..41bbf71edd9f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -92,6 +92,10 @@ struct lruvec_stat { long count[NR_VM_NODE_STAT_ITEMS]; }; +struct batched_lruvec_stat { + s32 count[NR_VM_NODE_STAT_ITEMS]; +}; + /* * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. @@ -107,11 +111,17 @@ struct memcg_shrinker_map { struct mem_cgroup_per_node { struct lruvec lruvec; - /* Legacy local VM stats */ + /* + * Legacy local VM stats. This should be struct lruvec_stat and + * cannot be optimized to struct batched_lruvec_stat. Because + * the threshold of the lruvec_stat_cpu can be as big as + * MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this + * field has no upper limit. 
+ */ struct lruvec_stat __percpu *lruvec_stat_local; /* Subtree VM stats (batched updates) */ - struct lruvec_stat __percpu *lruvec_stat_cpu; + struct batched_lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 60ce452e42e6..b259e7d8ce41 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5208,7 +5208,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return 1; } - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, + pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_cpu) { free_percpu(pn->lruvec_stat_local); @@ -7093,6 +7093,14 @@ static int __init mem_cgroup_init(void) { int cpu, node; + /* + * Currently s32 type (can refer to struct batched_lruvec_stat) is + * used for per-memcg-per-cpu caching of per-node statistics. In order + * to work fine, we should make sure that the overfill threshold can't + * exceed S32_MAX / PAGE_SIZE. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); From b0ba3bff3e7bb6b58bb248bdd2f3d8ad52fd10c3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:19 -0800 Subject: [PATCH 057/172] mm: memcontrol: fix NR_ANON_THPS accounting in charge moving Patch series "Convert all THP vmstat counters to pages", v6. This patch series aims to convert all THP vmstat counters to pages. The units of the vmstat counters vary: some are pages, some are bytes, some are HPAGE_PMD_NR, and some are KiB. When we want to expose these vmstat counters to userspace, we have to know which unit each counter uses. When the unit is bytes or kB, both are clearly distinguishable by the B/KB suffix. But for the THP vmstat counters, we may make mistakes. For example, below are some bug fixes for the THP vmstat counters: - 7de2e9f195b9 ("mm: memcontrol: correct the NR_ANON_THPS counter of hierarchical memcg") - The first commit in this series ("fix NR_ANON_THPS accounting in charge moving") This patch series makes the code clearer and converts all the THP vmstat counters to pages. Afterwards, the units of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB; the rest, without a suffix, are in pages. In this series, I changed the unit of the following vmstat counters from HPAGE_PMD_NR to pages. However, there is no change to the print format of the output to user space. - NR_ANON_THPS - NR_FILE_THPS - NR_SHMEM_THPS - NR_SHMEM_PMDMAPPED - NR_FILE_PMDMAPPED Doing this also makes the statistics more accurate for the THP vmstat counters. This series is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"), because we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can be GBs of memory. For example, on a 96-CPU system the threshold is the maximum value of 125, and the per-cpu counters can cache 23.4375 GB in total. A THP page is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the batching seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to atomic global updates, 
this makes the statistics more accurate for the THP vmstat counters. From this point of view, I think doing this conversion is reasonable. Thanks to Hugh for mentioning this. This was inspired by Johannes and Roman. Thanks to them. This patch (of 7): The unit of NR_ANON_THPS is HPAGE_PMD_NR already. So it should inc/dec by one rather than nr_pages. Link: https://lkml.kernel.org/r/20201228164110.2838-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20201228164110.2838-2-songmuchun@bytedance.com Fixes: 468c398233da ("mm: memcontrol: switch to native NR_ANON_THPS counter") Signed-off-by: Muchun Song Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Pankaj Gupta Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Joonsoo Kim Cc: NeilBrown Cc: Rafael J. Wysocki Cc: Randy Dunlap Cc: Sami Tolvanen Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b259e7d8ce41..e31e47e7bab2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5652,10 +5652,8 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); if (PageTransHuge(page)) { - __mod_lruvec_state(from_vec, NR_ANON_THPS, - -nr_pages); - __mod_lruvec_state(to_vec, NR_ANON_THPS, - nr_pages); + __dec_lruvec_state(from_vec, NR_ANON_THPS); + __inc_lruvec_state(to_vec, NR_ANON_THPS); } } From 69473e5de87389be6c0fa4a5d574a50c8f904fb3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:23 -0800 Subject: [PATCH 058/172] mm: memcontrol: convert NR_ANON_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can be GBs of memory. For example, on a 96-CPU system the threshold is the maximum value of 125, and the per-cpu counters can cache 23.4375 GB in total. A THP page is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the batching seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to atomic global updates, it makes the statistics more accurate for the THP vmstat counters. So we convert the NR_ANON_THPS accounting to pages. This patch is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also makes the units of the vmstat counters more uniform. Afterwards, the units of the vmstat counters are pages, kB and bytes. The B/KB suffix can tell us that the unit is bytes or kB; the rest, without a suffix, are in pages. Link: https://lkml.kernel.org/r/20201228164110.2838-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki Cc: Alexey Dobriyan Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Hugh Dickins Cc: Shakeel Butt Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Feng Tang Cc: NeilBrown Cc: Joonsoo Kim Cc: Randy Dunlap Cc: Michal Hocko Cc: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 15 +++++++++------ fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 13 +++++++++++++ mm/huge_memory.c | 3 ++- mm/memcontrol.c | 20 ++++++-------------- mm/page_alloc.c | 2 +- mm/rmap.c | 6 +++--- mm/vmstat.c | 11 +++++++++-- 8 files changed, 44 insertions(+), 28 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 04f71c7bc3f8..6da0c3508bc9 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -461,8 +461,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(sunreclaimable) #ifdef CONFIG_TRANSPARENT_HUGEPAGE , - nid, K(node_page_state(pgdat, NR_ANON_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_ANON_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * @@ -519,10 +518,14 @@ static ssize_t node_read_vmstat(struct device *dev, sum_zone_numa_state(nid, i)); #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) - len += sysfs_emit_at(buf, len, "%s %lu\n", - node_stat_name(i), - node_page_state_pages(pgdat, i)); + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; + len += sysfs_emit_at(buf, len, "%s %lu\n", node_stat_name(i), + pages); + } return len; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d6fc74619625..a635c8a84ddf 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -129,7 +129,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", - global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b593316bff3d..67d50ef5dd20 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -209,6 +209,19 @@ enum node_stat_item { NR_VM_NODE_STAT_ITEMS }; +/* + * Returns true if the item should be printed in THPs (/proc/vmstat + * currently prints number of anon, file and shmem THPs. But the item + * is charged in pages). + */ +static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) +{ + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return false; + + return item == NR_ANON_THPS; +} + /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f655137536eb..3691e863070a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2176,7 +2176,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. 
*/ - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, + -HPAGE_PMD_NR); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e31e47e7bab2..b2405f049006 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1533,7 +1533,7 @@ static struct memory_stat memory_stats[] = { * on some architectures, the macro of HPAGE_PMD_SIZE is not * constant(e.g. powerpc). */ - { "anon_thp", 0, NR_ANON_THPS }, + { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, { "file_thp", 0, NR_FILE_THPS }, { "shmem_thp", 0, NR_SHMEM_THPS }, #endif @@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_ANON_THPS || - memory_stats[i].idx == NR_FILE_THPS || + if (memory_stats[i].idx == NR_FILE_THPS || memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif @@ -4087,10 +4086,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } @@ -4121,10 +4116,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * PAGE_SIZE); } @@ -5652,10 +5643,11 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); if (PageTransHuge(page)) { - __dec_lruvec_state(from_vec, NR_ANON_THPS); - __inc_lruvec_state(to_vec, NR_ANON_THPS); + __mod_lruvec_state(from_vec, NR_ANON_THPS, + -nr_pages); + __mod_lruvec_state(to_vec, NR_ANON_THPS, + nr_pages); } - } } else { __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef5070fed76b..2e2e47f8714b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5587,7 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), - K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), diff --git a/mm/rmap.c b/mm/rmap.c index 08c56aaf72eb..c4d5c63cfd29 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1144,7 +1144,7 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1186,7 +1186,7 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1292,7 +1292,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); if (TestClearPageDoubleMap(page)) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index f8942160fc95..07dc0af50cf0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1619,8 +1619,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; seq_printf(m, "\n %-12s %lu", node_stat_name(i), - node_page_state_pages(pgdat, i)); + pages); } } seq_printf(m, @@ -1740,8 +1744,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_NUMA_STAT_ITEMS; #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { v[i] = global_node_page_state_pages(i); + if (vmstat_item_print_in_thp(i)) + v[i] /= HPAGE_PMD_NR; + } v += NR_VM_NODE_STAT_ITEMS; global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, From bf9ecead53c89d3d2cf60acbc460174ebbcf0027 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:27 -0800 Subject: [PATCH 059/172] mm: memcontrol: convert NR_FILE_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can amount to GBs of memory: on a 96-CPU system, for example, the per-cpu sync threshold reaches its maximum of 125, and the per-cpu counters can then cache up to 23.4375 GB in total. A THP is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the per-cpu batching for it seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to an atomic global update, it makes the THP vmstat counters more accurate. So convert the NR_FILE_THPS accounting to pages. This is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also makes the units of the vmstat counters more uniform: after this series they are expressed in pages, kB or bytes, where a B/KB suffix indicates bytes or kB and counters without a suffix are in pages. Link: https://lkml.kernel.org/r/20201228164110.2838-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J.
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/filemap.c | 2 +- mm/huge_memory.c | 5 ++++- mm/khugepaged.c | 4 +++- mm/memcontrol.c | 5 ++--- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 6da0c3508bc9..d5952f754911 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -466,8 +466,7 @@ static ssize_t node_read_meminfo(struct device *dev, HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), - nid, K(node_page_state(pgdat, NR_FILE_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_FILE_THPS)), nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) #endif diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a635c8a84ddf..7ea4679880c8 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -135,7 +135,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); show_val_kb(m, "FileHugePages: ", - global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67d50ef5dd20..b751a9898bb6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -219,7 +219,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return false; - return item == NR_ANON_THPS; + return item == NR_ANON_THPS || + item == NR_FILE_THPS; } /* diff --git a/mm/filemap.c b/mm/filemap.c index 820d81901eae..9efe059e2b58 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -208,7 +208,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageTransHuge(page)) __dec_lruvec_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { - __dec_lruvec_page_state(page, NR_FILE_THPS); + __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3691e863070a..77181faa2340 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2752,10 +2752,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { + int nr = thp_nr_pages(head); + if (PageSwapBacked(head)) __dec_lruvec_page_state(head, NR_SHMEM_THPS); else - __dec_lruvec_page_state(head, NR_FILE_THPS); + __mod_lruvec_page_state(head, NR_FILE_THPS, + -nr); } __split_huge_page(page, list, end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index fb0fdaec34d5..c427fe2ca7ff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1643,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm, XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); + int nr; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1854,11 +1855,12 @@ out_unlock: put_page(page); goto xa_unlocked; } + nr = thp_nr_pages(new_page); if (is_shmem) __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); else { - __inc_lruvec_page_state(new_page, NR_FILE_THPS); + __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); 
filemap_nr_thps_inc(mapping); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2405f049006..fc552f57d3fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1534,7 +1534,7 @@ static struct memory_stat memory_stats[] = { * constant(e.g. powerpc). */ { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, - { "file_thp", 0, NR_FILE_THPS }, + { "file_thp", PAGE_SIZE, NR_FILE_THPS }, { "shmem_thp", 0, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, @@ -1566,8 +1566,7 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_FILE_THPS || - memory_stats[i].idx == NR_SHMEM_THPS) + if (memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif VM_BUG_ON(!memory_stats[i].ratio); From 57b2847d3c1dc154923578efb47a12302a57d700 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:31 -0800 Subject: [PATCH 060/172] mm: memcontrol: convert NR_SHMEM_THPS account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can amount to GBs of memory: on a 96-CPU system, for example, the per-cpu sync threshold reaches its maximum of 125, and the per-cpu counters can then cache up to 23.4375 GB in total. A THP is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the per-cpu batching for it seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to an atomic global update, it makes the THP vmstat counters more accurate. So convert the NR_SHMEM_THPS accounting to pages. This is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also makes the units of the vmstat counters more uniform: after this series they are expressed in pages, kB or bytes, where a B/KB suffix indicates bytes or kB and counters without a suffix are in pages. Link: https://lkml.kernel.org/r/20201228164110.2838-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J.
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/filemap.c | 2 +- mm/huge_memory.c | 3 ++- mm/khugepaged.c | 2 +- mm/memcontrol.c | 26 ++------------------------ mm/page_alloc.c | 2 +- mm/shmem.c | 2 +- 9 files changed, 12 insertions(+), 33 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index d5952f754911..6d5ac6ffb6e1 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -462,8 +462,7 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_TRANSPARENT_HUGEPAGE , nid, K(node_page_state(pgdat, NR_ANON_THPS)), - nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), nid, K(node_page_state(pgdat, NR_FILE_THPS)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7ea4679880c8..cfb107eaa3e6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -131,7 +131,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", - global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); show_val_kb(m, "FileHugePages: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b751a9898bb6..788837f40b38 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,7 +220,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return false; return item == NR_ANON_THPS || - item == NR_FILE_THPS; + item == NR_FILE_THPS || + item == NR_SHMEM_THPS; } /* diff --git a/mm/filemap.c b/mm/filemap.c index 9efe059e2b58..46a8b9e82434 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -206,7 +206,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); } else if (PageTransHuge(page)) { __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 77181faa2340..86a3015ba54f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2755,7 +2755,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) int nr = thp_nr_pages(head); if (PageSwapBacked(head)) - __dec_lruvec_page_state(head, NR_SHMEM_THPS); + __mod_lruvec_page_state(head, NR_SHMEM_THPS, + -nr); else __mod_lruvec_page_state(head, NR_FILE_THPS, -nr); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c427fe2ca7ff..75e246f680f4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1858,7 +1858,7 @@ out_unlock: nr = thp_nr_pages(new_page); if (is_shmem) - __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); + __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); else { __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc552f57d3fb..d3a0c59210e7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1516,7 +1516,7 @@ struct memory_stat { unsigned int idx; }; -static struct memory_stat memory_stats[] = { +static const struct 
memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, @@ -1528,14 +1528,9 @@ static struct memory_stat memory_stats[] = { { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * The ratio will be initialized in memory_stats_init(). Because - * on some architectures, the macro of HPAGE_PMD_SIZE is not - * constant(e.g. powerpc). - */ { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, { "file_thp", PAGE_SIZE, NR_FILE_THPS }, - { "shmem_thp", 0, NR_SHMEM_THPS }, + { "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, @@ -1560,23 +1555,6 @@ static struct memory_stat memory_stats[] = { { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, }; -static int __init memory_stats_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_SHMEM_THPS) - memory_stats[i].ratio = HPAGE_PMD_SIZE; -#endif - VM_BUG_ON(!memory_stats[i].ratio); - VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); - } - - return 0; -} -pure_initcall(memory_stats_init); - static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e2e47f8714b..df292d8e659b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5584,7 +5584,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_THPS)), K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR), K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/mm/shmem.c b/mm/shmem.c index 7924b3bf46fb..ff741d229701 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -713,7 +713,7 @@ next: } if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); - __inc_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); } mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); From a1528e21f8915e16252cda1137fe29672c918361 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:35 -0800 Subject: [PATCH 061/172] mm: memcontrol: convert NR_SHMEM_PMDMAPPED account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can amount to GBs of memory: on a 96-CPU system, for example, the per-cpu sync threshold reaches its maximum of 125, and the per-cpu counters can then cache up to 23.4375 GB in total. A THP is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the per-cpu batching for it seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to an atomic global update, it makes the THP vmstat counters more accurate. So convert the NR_SHMEM_PMDMAPPED accounting to pages. This is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also makes the units of the vmstat counters more uniform: after this series they are expressed in pages, kB or bytes, where a B/KB suffix indicates bytes or kB and counters without a suffix are in pages.
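To put concrete numbers on the batching argument above, here is a minimal userspace sketch of the worst-case error a per-cpu vmstat cache can hide, before and after the conversion; the CPU count, the 125 threshold and the 4 kB page size are assumptions taken from this series:

#include <stdio.h>

int main(void)
{
        const double nr_cpus = 96, threshold = 125;
        const double page_size = 4096, hpage_pmd_nr = 512;

        /* before: each cached count is one THP, i.e. 2 MB of memory */
        printf("max error in THP units:  %.4f GB\n",
               nr_cpus * threshold * hpage_pmd_nr * page_size / (1 << 30));
        /* after: each cached count is a single 4 kB page */
        printf("max error in page units: %.3f MB\n",
               nr_cpus * threshold * page_size / (1 << 20));
        return 0;
}

The first figure reproduces the 23.4375 GB quoted above; after the conversion, a THP update of 512 pages always exceeds the 125-page threshold and hits the global counter immediately.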
Link: https://lkml.kernel.org/r/20201228164110.2838-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 3 +-- mm/rmap.c | 14 ++++++++++---- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 6d5ac6ffb6e1..7a66aefe4e46 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -463,8 +463,7 @@ static ssize_t node_read_meminfo(struct device *dev, , nid, K(node_page_state(pgdat, NR_ANON_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), - nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * - HPAGE_PMD_NR), + nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), nid, K(node_page_state(pgdat, NR_FILE_THPS)), nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * HPAGE_PMD_NR) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index cfb107eaa3e6..c61f440570f9 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -133,7 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", - global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 788837f40b38..7bdbfeeb5c8c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -221,7 +221,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return item == NR_ANON_THPS || item == NR_FILE_THPS || - item == NR_SHMEM_THPS; + item == NR_SHMEM_THPS || + item == NR_SHMEM_PMDMAPPED; } /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df292d8e659b..069561aadc7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5585,8 +5585,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE K(node_page_state(pgdat, NR_SHMEM_THPS)), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) - * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/mm/rmap.c b/mm/rmap.c index c4d5c63cfd29..1c1b576c0627 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1211,14 +1211,17 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; if (PageSwapBacked(page)) - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + nr_pages); else __inc_node_page_state(page, NR_FILE_PMDMAPPED); } else { @@ -1252,14 +1255,17 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone 
else? */ if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; if (PageSwapBacked(page)) - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + -nr_pages); else __dec_node_page_state(page, NR_FILE_PMDMAPPED); } else { From 380780e71895ae301505ffcec8f954ab3666a4c7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:39 -0800 Subject: [PATCH 062/172] mm: memcontrol: convert NR_FILE_PMDMAPPED account to pages Currently we use struct per_cpu_nodestat to cache the vmstat counters, which leads to inaccurate statistics, especially for the THP vmstat counters. On systems with hundreds of processors this cache can amount to GBs of memory: on a 96-CPU system, for example, the per-cpu sync threshold reaches its maximum of 125, and the per-cpu counters can then cache up to 23.4375 GB in total. A THP is already a form of batched addition (it adds 512 pages' worth of memory in one go), so skipping the per-cpu batching for it seems sensible. Although every THP stats update then overflows the per-cpu counter, resorting to an atomic global update, it makes the THP vmstat counters more accurate. So convert the NR_FILE_PMDMAPPED accounting to pages. This is consistent with 8f182270dfec ("mm/swap.c: flush lru pvecs on compound page arrival"). Doing this also makes the units of the vmstat counters more uniform: after this series they are expressed in pages, kB or bytes, where a B/KB suffix indicates bytes or kB and counters without a suffix are in pages. Link: https://lkml.kernel.org/r/20201228164110.2838-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J.
Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 3 +-- fs/proc/meminfo.c | 2 +- include/linux/mmzone.h | 3 ++- mm/rmap.c | 6 ++++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 7a66aefe4e46..d02d86aec19f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -465,8 +465,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(pgdat, NR_SHMEM_THPS)), nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), nid, K(node_page_state(pgdat, NR_FILE_THPS)), - nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * - HPAGE_PMD_NR) + nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED)) #endif ); len += hugetlb_report_node_meminfo(buf, len, nid); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c61f440570f9..6fa761c9cc78 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -137,7 +137,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", - global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7bdbfeeb5c8c..66d68e5d5a0f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -222,7 +222,8 @@ static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item) return item == NR_ANON_THPS || item == NR_FILE_THPS || item == NR_SHMEM_THPS || - item == NR_SHMEM_PMDMAPPED; + item == NR_SHMEM_PMDMAPPED || + item == NR_FILE_PMDMAPPED; } /* diff --git a/mm/rmap.c b/mm/rmap.c index 1c1b576c0627..5ebf16fae4b9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1223,7 +1223,8 @@ void page_add_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, nr_pages); else - __inc_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1267,7 +1268,8 @@ static void page_remove_file_rmap(struct page *page, bool compound) __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, -nr_pages); else - __dec_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + -nr_pages); } else { if (!atomic_add_negative(-1, &page->_mapcount)) return; From fff66b79a19c9b3f2aa02b0a32fe598977c89eea Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:03:43 -0800 Subject: [PATCH 063/172] mm: memcontrol: make the slab calculation consistent Although the slab ratio is one, we should still read the ratio from the related memory_stats entry instead of hard-coding it. Also, the local variable 'size' already holds the slab_unreclaimable value, so there is no need to read it again. To do this we need some code like below: if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { - size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + VM_BUG_ON(i < 1); + VM_BUG_ON(memory_stats[i - 1].idx != NR_SLAB_RECLAIMABLE_B); + size += memcg_page_state(memcg, memory_stats[i - 1].idx) * + memory_stats[i - 1].ratio; It requires a series of VM_BUG_ONs or comments to ensure these two items are actually adjacent and in the right order.
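The fragility is easy to reproduce in miniature; in this self-contained sketch (illustrative names, not the kernel's), the slab total is only correct while the two entries stay adjacent and in that order:

#include <assert.h>
#include <stdio.h>

struct stat_desc {
        const char *name;
        long ratio;
};

/* load-bearing order: reclaimable must sit directly before unreclaimable */
static const struct stat_desc stats[] = {
        { "slab_reclaimable", 1 },
        { "slab_unreclaimable", 1 },
};

int main(void)
{
        long counts[] = { 300, 200 };   /* bytes, same order as stats[] */
        int i = 1;                      /* index of slab_unreclaimable */

        assert(i >= 1);                 /* the VM_BUG_ON-style guards */
        printf("slab %ld\n", counts[i] * stats[i].ratio +
                             counts[i - 1] * stats[i - 1].ratio);
        return 0;
}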
So it would probably be easier to implement this using a wrapper that has a big switch() for unit conversion. More details of this discussion are available at: https://lore.kernel.org/patchwork/patch/1348611/ This would fix the ratio inconsistency and get rid of the order guarantee. Link: https://lkml.kernel.org/r/20201228164110.2838-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Alexey Dobriyan Cc: Feng Tang Cc: Greg Kroah-Hartman Cc: Hugh Dickins Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Michal Hocko Cc: NeilBrown Cc: Pankaj Gupta Cc: Rafael. J. Wysocki Cc: Randy Dunlap Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Shakeel Butt Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 105 ++++++++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d3a0c59210e7..f27e862a6845 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1512,49 +1512,71 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) struct memory_stat { const char *name; - unsigned int ratio; unsigned int idx; }; static const struct memory_stat memory_stats[] = { - { "anon", PAGE_SIZE, NR_ANON_MAPPED }, - { "file", PAGE_SIZE, NR_FILE_PAGES }, - { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, - { "pagetables", PAGE_SIZE, NR_PAGETABLE }, - { "percpu", 1, MEMCG_PERCPU_B }, - { "sock", PAGE_SIZE, MEMCG_SOCK }, - { "shmem", PAGE_SIZE, NR_SHMEM }, - { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, - { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, - { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, + { "anon", NR_ANON_MAPPED }, + { "file", NR_FILE_PAGES }, + { "kernel_stack", NR_KERNEL_STACK_KB }, + { "pagetables", NR_PAGETABLE }, + { "percpu", MEMCG_PERCPU_B }, + { "sock", MEMCG_SOCK }, + { "shmem", NR_SHMEM }, + { "file_mapped", NR_FILE_MAPPED }, + { "file_dirty", NR_FILE_DIRTY }, + { "file_writeback", NR_WRITEBACK }, #ifdef CONFIG_TRANSPARENT_HUGEPAGE - { "anon_thp", PAGE_SIZE, NR_ANON_THPS }, - { "file_thp", PAGE_SIZE, NR_FILE_THPS }, - { "shmem_thp", PAGE_SIZE, NR_SHMEM_THPS }, + { "anon_thp", NR_ANON_THPS }, + { "file_thp", NR_FILE_THPS }, + { "shmem_thp", NR_SHMEM_THPS }, #endif - { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, - { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, - { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, - { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, - { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, - - /* - * Note: The slab_reclaimable and slab_unreclaimable must be - * together and slab_reclaimable must be in front.
- */ - { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, - { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, + { "inactive_anon", NR_INACTIVE_ANON }, + { "active_anon", NR_ACTIVE_ANON }, + { "inactive_file", NR_INACTIVE_FILE }, + { "active_file", NR_ACTIVE_FILE }, + { "unevictable", NR_UNEVICTABLE }, + { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, + { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, /* The memory events */ - { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, - { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, - { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, - { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, - { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, - { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, - { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, + { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, + { "workingset_refault_file", WORKINGSET_REFAULT_FILE }, + { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON }, + { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE }, + { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, + { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, + { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, }; +/* Translate stat items to the correct unit for memory.stat output */ +static int memcg_page_state_unit(int item) +{ + switch (item) { + case MEMCG_PERCPU_B: + case NR_SLAB_RECLAIMABLE_B: + case NR_SLAB_UNRECLAIMABLE_B: + case WORKINGSET_REFAULT_ANON: + case WORKINGSET_REFAULT_FILE: + case WORKINGSET_ACTIVATE_ANON: + case WORKINGSET_ACTIVATE_FILE: + case WORKINGSET_RESTORE_ANON: + case WORKINGSET_RESTORE_FILE: + case WORKINGSET_NODERECLAIM: + return 1; + case NR_KERNEL_STACK_KB: + return SZ_1K; + default: + return PAGE_SIZE; + } +} + +static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, + int item) +{ + return memcg_page_state(memcg, item) * memcg_page_state_unit(item); +} + static char *memory_stat_format(struct mem_cgroup *memcg) { struct seq_buf s; @@ -1578,13 +1600,12 @@ static char *memory_stat_format(struct mem_cgroup *memcg) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; - size = memcg_page_state(memcg, memory_stats[i].idx); - size *= memory_stats[i].ratio; + size = memcg_page_state_output(memcg, memory_stats[i].idx); seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { - size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + size += memcg_page_state_output(memcg, + NR_SLAB_RECLAIMABLE_B); seq_buf_printf(&s, "slab %llu\n", size); } } @@ -6375,6 +6396,12 @@ static int memory_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_NUMA +static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, + int item) +{ + return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); +} + static int memory_numa_stat_show(struct seq_file *m, void *v) { int i; @@ -6392,8 +6419,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - size = lruvec_page_state(lruvec, memory_stats[i].idx); - size *= memory_stats[i].ratio; + size = lruvec_page_state_output(lruvec, + memory_stats[i].idx); seq_printf(m, " N%d=%llu", nid, size); } seq_putc(m, '\n'); From d7e3aba583e6d13a81932597c5ee8da3c8b6af04 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 24 Feb 2021 12:03:47 -0800 Subject: [PATCH 
064/172] mm/memcg: revise the using condition of lock_page_lruvec function series lock_page_lruvec() and its variants are safe to use under the same conditions as commit_charge(): add lock_page_memcg() to the comment. Polished with Hugh Dickins' suggestions, thanks! Link: https://lkml.kernel.org/r/1608614453-10739-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Hugh Dickins Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f27e862a6845..2b13b2cf5dcb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1346,10 +1346,11 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) * lock_page_lruvec - lock and return lruvec for a given page. * @page: the page * - * This series functions should be used in either conditions: - * PageLRU is cleared or unset - * or page->_refcount is zero - * or page is locked. + * These functions are safe to use under any of the following conditions: + * - page locked + * - PageLRU cleared + * - lock_page_memcg() + * - page->_refcount is zero */ struct lruvec *lock_page_lruvec(struct page *page) { From f9b1038ebccad354256cf84749cbc321b5347497 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 24 Feb 2021 12:03:51 -0800 Subject: [PATCH 065/172] mm/memcg: remove rcu locking for lock_page_lruvec function series lock_page_lruvec() and its variants used rcu_read_lock() with the intention of safeguarding against the mem_cgroup being destroyed concurrently; but so long as they are called under the specified conditions (as they are), there is no way for the page's mem_cgroup to be destroyed. Delete the unnecessary rcu_read_lock() and _unlock(). Hugh Dickins polished the commit log. Thanks a lot! Link: https://lkml.kernel.org/r/1608614453-10739-2-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Hugh Dickins Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b13b2cf5dcb..5435b370c929 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1357,10 +1357,8 @@ struct lruvec *lock_page_lruvec(struct page *page) struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock(&lruvec->lru_lock); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1372,10 +1370,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page) struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock_irq(&lruvec->lru_lock); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1387,10 +1383,8 @@ struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock_irqsave(&lruvec->lru_lock, *flags); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); From b6038942480e574c697ea1a80019bbe586c1d654 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 24 Feb 2021 12:03:55 -0800 Subject: [PATCH 066/172] mm: memcg: add swapcache stat for memcg v2 This patch adds swapcache stat for the cgroup v2. 
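A minimal userspace sketch of the approximation this enables (the rationale is spelled out in the paragraphs that follow; the cgroup path is an assumption, and "swapcached" is the memory.stat key this patch adds):

#include <stdio.h>
#include <string.h>

static unsigned long long read_counter(const char *path)
{
        unsigned long long v = 0;
        FILE *f = fopen(path, "r");

        if (f) {
                if (fscanf(f, "%llu", &v) != 1)
                        v = 0;
                fclose(f);
        }
        return v;
}

static unsigned long long read_stat_key(const char *path, const char *key)
{
        char name[64];
        unsigned long long val, v = 0;
        FILE *f = fopen(path, "r");

        if (!f)
                return 0;
        while (fscanf(f, "%63s %llu", name, &val) == 2) {
                if (!strcmp(name, key)) {
                        v = val;
                        break;
                }
        }
        fclose(f);
        return v;
}

int main(void)
{
        const char *cg = "/sys/fs/cgroup/test";        /* assumed cgroup */
        char path[256];
        unsigned long long mem, swap, swapcached;

        snprintf(path, sizeof(path), "%s/memory.current", cg);
        mem = read_counter(path);
        snprintf(path, sizeof(path), "%s/memory.swap.current", cg);
        swap = read_counter(path);
        snprintf(path, sizeof(path), "%s/memory.stat", cg);
        swapcached = read_stat_key(path, "swapcached");

        /* memsw usage is approximately memory + swap - swapcache */
        printf("approx memsw usage: %llu bytes\n", mem + swap - swapcached);
        return 0;
}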
The swapcache represents the memory that is accounted against both the memory and the swap limit of the cgroup. The main motivation behind exposing the swapcache stat is to enable users to gracefully migrate from cgroup v1's memsw counter to cgroup v2's memory and swap counters. Cgroup v1's memsw limit allows users to limit the memory+swap usage of a workload, but without control over the exact proportion of memory and swap. Cgroup v2 provides separate limits for memory and swap, which enable finer control over memory and swap usage individually for the workload. With some minor subtleties, the v1 memsw limit can be replaced with the sum of the v2 memory and swap limits. However, the alternative for memsw usage is not yet available in cgroup v2. Exposing the per-cgroup swapcache stat enables that alternative: adding the memory usage and swap usage and subtracting the swapcache approximates the memsw usage. This will help in the transparent migration of workloads that depend on memsw usage and limits to v2's memory and swap counters. The reasons these applications are still interested in this approximate memsw usage are: (1) these applications are not really interested in two separate memory and swap usage metrics. A single usage metric is simpler to use and reason about for them. (2) The memsw usage metric hides the underlying system's swap setup from the applications. Applications with multiple instances running in a datacenter with heterogeneous systems (some have swap and some don't) will keep seeing a consistent view of their usage. [akpm@linux-foundation.org: fix CONFIG_SWAP=n build] Link: https://lkml.kernel.org/r/20210108155813.2914586-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ drivers/base/node.c | 6 ++++++ include/linux/mmzone.h | 3 +++ include/linux/swap.h | 6 +++++- mm/memcontrol.c | 3 +++ mm/migrate.c | 6 ++++++ mm/swap_state.c | 28 ++----------------------- mm/vmstat.c | 3 +++ 8 files changed, 32 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c513eafaddea..9acc57f68f31 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1299,6 +1299,10 @@ PAGE_SIZE multiple when read back. Amount of cached filesystem data that was modified and is currently being written back to disk + swapcached + Amount of swap cached in memory. The swapcache is accounted + against both memory and swap usage.
+ anon_thp Amount of memory used in anonymous mappings backed by transparent hugepages diff --git a/drivers/base/node.c b/drivers/base/node.c index d02d86aec19f..f449dbb2c746 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -372,14 +372,19 @@ static ssize_t node_read_meminfo(struct device *dev, struct pglist_data *pgdat = NODE_DATA(nid); struct sysinfo i; unsigned long sreclaimable, sunreclaimable; + unsigned long swapcached = 0; si_meminfo_node(&i, nid); sreclaimable = node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B); sunreclaimable = node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE_B); +#ifdef CONFIG_SWAP + swapcached = node_page_state_pages(pgdat, NR_SWAPCACHE); +#endif len = sysfs_emit_at(buf, len, "Node %d MemTotal: %8lu kB\n" "Node %d MemFree: %8lu kB\n" "Node %d MemUsed: %8lu kB\n" + "Node %d SwapCached: %8lu kB\n" "Node %d Active: %8lu kB\n" "Node %d Inactive: %8lu kB\n" "Node %d Active(anon): %8lu kB\n" @@ -391,6 +396,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), + nid, K(swapcached), nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) + node_page_state(pgdat, NR_ACTIVE_FILE)), nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) + diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 66d68e5d5a0f..fc99e9241846 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -206,6 +206,9 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ +#ifdef CONFIG_SWAP + NR_SWAPCACHE, +#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/include/linux/swap.h b/include/linux/swap.h index 3f1f7ae0fbe9..0d64a2bc7e00 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -408,7 +408,11 @@ extern struct address_space *swapper_spaces[]; #define swap_address_space(entry) \ (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ >> SWAP_ADDRESS_SPACE_SHIFT]) -extern unsigned long total_swapcache_pages(void); +static inline unsigned long total_swapcache_pages(void) +{ + return global_node_page_state(NR_SWAPCACHE); +} + extern void show_swap_cache_info(void); extern int add_to_swap(struct page *page); extern void *get_shadow_from_swap_cache(swp_entry_t entry); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5435b370c929..58edfb6614b8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1521,6 +1521,9 @@ static const struct memory_stat memory_stats[] = { { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, { "file_writeback", NR_WRITEBACK }, +#ifdef CONFIG_SWAP + { "swapcached", NR_SWAPCACHE }, +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE { "anon_thp", NR_ANON_THPS }, { "file_thp", NR_FILE_THPS }, diff --git a/mm/migrate.c b/mm/migrate.c index 4ec727adfaa2..62b81d5257aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping, __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + } +#endif if (dirty && mapping_can_writeback(mapping)) { __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); diff --git a/mm/swap_state.c b/mm/swap_state.c index 807b00eae969..c1a648d9092b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -68,32 +68,6 @@ static struct { unsigned long find_total; } swap_cache_info; 
-unsigned long total_swapcache_pages(void) -{ - unsigned int i, j, nr; - unsigned long ret = 0; - struct address_space *spaces; - struct swap_info_struct *si; - - for (i = 0; i < MAX_SWAPFILES; i++) { - swp_entry_t entry = swp_entry(i, 1); - - /* Avoid get_swap_device() to warn for bad swap entry */ - if (!swp_swap_info(entry)) - continue; - /* Prevent swapoff to free swapper_spaces */ - si = get_swap_device(entry); - if (!si) - continue; - nr = nr_swapper_spaces[i]; - spaces = swapper_spaces[i]; - for (j = 0; j < nr; j++) - ret += spaces[j].nrpages; - put_swap_device(si); - } - return ret; -} - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) @@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, address_space->nrexceptional -= nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); @@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page, address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); ADD_CACHE_INFO(del_total, nr); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 07dc0af50cf0..a0e949542204 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", +#ifdef CONFIG_SWAP + "nr_swapcached", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", From c1a660dea3fa616420606f1e206e6d22f7e05c30 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 24 Feb 2021 12:03:58 -0800 Subject: [PATCH 067/172] mm: kmem: make __memcg_kmem_(un)charge static I've noticed that __memcg_kmem_charge() and __memcg_kmem_uncharge() are not used anywhere except memcontrol.c. Yet they are not declared static, and they are declared in memcontrol.h. This patch makes them static.
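The resulting shape, in a self-contained miniature (illustrative names, not the kernel source): a static forward declaration near the top of the file lets earlier code call the helper while keeping the symbol file-local:

#include <stdio.h>

static int charge(int nr_pages);        /* static forward declaration */

static int try_charge(int nr_pages)
{
        return charge(nr_pages);        /* usable before the definition */
}

static int charge(int nr_pages)         /* definition, also static */
{
        return nr_pages > 0 ? 0 : -1;
}

int main(void)
{
        printf("%d\n", try_charge(1));
        return 0;
}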
Link: https://lkml.kernel.org/r/20210108020332.4096911-1-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 --- mm/memcontrol.c | 11 ++++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 41bbf71edd9f..7a38a1517a05 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1592,9 +1592,6 @@ static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, #endif #ifdef CONFIG_MEMCG_KMEM -int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages); -void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 58edfb6614b8..dce5291525a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages); +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, + unsigned int nr_pages); + static void obj_cgroup_release(struct percpu_ref *ref) { struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); @@ -3087,8 +3092,8 @@ static void memcg_free_cache_id(int id) * * Returns 0 on success, an error code on failure. */ -int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; int ret; @@ -3120,7 +3125,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, * @memcg: memcg to uncharge * @nr_pages: number of pages to uncharge */ -void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); From 802f1d522d5fdaefc2b935141bc8fe03d43a99ab Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 24 Feb 2021 12:04:01 -0800 Subject: [PATCH 068/172] mm: page_counter: re-layout structure to reduce false sharing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When checking a memory cgroup related performance regression [1], from the perf c2c profiling data, we found high false sharing for accesses to 'usage' and 'parent'. On a 64-bit system, 'usage' and 'parent' are close to each other and likely to land in the same cacheline (for cacheline sizes of 64 B and up). 'usage' is mostly written, while 'parent' is mostly read, given the hierarchical nature of cgroup accounting. So move 'parent' to the end of the structure to make sure they are in different cache lines. Following are some performance data with the patch, against v5.11-rc1.
[ In the data, A means a platform with 2 sockets 48C/96T, B means a platform with 4 sockets 72C/144T, a %stddev is shown only when it is bigger than 2%, and P100/P50 means the number of test tasks equals 100%/50% of nr_cpu ]

will-it-scale/malloc1
---------------------
         v5.11-rc1           v5.11-rc1+patch
A-P100   15782 ± 2%          -0.1%   15765 ± 3%   will-it-scale.per_process_ops
A-P50    21511                +8.9%   23432        will-it-scale.per_process_ops
B-P100   9155                 +2.2%   9357         will-it-scale.per_process_ops
B-P50    10967                +7.1%   11751 ± 2%   will-it-scale.per_process_ops

will-it-scale/pagefault2
------------------------
         v5.11-rc1           v5.11-rc1+patch
A-P100   79028                +3.0%   81411        will-it-scale.per_process_ops
A-P50    183960 ± 2%          +4.4%   192078 ± 2%  will-it-scale.per_process_ops
B-P100   85966                +9.9%   94467 ± 3%   will-it-scale.per_process_ops
B-P50    198195               +9.8%   217526       will-it-scale.per_process_ops

fio (4k/1M is block size)
-------------------------
             v5.11-rc1       v5.11-rc1+patch
A-P50-r-4k   16881 ± 2%       +1.2%   17081 ± 2%   fio.read_bw_MBps
A-P50-w-4k   3931             +4.5%   4111 ± 2%    fio.write_bw_MBps
A-P50-r-1M   15178            -0.2%   15154        fio.read_bw_MBps
A-P50-w-1M   3924             +0.1%   3929         fio.write_bw_MBps

[1] https://lore.kernel.org/lkml/20201102091543.GM31092@shao2-debian/ Link: https://lkml.kernel.org/r/1611040814-33449-1-git-send-email-feng.tang@intel.com Signed-off-by: Feng Tang Reviewed-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_counter.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 85bd413e784e..679591301994 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -12,7 +12,6 @@ struct page_counter { unsigned long low; unsigned long high; unsigned long max; - struct page_counter *parent; /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -27,6 +26,14 @@ struct page_counter { /* legacy */ unsigned long watermark; unsigned long failcnt; + + /* + * 'parent' is placed here to be far from 'usage' to reduce + * cache false sharing, as 'usage' is written mostly while + * parent is frequently read for cgroup's hierarchical + * counting nature. + */ + struct page_counter *parent; }; #if BITS_PER_LONG == 32 From 8a260162f9a0634db9a1ee7b8db276e7a00ee1d9 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 24 Feb 2021 12:04:05 -0800 Subject: [PATCH 069/172] mm/memcontrol: remove redundant NULL check Fix the below warning reported by coccicheck: mm/memcontrol.c:451:3-9: WARNING: NULL check before some freeing functions is not needed.
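The warning is sound because kvfree(), like free(3), is defined to be a no-op on NULL; a userspace analogue of the simplification:

#include <stdlib.h>

int main(void)
{
        void *map = NULL;

        if (map)        /* redundant guard */
                free(map);

        free(map);      /* equivalent: free(NULL) is a well-defined no-op */
        return 0;
}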
Link: https://lkml.kernel.org/r/1611216029-34397-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Reviewed-by: Andrew Morton Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dce5291525a5..ed5cc78a8dbf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -452,8 +452,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) for_each_node(nid) { pn = mem_cgroup_nodeinfo(memcg, nid); map = rcu_dereference_protected(pn->shrinker_map, true); - if (map) - kvfree(map); + kvfree(map); rcu_assign_pointer(pn->shrinker_map, NULL); } } From c41a40b6baf732ca1d519ff558fb0082c0c04e9a Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:04:08 -0800 Subject: [PATCH 070/172] mm: memcontrol: replace the loop with a list_for_each_entry() The special list-walk rule went away with commit a9d5adeeb4b2 ("mm/memcontrol: allow to uncharge page without using page->lru field"), so remove the stale comment and replace the do-while loop with list_for_each_entry(). There is only one caller of uncharge_list(), so just fold it into mem_cgroup_uncharge_list() and remove it. Link: https://lkml.kernel.org/r/20210204163055.56080-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Roman Gushchin Reviewed-by: Miaohe Lin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ed5cc78a8dbf..8c035846c7a4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6862,31 +6862,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) css_put(&ug->memcg->css); } -static void uncharge_list(struct list_head *page_list) -{ - struct uncharge_gather ug; - struct list_head *next; - - uncharge_gather_clear(&ug); - - /* - * Note that the list can be a single page->lru; hence the - * do-while loop instead of a simple list_for_each_entry(). - */ - next = page_list->next; - do { - struct page *page; - - page = list_entry(next, struct page, lru); - next = page->lru.next; - - uncharge_page(page, &ug); - } while (next != page_list); - - if (ug.memcg) - uncharge_batch(&ug); -} - /** * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge @@ -6918,11 +6893,17 @@ void mem_cgroup_uncharge(struct page *page) */ void mem_cgroup_uncharge_list(struct list_head *page_list) { + struct uncharge_gather ug; + struct page *page; + if (mem_cgroup_disabled()) return; - if (!list_empty(page_list)) - uncharge_list(page_list); + uncharge_gather_clear(&ug); + list_for_each_entry(page, page_list, lru) + uncharge_page(page, &ug); + if (ug.memcg) + uncharge_batch(&ug); } /** From a7b7e1df892457935ec4f35ef9e9aa344758dbc9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 24 Feb 2021 12:04:12 -0800 Subject: [PATCH 071/172] mm/list_lru.c: remove kvfree_rcu_local() The list_lru file used to have a local kvfree_rcu(), which commit e0feed08ab41 ("mm/list_lru.c: Rename kvfree_rcu() to local variant") renamed to make room for the globally visible kvfree_rcu(). Now that the global kvfree_rcu() exists, remove the local kvfree_rcu_local() and just use the global one.
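Side by side, the two idioms look like this (a kernel-style sketch, not buildable in userspace; the struct is illustrative, while call_rcu(), kvfree() and the two-argument kvfree_rcu() are the real kernel APIs):

struct foo {
        int payload;
        struct rcu_head rcu;
};

/* before: a dedicated callback per type */
static void foo_free_rcu(struct rcu_head *head)
{
        kvfree(container_of(head, struct foo, rcu));
}

static void drop_foo_old(struct foo *old)
{
        call_rcu(&old->rcu, foo_free_rcu);
}

/* after: the generic helper frees 'old' with kvfree() after a grace period */
static void drop_foo_new(struct foo *old)
{
        kvfree_rcu(old, rcu);
}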
Link: https://lkml.kernel.org/r/20210207152148.1285842-1-shakeelb@google.com Signed-off-by: Shakeel Butt Reviewed-by: Uladzislau Rezki Reviewed-by: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index fe230081690b..6f067b6b935f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -373,21 +373,13 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; /* * This is called when shrinker has already been unregistered, - * and nobody can use it. So, there is no need to use kvfree_rcu_local(). + * and nobody can use it. So, there is no need to use kvfree_rcu(). */ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); kvfree(memcg_lrus); } -static void kvfree_rcu_local(struct rcu_head *head) -{ - struct list_lru_memcg *mlru; - - mlru = container_of(head, struct list_lru_memcg, rcu); - kvfree(mlru); -} - static int memcg_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { @@ -419,7 +411,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - call_rcu(&old->rcu, kvfree_rcu_local); + kvfree_rcu(old, rcu); return 0; } From 6eeb104e114cb6b7391c2d69ff873403858c1f35 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 24 Feb 2021 12:04:15 -0800 Subject: [PATCH 072/172] fs: buffer: use raw page_memcg() on locked page alloc_page_buffers() currently uses get_mem_cgroup_from_page() for charging the buffers to the page owner, which does an rcu-protected page->memcg lookup and acquires a reference. But buffer allocation has the page lock held throughout, which pins the page to the memcg and thereby the memcg - neither rcu nor holding an extra reference during the allocation are necessary. Use a raw page_memcg() instead. This was the last user of get_mem_cgroup_from_page(), delete it. Link: https://lkml.kernel.org/r/20210209190126.97842-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Muchun Song Reviewed-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- include/linux/memcontrol.h | 7 ------- mm/memcontrol.c | 23 ----------------------- 3 files changed, 2 insertions(+), 32 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index f1c3a5b27a90..0cb7ffd4977c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - memcg = get_mem_cgroup_from_page(page); + /* The page lock pins the memcg */ + memcg = page_memcg(page); old_memcg = set_active_memcg(memcg); head = NULL; @@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } out: set_active_memcg(old_memcg); - mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7a38a1517a05..e6dc793d587d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -680,8 +680,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); -struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); - struct lruvec *lock_page_lruvec(struct page *page); struct lruvec *lock_page_lruvec_irq(struct page *page); struct lruvec *lock_page_lruvec_irqsave(struct page *page, @@ -1191,11 +1189,6 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } -static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) -{ - return NULL; -} - static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8c035846c7a4..7fdc001ce15f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1047,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -/** - * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. - * @page: page from which memcg should be extracted. - * - * Obtain a reference on page->memcg and returns it if successful. Otherwise - * root_mem_cgroup is returned. - */ -struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (mem_cgroup_disabled()) - return NULL; - - rcu_read_lock(); - /* Page should not get uncharged and freed memcg under us. */ - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - return memcg; -} -EXPORT_SYMBOL(get_mem_cgroup_from_page); - static __always_inline struct mem_cgroup *active_memcg(void) { if (in_interrupt()) From cae3af62b33aa931427a0f211e04347b22180b36 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:04:19 -0800 Subject: [PATCH 073/172] mm: memcontrol: fix swap undercounting in cgroup2 When pages are swapped in, the VM may retain the swap copy to avoid repeated writes in the future. It's also retained if shared pages are faulted back in some processes, but not in others. During that time we have an in-memory copy of the page, as well as an on-swap copy. Cgroup1 and cgroup2 handle these overlapping lifetimes slightly differently due to the nature of how they account memory and swap: Cgroup1 has a unified memory+swap counter that tracks a data page regardless whether it's in-core or swapped out. On swapin, we transfer the charge from the swap entry to the newly allocated swapcache page, even though the swap entry might stick around for a while. That's why we have a mem_cgroup_uncharge_swap() call inside mem_cgroup_charge(). Cgroup2 tracks memory and swap as separate, independent resources and thus has split memory and swap counters. On swapin, we charge the newly allocated swapcache page as memory, while the swap slot in turn must remain charged to the swap counter as long as its allocated too. The cgroup2 logic was broken by commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), because it accidentally removed the do_memsw_account() check in the branch inside mem_cgroup_uncharge() that was supposed to tell the difference between the charge transfer in cgroup1 and the separate counters in cgroup2. 
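A toy model of the two schemes (plain C with illustrative numbers) makes the difference, and the effect of losing that check, concrete:

#include <stdio.h>

int main(void)
{
        long page = 4096;       /* one page swapped in, swap slot retained */

        /* cgroup1: one unified memory+swap counter; the charge transfers */
        long memsw = page;      /* charged while the page sits on swap */
        memsw += page;          /* swapin: charge the in-memory copy... */
        memsw -= page;          /* ...and uncharge the swap slot */
        printf("cgroup1 memsw: %ld (counted once)\n", memsw);

        /* cgroup2: split counters; the retained slot stays charged to swap */
        long memory = 0, swap = page;
        memory += page;         /* swapin charges memory only */
        printf("cgroup2 memory=%ld swap=%ld\n", memory, swap);

        /* the regression: running the cgroup1 transfer against v2's counter */
        swap -= page;
        printf("buggy v2 swap=%ld (retained slot no longer counted)\n", swap);
        return 0;
}

That uncounted retained slot is exactly the undercount described next.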
As a result, cgroup2 currently undercounts retained swap to varying degrees: swap slots are cached up to 50% of the configured limit or total available swap space; partially faulted back shared pages are only limited by physical capacity. This in turn allows cgroups to significantly overconsume their allotted swap space. Add the do_memsw_account() check back to fix this problem. Link: https://lkml.kernel.org/r/20210217153237.92484-1-songmuchun@bytedance.com Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") Signed-off-by: Muchun Song Acked-by: Johannes Weiner Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Vladimir Davydov Cc: [5.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7fdc001ce15f..2db2aeac8a9e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6748,7 +6748,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) memcg_check_events(memcg, page); local_irq_enable(); - if (PageSwapCache(page)) { + /* + * Cgroup1's unified memory+swap counter has been charged with the + * new swapcache page, finish the transfer by uncharging the swap + * slot. The swap slot would also get uncharged when it dies, but + * it can stick around indefinitely and we'd count the page twice + * the entire time. + * + * Cgroup2 has separate resource counters for memory and swap, + * so this is a non-issue here. Memory and swap charge lifetimes + * correspond 1:1 to page and swap slot lifetimes: we charge the + * page to memory here, and uncharge swap when the slot is freed. + */ + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, From 1685bde6b9af55923180a76152036c7fb7176db0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:04:22 -0800 Subject: [PATCH 074/172] mm: memcontrol: fix get_active_memcg return value We use a global percpu int_active_memcg variable to store the remote memcg when we are in the interrupt context. But get_active_memcg always returns the current->active_memcg or root_mem_cgroup. The remote memcg (set in the interrupt context) is ignored. This is not what we want. So fix it. Link: https://lkml.kernel.org/r/20210223091101.42150-1-songmuchun@bytedance.com Fixes: 37d5985c003d ("mm: kmem: prepare remote memcg charging infra for interrupt contexts") Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2db2aeac8a9e..845eec01ef9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1061,13 +1061,9 @@ static __always_inline struct mem_cgroup *get_active_memcg(void) rcu_read_lock(); memcg = active_memcg(); - if (memcg) { - /* current->active_memcg must hold a ref. */ - if (WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - else - memcg = current->active_memcg; - } + /* remote memcg must hold a ref. 
*/ + if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; rcu_read_unlock(); return memcg; From 96403bfe50c344b587ea53894954a9d152af1c9d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 24 Feb 2021 12:04:26 -0800 Subject: [PATCH 075/172] mm: memcontrol: fix slub memory accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SLUB currently accounts kmalloc() and kmalloc_node() allocations larger than an order-1 page per node. But it forgets to update the per-memcg vmstats, which can lead to inaccurate "slab_unreclaimable" statistics in memory.stat. Fix it by using mod_lruvec_page_state instead of mod_node_page_state. Link: https://lkml.kernel.org/r/20210223092423.42420-1-songmuchun@bytedance.com Fixes: 6a486c0ad4dc ("mm, sl[ou]b: improve memory accounting") Signed-off-by: Muchun Song Reviewed-by: Shakeel Butt Reviewed-by: Roman Gushchin Reviewed-by: Michal Koutný Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 4 ++-- mm/slub.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 9633fa1bbbfd..5be7825ad3ce 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -898,8 +898,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_pages(flags, order); if (likely(page)) { ret = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ diff --git a/mm/slub.c b/mm/slub.c index ca566361561e..046d7194acaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4042,8 +4042,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) page = alloc_pages_node(node, flags, order); if (page) { ptr = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } return kmalloc_large_node_hook(ptr, size, flags); @@ -4174,8 +4174,8 @@ void kfree(const void *x) BUG_ON(!PageCompound(page)); kfree_hook(object); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); __free_pages(page, order); return; } From b7204006c8602f43793ee1b285e963084bdb1a26 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Wed, 24 Feb 2021 12:04:29 -0800 Subject: [PATCH 076/172] mm/mmap.c: remove unnecessary local variable The local variable 'retval' is assigned only once in __do_sys_brk(), and the function returns the value of the local variable right after the assignment. Remove the unnecessary assignment and local variable declaration. 
Link: https://lkml.kernel.org/r/20201222103249.30683-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang Acked-by: Souptick Joarder Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 90673febce6a..3f287599a7a3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -189,7 +189,6 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long retval; unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; @@ -281,9 +280,8 @@ success: return brk; out: - retval = origbrk; mmap_write_unlock(mm); - return retval; + return origbrk; } static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) From 90a3e375d324b2255b83e3dd29e99e2b05d82aaf Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:04:33 -0800 Subject: [PATCH 077/172] mm/memory.c: fix potential pte_unmap_unlock pte error Since commit 42e4089c7890 ("x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings"), when modifying the first pfn is not allowed, we would break the loop with pte unchanged. Then the wrong pte - 1 would be passed to pte_unmap_unlock. Andi said: "While the fix is correct, I'm not sure if it actually is a real bug. Is there any architecture that would do something else than unlocking the underlying page? If it's just the underlying page then it should be always the same page, so no bug" Link: https://lkml.kernel.org/r/20210109080118.20885-1-linmiaohe@huawei.com Fixes: 42e4089c7890 ("x86/speculation/l1tf: Disallow non privileged high MMIO PROT_NONE mappings") Signed-off-by: Hongxiang Lou Signed-off-by: Miaohe Lin Cc: Thomas Gleixner Cc: Dave Hansen Cc: Andi Kleen Cc: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5da964079678..83e1bcf7a669 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2177,11 +2177,11 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { - pte_t *pte; + pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); @@ -2195,7 +2195,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); return err; } From c045c72ccde3a267963f8e85f388db4c40dea3b3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:04:36 -0800 Subject: [PATCH 078/172] mm/pgtable-generic.c: simplify the VM_BUG_ON condition in pmdp_huge_clear_flush() The condition (A && !C && !D) || !A is equivalent to !A || (A && !C && !D) and can be further simplified to !A || (!C && !D). 
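The equivalence is easy to verify mechanically. Below is a minimal standalone sketch (userspace only, not part of the patch) that brute-forces all eight truth assignments; here A stands for pmd_present(*pmdp), C for pmd_trans_huge(*pmdp) and D for pmd_devmap(*pmdp), following the expression in the diff:

#include <stdio.h>

int main(void)
{
	/* A = pmd_present(), C = pmd_trans_huge(), D = pmd_devmap() */
	for (int A = 0; A <= 1; A++)
		for (int C = 0; C <= 1; C++)
			for (int D = 0; D <= 1; D++) {
				int before = (A && !C && !D) || !A;
				int after = !A || (!C && !D);

				if (before != after) {
					printf("mismatch: A=%d C=%d D=%d\n", A, C, D);
					return 1;
				}
			}
	printf("equivalent for all 8 combinations\n");
	return 0;
}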
Link: https://lkml.kernel.org/r/20210201114319.34720-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 9578db83e312..fa1375f3e3b2 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -135,8 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + VM_BUG_ON(!pmd_present(*pmdp) || (!pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp))); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; From 374437a274e24e8e3ccd19f704e80d325f75f254 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:04:39 -0800 Subject: [PATCH 079/172] mm/pgtable-generic.c: optimize the VM_BUG_ON condition in pmdp_huge_clear_flush() The developer will have trouble figuring out why the BUG actually triggered when there is a complex expression in the VM_BUG_ON, because we can only identify the condition that triggered the BUG via the line number provided by VM_BUG_ON. Optimize this by splitting such a complex expression into two simple conditions. Link: https://lkml.kernel.org/r/20210203084137.25522-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index fa1375f3e3b2..c2210e1cdb51 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -135,8 +135,9 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_present(*pmdp) || (!pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp))); + VM_BUG_ON(!pmd_present(*pmdp)); + /* Below assumes pmd_present() is true */ + VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; From 8abb50c76b484e8d8dc355c092170c37b5f832f5 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:04:42 -0800 Subject: [PATCH 080/172] mm/memory.c: fix potential pte_unmap_unlock pte error If all pte entries are none in the 'non-create' case, we would break the loop with pte unchanged. Then the wrong pte - 1 would be passed to pte_unmap_unlock. This is a theoretical issue which may not be a real bug, so it's not worth cc'ing stable. 
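The pointer mishap that this patch and the remap_pte_range fix above both address is easier to see in a toy userspace model of the loop shape (illustrative only: an int array stands in for the pte page, and mapped_pte mirrors the variable the fix introduces):

#include <stdio.h>

int main(void)
{
	int arr[4] = { 0 };			/* every "pte entry" is none */
	int *pte = arr, *mapped_pte = pte;	/* remember what was mapped */
	int *end = arr + 4, *addr;

	for (addr = arr; addr != end; addr++) {
		if (*addr != 0)
			pte++;	/* pte only advances when an entry is handled */
	}

	/*
	 * No entry was handled, so pte never moved: "pte - 1" would point
	 * one element before the mapping, while mapped_pte still points at
	 * what was actually mapped and locked.
	 */
	printf("pte advanced %td entries; unlock must use mapped_pte\n",
	       pte - mapped_pte);
	return 0;
}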
Link: https://lkml.kernel.org/r/20210205081925.59809-1-linmiaohe@huawei.com Fixes: aee16b3cee27 ("Add apply_to_page_range() which applies a function to a pte range") Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Cc: Ingo Molnar Cc: Ian Pratt Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 83e1bcf7a669..ef4e10c60b5f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2394,18 +2394,18 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { - pte_t *pte; + pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { - pte = (mm == &init_mm) ? + mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { - pte = (mm == &init_mm) ? + mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); } @@ -2428,7 +2428,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_leave_lazy_mmu_mode(); if (mm != &init_mm) - pte_unmap_unlock(pte-1, ptl); + pte_unmap_unlock(mapped_pte, ptl); return err; } From dbf53f7597be11ffc18b16444a1ffc7d7b76746e Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Wed, 24 Feb 2021 12:04:46 -0800 Subject: [PATCH 081/172] mm/mprotect.c: optimize error detection in do_mprotect_pkey() Obviously, the error variable check after the if statement is for the mprotect callback function, so it is also put into the scope of calling the callback. This is a cleanup which makes this site consistent with the rest of this function's error handling. Link: https://lkml.kernel.org/r/20210118133310.98375-1-tianjia.zhang@linux.alibaba.com Signed-off-by: Tianjia Zhang Reported-by: Jia Zhang Reviewed-by: Andrew Morton Cc: Jarkko Sakkinen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index ab709023e9aa..94188df1ee55 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -617,10 +617,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (tmp > end) tmp = end; - if (vma->vm_ops && vma->vm_ops->mprotect) + if (vma->vm_ops && vma->vm_ops->mprotect) { error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); - if (error) - goto out; + if (error) + goto out; + } error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) From ee8ab1903e3d912d8f10bedbf96c3b6a1c8cbede Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Wed, 24 Feb 2021 12:04:49 -0800 Subject: [PATCH 082/172] mm: rmap: explicitly reset vma->anon_vma in unlink_anon_vmas() In case the vma will continue to be used after unlinking its relevant anon_vma, we need to reset the vma->anon_vma pointer to NULL. Then, when a fault later happens within this vma again, a new anon_vma will be prepared. This way, the vma will only be checked for reverse mapping of pages which were faulted in after the unlink_anon_vmas call. Currently, the mremap with MREMAP_DONTUNMAP scenario will continue to use the vma after moving its page table entries to a new vma. For other scenarios, the vma itself will be freed after calling unlink_anon_vmas. Link: https://lkml.kernel.org/r/20210119075126.3513154-1-lixinhai.lxh@gmail.com Signed-off-by: Li Xinhai Cc: Andrea Arcangeli Cc: Brian Geffon Cc: Kirill A. 
Shutemov Cc: Lokesh Gidra Cc: Minchan Kim Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index 5ebf16fae4b9..e26ae119a131 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } - if (vma->anon_vma) + if (vma->anon_vma) { vma->anon_vma->degree--; + + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. + */ + vma->anon_vma = NULL; + } unlock_anon_vma_root(root); /* From 1583aa278f5f6a58b6ff8e9e703d0cca2b953d97 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Wed, 24 Feb 2021 12:04:53 -0800 Subject: [PATCH 083/172] mm: mremap: unlink anon_vmas when mremap with MREMAP_DONTUNMAP succeeds mremap with MREMAP_DONTUNMAP can move all page table entries to a new vma, which means all pages allocated for the old vma are no longer relevant to it, and its anon_vma links need to be unlinked; in nature, the old vma is then much like one freshly created, with no pages faulted in. But we should not do the unlink if the new vma has effectively merged with the old one. [lixinhai.lxh@gmail.com: v2] Link: https://lkml.kernel.org/r/20210127083917.309264-2-lixinhai.lxh@gmail.com Link: https://lkml.kernel.org/r/20210119075126.3513154-2-lixinhai.lxh@gmail.com Signed-off-by: Li Xinhai Cc: Brian Geffon Cc: Lokesh Gidra Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/mremap.c b/mm/mremap.c index 47192691fe32..ec8f840399ed 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -593,6 +593,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + /* + * anon_vma links of the old vma is no longer needed after its page + * table has been moved. + */ + if (new_vma != vma && vma->vm_start == old_addr && + vma->vm_end == (old_addr + old_len)) + unlink_anon_vmas(vma); + /* Because we won't unmap we don't need to touch locked_vm */ return new_addr; } From 5df6d792011b0b221f0a3a7ba5a732230cd71b4f Mon Sep 17 00:00:00 2001 From: "sh_def@163.com" Date: Wed, 24 Feb 2021 12:04:57 -0800 Subject: [PATCH 084/172] mm/page_reporting: use list_entry_is_head() in page_reporting_cycle() Replace '&next->lru != list' with list_entry_is_head(). No functional change. 
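As a sanity check that the two spellings agree, here is a small userspace sketch built on pared-down copies of the kernel's list helpers (struct item is invented for the demo; the macro bodies paraphrase include/linux/list.h):

#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_entry(ptr, type, member) container_of(ptr, type, member)
#define list_entry_is_head(pos, head, member) (&pos->member == (head))

struct item { int val; struct list_head lru; };

int main(void)
{
	struct list_head list = { &list, &list };	/* empty head */
	struct item a = { .val = 1 };

	/* Link a as the only element. */
	list.next = list.prev = &a.lru;
	a.lru.next = a.lru.prev = &list;

	struct item *next = list_entry(list.next, struct item, lru);

	/* Both tests agree that next is not the list head: prints "1 1". */
	printf("%d %d\n", &next->lru != &list,
	       !list_entry_is_head(next, &list, lru));
	return 0;
}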
Link: https://lkml.kernel.org/r/20201222182735.GA1257912@ubuntu-A520I-AC Signed-off-by: sh Reviewed-by: Andrew Morton Cc: Alexander Duyck Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_reporting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_reporting.c b/mm/page_reporting.c index cd8e13d41df4..c50d93ffa252 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -211,7 +211,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, } /* Rotate any leftover pages to the head of the freelist */ - if (&next->lru != list && !list_is_first(&next->lru, list)) + if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list)) list_rotate_to_front(&next->lru, list); spin_unlock_irq(&zone->lock); From fb9bf0484af4770240342f4d1b3dd054889cc31e Mon Sep 17 00:00:00 2001 From: Yang Li Date: Wed, 24 Feb 2021 12:05:00 -0800 Subject: [PATCH 085/172] vmalloc: remove redundant NULL check Fix below warnings reported by coccicheck: fs/proc/vmcore.c:1503:2-7: WARNING: NULL check before some freeing functions is not needed. Link: https://lkml.kernel.org/r/1611216753-44598-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Alexey Dobriyan Cc: "Uladzislau Rezki (Sony)" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c3a345c28a93..9a15334da208 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1503,11 +1503,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return 0; out_err: - if (buf) - vfree(buf); - - if (dump) - vfree(dump); + vfree(buf); + vfree(dump); return ret; } From f00748bfa0246c428bf93f45267b8f1aa1816098 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:05 -0800 Subject: [PATCH 086/172] kasan: prefix global functions with kasan_ Patch series "kasan: HW_TAGS tests support and fixes", v4. This patchset adds support for running KASAN-KUnit tests with the hardware tag-based mode and also contains a few fixes. This patch (of 15): There's a number of internal KASAN functions that are used across multiple source code files and therefore aren't marked as static inline. To avoid littering the kernel function names list with generic function names, prefix all such KASAN functions with kasan_. As a part of this change: - Rename internal (un)poison_range() to kasan_(un)poison() (no _range) to avoid name collision with a public kasan_unpoison_range(). - Rename check_memory_region() to kasan_check_range(), as it's a more fitting name. 
Link: https://lkml.kernel.org/r/cover.1610733117.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/I719cc93483d4ba288a634dba80ee6b7f2809cd26 Link: https://lkml.kernel.org/r/13777aedf8d3ebbf35891136e1f2287e2f34aaba.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Marco Elver Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Dmitry Vyukov Cc: Will Deacon Cc: Andrey Ryabinin Cc: Peter Collingbourne Cc: Evgenii Stepanov Cc: Branislav Rankov Cc: Kevin Brodsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 47 +++++++++++++++++++------------------- mm/kasan/generic.c | 36 ++++++++++++++--------------- mm/kasan/kasan.h | 48 +++++++++++++++++++-------------------- mm/kasan/quarantine.c | 22 +++++++++--------- mm/kasan/report.c | 13 ++++++----- mm/kasan/report_generic.c | 8 +++---- mm/kasan/report_hw_tags.c | 8 +++---- mm/kasan/report_sw_tags.c | 8 +++---- mm/kasan/shadow.c | 26 ++++++++++----------- mm/kasan/sw_tags.c | 16 ++++++------- tools/objtool/check.c | 2 +- 11 files changed, 117 insertions(+), 117 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b25167664ead..eedc3e0fe365 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -60,7 +60,7 @@ void kasan_disable_current(void) void __kasan_unpoison_range(const void *address, size_t size) { - unpoison_range(address, size); + kasan_unpoison(address, size); } #if CONFIG_KASAN_STACK @@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task) { void *base = task_stack_page(task); - unpoison_range(base, THREAD_SIZE); + kasan_unpoison(base, THREAD_SIZE); } /* Unpoison the stack for the current task beyond a watermark sp value. */ @@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - unpoison_range(base, watermark - base); + kasan_unpoison(base, watermark - base); } #endif /* CONFIG_KASAN_STACK */ @@ -105,18 +105,17 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) if (unlikely(PageHighMem(page))) return; - tag = random_tag(); + tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - unpoison_range(page_address(page), PAGE_SIZE << order); + kasan_unpoison(page_address(page), PAGE_SIZE << order); } void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - poison_range(page_address(page), - PAGE_SIZE << order, - KASAN_FREE_PAGE); + kasan_poison(page_address(page), PAGE_SIZE << order, + KASAN_FREE_PAGE); } /* @@ -246,18 +245,18 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - poison_range(page_address(page), page_size(page), + kasan_poison(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - unpoison_range(object, cache->object_size); + kasan_unpoison(object, cache->object_size); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE); + kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE); } /* @@ -294,7 +293,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, * set, assign a tag when the object is being allocated (init == false). 
*/ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return init ? KASAN_TAG_KERNEL : random_tag(); + return init ? KASAN_TAG_KERNEL : kasan_random_tag(); /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB @@ -305,7 +304,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, * For SLUB assign a random tag during slab creation, otherwise reuse * the already assigned tag. */ - return init ? random_tag() : get_tag(object); + return init ? kasan_random_tag() : get_tag(object); #endif } @@ -346,12 +345,12 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; - if (check_invalid_free(tagged_object)) { + if (kasan_check_invalid_free(tagged_object)) { kasan_report_invalid_free(tagged_object, ip); return true; } - poison_range(object, cache->object_size, KASAN_KMALLOC_FREE); + kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); if (!kasan_stack_collection_enabled()) return false; @@ -361,7 +360,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, kasan_set_free_info(cache, object, tag); - return quarantine_put(cache, object); + return kasan_quarantine_put(cache, object); } bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) @@ -386,7 +385,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - poison_range(ptr, page_size(page), KASAN_FREE_PAGE); + kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -409,7 +408,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, u8 tag; if (gfpflags_allow_blocking(flags)) - quarantine_reduce(); + kasan_quarantine_reduce(); if (unlikely(object == NULL)) return NULL; @@ -421,9 +420,9 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ - unpoison_range(set_tag(object, tag), size); - poison_range((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + kasan_unpoison(set_tag(object, tag), size); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); if (kasan_stack_collection_enabled()) set_alloc_info(cache, (void *)object, flags); @@ -452,7 +451,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, unsigned long redzone_end; if (gfpflags_allow_blocking(flags)) - quarantine_reduce(); + kasan_quarantine_reduce(); if (unlikely(ptr == NULL)) return NULL; @@ -462,8 +461,8 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - unpoison_range(ptr, size); - poison_range((void *)redzone_start, redzone_end - redzone_start, + kasan_unpoison(ptr, size); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); return (void *)ptr; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5106b84b07d4..acab8862dc67 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -158,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline bool check_memory_region_inline(unsigned long addr, +static __always_inline bool check_region_inline(unsigned long addr, size_t size, 
bool write, unsigned long ret_ip) { @@ -179,13 +179,13 @@ static __always_inline bool check_memory_region_inline(unsigned long addr, return !kasan_report(addr, size, write, ret_ip); } -bool check_memory_region(unsigned long addr, size_t size, bool write, - unsigned long ret_ip) +bool kasan_check_range(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) { - return check_memory_region_inline(addr, size, write, ret_ip); + return check_region_inline(addr, size, write, ret_ip); } -bool check_invalid_free(void *addr) +bool kasan_check_invalid_free(void *addr) { s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); @@ -194,22 +194,22 @@ bool check_invalid_free(void *addr) void kasan_cache_shrink(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + kasan_quarantine_remove_cache(cache); } void kasan_cache_shutdown(struct kmem_cache *cache) { if (!__kmem_cache_empty(cache)) - quarantine_remove_cache(cache); + kasan_quarantine_remove_cache(cache); } static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - unpoison_range(global->beg, global->size); + kasan_unpoison(global->beg, global->size); - poison_range(global->beg + aligned_size, + kasan_poison(global->beg + aligned_size, global->size_with_redzone - aligned_size, KASAN_GLOBAL_REDZONE); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(__asan_unregister_globals); #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ - check_memory_region_inline(addr, size, false, _RET_IP_);\ + check_region_inline(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_load##size); \ __alias(__asan_load##size) \ @@ -239,7 +239,7 @@ EXPORT_SYMBOL(__asan_unregister_globals); EXPORT_SYMBOL(__asan_load##size##_noabort); \ void __asan_store##size(unsigned long addr) \ { \ - check_memory_region_inline(addr, size, true, _RET_IP_); \ + check_region_inline(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_store##size); \ __alias(__asan_store##size) \ @@ -254,7 +254,7 @@ DEFINE_ASAN_LOAD_STORE(16); void __asan_loadN(unsigned long addr, size_t size) { - check_memory_region(addr, size, false, _RET_IP_); + kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); @@ -264,7 +264,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort); void __asan_storeN(unsigned long addr, size_t size) { - check_memory_region(addr, size, true, _RET_IP_); + kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); @@ -290,11 +290,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - unpoison_range((const void *)(addr + rounded_down_size), - size - rounded_down_size); - poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + kasan_unpoison((const void *)(addr + rounded_down_size), + size - rounded_down_size); + kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, KASAN_ALLOCA_LEFT); - poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -305,7 +305,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - unpoison_range(stack_top, stack_bottom - stack_top); + kasan_unpoison(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8c706e7652f2..3810e75b8eea 
100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -195,14 +195,14 @@ static inline bool addr_has_metadata(const void *addr) } /** - * check_memory_region - Check memory region, and report if invalid access. + * kasan_check_range - Check memory region, and report if invalid access. * @addr: the accessed address * @size: the accessed size * @write: true if access is a write access * @ret_ip: return address * @return: true if access was valid, false if invalid */ -bool check_memory_region(unsigned long addr, size_t size, bool write, +bool kasan_check_range(unsigned long addr, size_t size, bool write, unsigned long ret_ip); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -215,19 +215,19 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -void print_tags(u8 addr_tag, const void *addr); +void kasan_print_tags(u8 addr_tag, const void *addr); #else -static inline void print_tags(u8 addr_tag, const void *addr) { } +static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *find_first_bad_addr(void *addr, size_t size); -const char *get_bug_type(struct kasan_access_info *info); -void metadata_fetch_row(char *buffer, void *row); +void *kasan_find_first_bad_addr(void *addr, size_t size); +const char *kasan_get_bug_type(struct kasan_access_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); #if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK -void print_address_stack_frame(const void *addr); +void kasan_print_address_stack_frame(const void *addr); #else -static inline void print_address_stack_frame(const void *addr) { } +static inline void kasan_print_address_stack_frame(const void *addr) { } #endif bool kasan_report(unsigned long addr, size_t size, @@ -244,13 +244,13 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) -bool quarantine_put(struct kmem_cache *cache, void *object); -void quarantine_reduce(void); -void quarantine_remove_cache(struct kmem_cache *cache); +bool kasan_quarantine_put(struct kmem_cache *cache, void *object); +void kasan_quarantine_reduce(void); +void kasan_quarantine_remove_cache(struct kmem_cache *cache); #else -static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; } -static inline void quarantine_reduce(void) { } -static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; } +static inline void kasan_quarantine_reduce(void) { } +static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { } #endif #ifndef arch_kasan_set_tag @@ -293,28 +293,28 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #endif /* CONFIG_KASAN_HW_TAGS */ #ifdef CONFIG_KASAN_SW_TAGS -u8 random_tag(void); +u8 kasan_random_tag(void); #elif defined(CONFIG_KASAN_HW_TAGS) -static inline u8 random_tag(void) { return hw_get_random_tag(); } +static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); } #else -static inline u8 random_tag(void) { return 0; } +static inline u8 kasan_random_tag(void) { return 0; } #endif #ifdef CONFIG_KASAN_HW_TAGS -static inline void poison_range(const void *address, size_t size, u8 value) +static inline void kasan_poison(const void *address, size_t size, u8 value) { 
hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), value); } -static inline void unpoison_range(const void *address, size_t size) +static inline void kasan_unpoison(const void *address, size_t size) { hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } -static inline bool check_invalid_free(void *addr) +static inline bool kasan_check_invalid_free(void *addr) { u8 ptr_tag = get_tag(addr); u8 mem_tag = hw_get_mem_tag(addr); @@ -325,9 +325,9 @@ static inline bool check_invalid_free(void *addr) #else /* CONFIG_KASAN_HW_TAGS */ -void poison_range(const void *address, size_t size, u8 value); -void unpoison_range(const void *address, size_t size); -bool check_invalid_free(void *addr); +void kasan_poison(const void *address, size_t size, u8 value); +void kasan_unpoison(const void *address, size_t size); +bool kasan_check_invalid_free(void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 55783125a767..728fb24c5683 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -168,7 +168,7 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) qlist_init(q); } -bool quarantine_put(struct kmem_cache *cache, void *object) +bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { unsigned long flags; struct qlist_head *q; @@ -184,11 +184,11 @@ bool quarantine_put(struct kmem_cache *cache, void *object) /* * Note: irq must be disabled until after we move the batch to the - * global quarantine. Otherwise quarantine_remove_cache() can miss - * some objects belonging to the cache if they are in our local temp - * list. quarantine_remove_cache() executes on_each_cpu() at the - * beginning which ensures that it either sees the objects in per-cpu - * lists or in the global quarantine. + * global quarantine. Otherwise kasan_quarantine_remove_cache() can + * miss some objects belonging to the cache if they are in our local + * temp list. kasan_quarantine_remove_cache() executes on_each_cpu() + * at the beginning which ensures that it either sees the objects in + * per-cpu lists or in the global quarantine. */ local_irq_save(flags); @@ -222,7 +222,7 @@ bool quarantine_put(struct kmem_cache *cache, void *object) return true; } -void quarantine_reduce(void) +void kasan_quarantine_reduce(void) { size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; @@ -234,7 +234,7 @@ void quarantine_reduce(void) return; /* - * srcu critical section ensures that quarantine_remove_cache() + * srcu critical section ensures that kasan_quarantine_remove_cache() * will not miss objects belonging to the cache while they are in our * local to_free list. srcu is chosen because (1) it gives us private * grace period domain that does not interfere with anything else, @@ -309,15 +309,15 @@ static void per_cpu_remove_cache(void *arg) } /* Free all quarantined objects belonging to cache. */ -void quarantine_remove_cache(struct kmem_cache *cache) +void kasan_quarantine_remove_cache(struct kmem_cache *cache) { unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; /* * Must be careful to not miss any objects that are being moved from - * per-cpu list to the global quarantine in quarantine_put(), - * nor objects being freed in quarantine_reduce(). on_each_cpu() + * per-cpu list to the global quarantine in kasan_quarantine_put(), + * nor objects being freed in kasan_quarantine_reduce(). 
on_each_cpu() * achieves the first goal, while synchronize_srcu() achieves the * second. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c0fb21797550..e93d7973792e 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -61,7 +61,7 @@ __setup("kasan_multi_shot", kasan_set_multi_shot); static void print_error_description(struct kasan_access_info *info) { pr_err("BUG: KASAN: %s in %pS\n", - get_bug_type(info), (void *)info->ip); + kasan_get_bug_type(info), (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -247,7 +247,7 @@ static void print_address_description(void *addr, u8 tag) dump_page(page, "kasan: bad access detected"); } - print_address_stack_frame(addr); + kasan_print_address_stack_frame(addr); } static bool meta_row_is_guilty(const void *row, const void *addr) @@ -293,7 +293,7 @@ static void print_memory_metadata(const void *addr) * function, because generic functions may try to * access kasan mapping for the passed address. */ - metadata_fetch_row(&metadata[0], row); + kasan_metadata_fetch_row(&metadata[0], row); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1, @@ -350,7 +350,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - print_tags(tag, object); + kasan_print_tags(tag, object); pr_err("\n"); print_address_description(object, tag); pr_err("\n"); @@ -378,7 +378,8 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, info.access_addr = tagged_addr; if (addr_has_metadata(untagged_addr)) - info.first_bad_addr = find_first_bad_addr(tagged_addr, size); + info.first_bad_addr = + kasan_find_first_bad_addr(tagged_addr, size); else info.first_bad_addr = untagged_addr; info.access_size = size; @@ -389,7 +390,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, print_error_description(&info); if (addr_has_metadata(untagged_addr)) - print_tags(get_tag(tagged_addr), info.first_bad_addr); + kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); pr_err("\n"); if (addr_has_metadata(untagged_addr)) { diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 8a9c889872da..41f374585144 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -30,7 +30,7 @@ #include "kasan.h" #include "../slab.h" -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { void *p = addr; @@ -105,7 +105,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } -const char *get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_access_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -123,7 +123,7 @@ const char *get_bug_type(struct kasan_access_info *info) return get_wild_bug_type(info); } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } @@ -263,7 +263,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr, return true; } -void print_address_stack_frame(const void *addr) +void kasan_print_address_stack_frame(const void *addr) { unsigned long offset; const char *frame_descr; diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 57114f0e14d1..42b2168755d6 100644 --- 
a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,17 +15,17 @@ #include "kasan.h" -const char *get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_access_info *info) { return "invalid-access"; } -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { return kasan_reset_tag(addr); } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { int i; @@ -33,7 +33,7 @@ void metadata_fetch_row(char *buffer, void *row) buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); } -void print_tags(u8 addr_tag, const void *addr) +void kasan_print_tags(u8 addr_tag, const void *addr) { u8 memory_tag = hw_get_mem_tag((void *)addr); diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 1b026793ad57..3d20d3451d9e 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -29,7 +29,7 @@ #include "kasan.h" #include "../slab.h" -const char *get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_access_info *info) { #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY struct kasan_alloc_meta *alloc_meta; @@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info) return "invalid-access"; } -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); void *p = kasan_reset_tag(addr); @@ -83,12 +83,12 @@ void *find_first_bad_addr(void *addr, size_t size) return p; } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } -void print_tags(u8 addr_tag, const void *addr) +void kasan_print_tags(u8 addr_tag, const void *addr) { u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 7c2c08c55f32..38958eb0d653 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -27,20 +27,20 @@ bool __kasan_check_read(const volatile void *p, unsigned int size) { - return check_memory_region((unsigned long)p, size, false, _RET_IP_); + return kasan_check_range((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); bool __kasan_check_write(const volatile void *p, unsigned int size) { - return check_memory_region((unsigned long)p, size, true, _RET_IP_); + return kasan_check_range((unsigned long)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -50,8 +50,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) return NULL; return __memmove(dest, src, len); @@ -61,8 +61,8 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned 
long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); @@ -72,7 +72,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_GRANULE_SIZE. */ -void poison_range(const void *address, size_t size, u8 value) +void kasan_poison(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -90,7 +90,7 @@ void poison_range(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } -void unpoison_range(const void *address, size_t size) +void kasan_unpoison(const void *address, size_t size) { u8 tag = get_tag(address); @@ -101,7 +101,7 @@ void unpoison_range(const void *address, size_t size) */ address = kasan_reset_tag(address); - poison_range(address, size, tag); + kasan_poison(address, size, tag); if (size & KASAN_GRANULE_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -286,7 +286,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call unpoison_range: + * // and call kasan_unpoison: * STORE shadow(a), unpoison_val * ... * STORE shadow(a+99), unpoison_val x = LOAD p @@ -321,7 +321,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_GRANULE_SIZE); - poison_range(start, size, KASAN_VMALLOC_INVALID); + kasan_poison(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -329,7 +329,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - unpoison_range(start, size); + kasan_unpoison(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 5dcd830805b2..cc271fceb5d5 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -57,7 +57,7 @@ void __init kasan_init_sw_tags(void) * sequence has in fact positive effect, since interrupts that randomly skew * PRNG at unpredictable points do only good. 
*/ -u8 random_tag(void) +u8 kasan_random_tag(void) { u32 state = this_cpu_read(prng_state); @@ -67,7 +67,7 @@ u8 random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -bool check_memory_region(unsigned long addr, size_t size, bool write, +bool kasan_check_range(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { u8 tag; @@ -118,7 +118,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, return true; } -bool check_invalid_free(void *addr) +bool kasan_check_invalid_free(void *addr) { u8 tag = get_tag(addr); u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); @@ -130,12 +130,12 @@ bool check_invalid_free(void *addr) #define DEFINE_HWASAN_LOAD_STORE(size) \ void __hwasan_load##size##_noabort(unsigned long addr) \ { \ - check_memory_region(addr, size, false, _RET_IP_); \ + kasan_check_range(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ void __hwasan_store##size##_noabort(unsigned long addr) \ { \ - check_memory_region(addr, size, true, _RET_IP_); \ + kasan_check_range(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_store##size##_noabort) @@ -147,19 +147,19 @@ DEFINE_HWASAN_LOAD_STORE(16); void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) { - check_memory_region(addr, size, false, _RET_IP_); + kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__hwasan_loadN_noabort); void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) { - check_memory_region(addr, size, true, _RET_IP_); + kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - poison_range((void *)addr, size, tag); + kasan_poison((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 080a3d6cbd75..80e7b822a432 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -666,7 +666,7 @@ static void add_ignores(struct objtool_file *file) static const char *uaccess_safe_builtin[] = { /* KASAN */ "kasan_report", - "check_memory_region", + "kasan_check_range", /* KASAN out-of-line */ "__asan_loadN_noabort", "__asan_load1_noabort", From 4062c24598b160f929d93a4a6fbf50b0b0b13d11 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:09 -0800 Subject: [PATCH 087/172] kasan: clarify HW_TAGS impact on TBI Mention in the documentation that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI (Top Byte Ignore) being enabled. Also do a few minor documentation cleanups. 
Link: https://linux-review.googlesource.com/id/Iba2a6697e3c6304cb53f89ec61dedc77fa29e3ae Link: https://lkml.kernel.org/r/3b4ea6875bb14d312092ad14ac55cb456c83c08e.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index a248ac3941be..0ae0efe82e8e 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -147,15 +147,14 @@ negative values to distinguish between different kinds of inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h). In the report above the arrows point to the shadow byte 03, which means that -the accessed address is partially accessible. - -For tag-based KASAN this last report section shows the memory tags around the -accessed address (see `Implementation details`_ section). +the accessed address is partially accessible. For tag-based KASAN modes this +last report section shows the memory tags around the accessed address +(see the `Implementation details`_ section). Boot parameters ~~~~~~~~~~~~~~~ -Hardware tag-based KASAN mode (see the section about different mode below) is +Hardware tag-based KASAN mode (see the section about various modes below) is intended for use in production as a security mitigation. Therefore it supports boot parameters that allow to disable KASAN competely or otherwise control particular KASAN features. @@ -289,6 +288,13 @@ reserved to tag freed memory regions. Hardware tag-based KASAN currently only supports tagging of kmem_cache_alloc/kmalloc and page_alloc memory. +If the hardware doesn't support MTE (pre ARMv8.5), hardware tag-based KASAN +won't be enabled. In this case all boot parameters are ignored. + +Note, that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being +enabled. Even when kasan.mode=off is provided, or when the hardware doesn't +support MTE (but supports TBI). + What memory accesses are sanitised by KASAN? -------------------------------------------- From 0fd379253691e7bb7c0285a7b87525e1ff6e2fd2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:13 -0800 Subject: [PATCH 088/172] kasan: clean up comments in tests Clarify and update comments in KASAN tests. Link: https://linux-review.googlesource.com/id/I6c816c51fa1e0eb7aa3dead6bda1f339d2af46c8 Link: https://lkml.kernel.org/r/ba6db104d53ae0e3796f80ef395f6873c1c1282f.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 59 +++++++++++++++++++++++++---------------- lib/test_kasan_module.c | 5 ++-- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 2947274cc2d3..6f46e27c2af7 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -28,10 +28,9 @@ #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 
0 : KASAN_GRANULE_SIZE) /* - * We assign some test results to these globals to make sure the tests - * are not eliminated as dead code. + * Some tests use these global variables to store return values from function + * calls that could otherwise be eliminated by the compiler as dead code. */ - void *kasan_ptr_result; int kasan_int_result; @@ -39,14 +38,13 @@ static struct kunit_resource resource; static struct kunit_kasan_expectation fail_data; static bool multishot; +/* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the + * first detected bug and panic the kernel if panic_on_warn is enabled. + */ static int kasan_test_init(struct kunit *test) { - /* - * Temporarily enable multi-shot mode and set panic_on_warn=0. - * Otherwise, we'd only get a report for the first case. - */ multishot = kasan_save_enable_multi_shot(); - return 0; } @@ -56,12 +54,12 @@ static void kasan_test_exit(struct kunit *test) } /** - * KUNIT_EXPECT_KASAN_FAIL() - Causes a test failure when the expression does - * not cause a KASAN error. This uses a KUnit resource named "kasan_data." Do - * Do not use this name for a KUnit resource outside here. - * + * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a + * KASAN report; causes a test failure otherwise. This relies on a KUnit + * resource named "kasan_data". Do not use this name for KUnit resources + * outside of KASAN tests. */ -#define KUNIT_EXPECT_KASAN_FAIL(test, condition) do { \ +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ fail_data.report_expected = true; \ fail_data.report_found = false; \ kunit_add_named_resource(test, \ @@ -69,7 +67,7 @@ static void kasan_test_exit(struct kunit *test) NULL, \ &resource, \ "kasan_data", &fail_data); \ - condition; \ + expression; \ KUNIT_EXPECT_EQ(test, \ fail_data.report_expected, \ fail_data.report_found); \ @@ -121,7 +119,8 @@ static void kmalloc_pagealloc_oob_right(struct kunit *test) return; } - /* Allocate a chunk that does not fit into a SLUB cache to trigger + /* + * Allocate a chunk that does not fit into a SLUB cache to trigger * the page allocator fallback. */ ptr = kmalloc(size, GFP_KERNEL); @@ -168,7 +167,9 @@ static void kmalloc_large_oob_right(struct kunit *test) { char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE - 256; - /* Allocate a chunk that is large enough, but still fits into a slab + + /* + * Allocate a chunk that is large enough, but still fits into a slab * and does not trigger the page allocator fallback in SLUB. */ ptr = kmalloc(size, GFP_KERNEL); @@ -469,10 +470,13 @@ static void ksize_unpoisons_memory(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); real_size = ksize(ptr); - /* This access doesn't trigger an error. */ + + /* This access shouldn't trigger a KASAN report. */ ptr[size] = 'x'; - /* This one does. */ + + /* This one must. */ KUNIT_EXPECT_KASAN_FAIL(test, ptr[real_size] = 'y'); + kfree(ptr); } @@ -568,7 +572,7 @@ static void kmem_cache_invalid_free(struct kunit *test) return; } - /* Trigger invalid free, the object doesn't get freed */ + /* Trigger invalid free, the object doesn't get freed. */ KUNIT_EXPECT_KASAN_FAIL(test, kmem_cache_free(cache, p + 1)); /* @@ -585,7 +589,10 @@ static void kasan_memchr(struct kunit *test) char *ptr; size_t size = 24; - /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. 
+ */ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { kunit_info(test, "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); @@ -610,7 +617,10 @@ static void kasan_memcmp(struct kunit *test) size_t size = 24; int arr[9]; - /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { kunit_info(test, "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); @@ -634,7 +644,10 @@ static void kasan_strings(struct kunit *test) char *ptr; size_t size = 24; - /* See https://bugzilla.kernel.org/show_bug.cgi?id=206337 */ + /* + * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. + * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. + */ if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { kunit_info(test, "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); @@ -706,7 +719,7 @@ static void kasan_bitops_generic(struct kunit *test) } /* - * Allocate 1 more byte, which causes kzalloc to round up to 16-bytes; + * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes; * this way we do not actually corrupt other memory. */ bits = kzalloc(sizeof(*bits) + 1, GFP_KERNEL); diff --git a/lib/test_kasan_module.c b/lib/test_kasan_module.c index 3b4cc77992d2..eee017ff8980 100644 --- a/lib/test_kasan_module.c +++ b/lib/test_kasan_module.c @@ -123,8 +123,9 @@ static noinline void __init kasan_workqueue_uaf(void) static int __init test_kasan_module_init(void) { /* - * Temporarily enable multi-shot mode. Otherwise, we'd only get a - * report for the first case. + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. */ bool multishot = kasan_save_enable_multi_shot(); From da17e377723f50c7acd019e39cfeeca342415714 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:17 -0800 Subject: [PATCH 089/172] kasan: add macros to simplify checking test constraints Some KASAN tests require specific kernel configs to be enabled. Instead of copy-pasting the checks for these configs add a few helper macros and use them. 
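For illustration, a config constraint at the top of a test then collapses to one line; a minimal sketch using the macro added below (the test name is hypothetical):

	static void some_generic_only_test(struct kunit *test)
	{
		/* Skips with a kunit_info() message unless generic KASAN is enabled. */
		KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC);

		/* The test body only runs when the constraint is satisfied. */
	}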
Link: https://linux-review.googlesource.com/id/I237484a7fddfedf4a4aae9cc61ecbcdbe85a0a63 Link: https://lkml.kernel.org/r/6a0fcdb9676b7e869cfc415893ede12d916c246c.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Suggested-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 101 +++++++++++++++-------------------------------- 1 file changed, 31 insertions(+), 70 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 6f46e27c2af7..714ea27fcc3e 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -73,6 +73,20 @@ static void kasan_test_exit(struct kunit *test) fail_data.report_found); \ } while (0) +#define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ + if (!IS_ENABLED(config)) { \ + kunit_info((test), "skipping, " #config " required"); \ + return; \ + } \ +} while (0) + +#define KASAN_TEST_NEEDS_CONFIG_OFF(test, config) do { \ + if (IS_ENABLED(config)) { \ + kunit_info((test), "skipping, " #config " enabled"); \ + return; \ + } \ +} while (0) + static void kmalloc_oob_right(struct kunit *test) { char *ptr; @@ -114,10 +128,7 @@ static void kmalloc_pagealloc_oob_right(struct kunit *test) char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; - if (!IS_ENABLED(CONFIG_SLUB)) { - kunit_info(test, "CONFIG_SLUB is not enabled."); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); /* * Allocate a chunk that does not fit into a SLUB cache to trigger @@ -135,10 +146,7 @@ static void kmalloc_pagealloc_uaf(struct kunit *test) char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; - if (!IS_ENABLED(CONFIG_SLUB)) { - kunit_info(test, "CONFIG_SLUB is not enabled."); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -152,10 +160,7 @@ static void kmalloc_pagealloc_invalid_free(struct kunit *test) char *ptr; size_t size = KMALLOC_MAX_CACHE_SIZE + 10; - if (!IS_ENABLED(CONFIG_SLUB)) { - kunit_info(test, "CONFIG_SLUB is not enabled."); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -218,10 +223,7 @@ static void kmalloc_oob_16(struct kunit *test) } *ptr1, *ptr2; /* This test is specifically crafted for the generic mode. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_GENERIC required\n"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -454,10 +456,7 @@ static void kasan_global_oob(struct kunit *test) char *p = &global_array[ARRAY_SIZE(global_array) + i]; /* Only generic mode instruments globals. 
*/ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_GENERIC required"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } @@ -486,10 +485,7 @@ static void kasan_stack_oob(struct kunit *test) volatile int i = OOB_TAG_OFF; char *p = &stack_array[ARRAY_SIZE(stack_array) + i]; - if (!IS_ENABLED(CONFIG_KASAN_STACK)) { - kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } @@ -501,15 +497,8 @@ static void kasan_alloca_oob_left(struct kunit *test) char *p = alloca_array - 1; /* Only generic mode instruments dynamic allocas. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_GENERIC required"); - return; - } - - if (!IS_ENABLED(CONFIG_KASAN_STACK)) { - kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } @@ -521,15 +510,8 @@ static void kasan_alloca_oob_right(struct kunit *test) char *p = alloca_array + i; /* Only generic mode instruments dynamic allocas. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_GENERIC required"); - return; - } - - if (!IS_ENABLED(CONFIG_KASAN_STACK)) { - kunit_info(test, "CONFIG_KASAN_STACK is not enabled"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_STACK); KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } @@ -593,11 +575,7 @@ static void kasan_memchr(struct kunit *test) * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - kunit_info(test, - "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); - return; - } + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); if (OOB_TAG_OFF) size = round_up(size, OOB_TAG_OFF); @@ -621,11 +599,7 @@ static void kasan_memcmp(struct kunit *test) * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - kunit_info(test, - "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); - return; - } + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); if (OOB_TAG_OFF) size = round_up(size, OOB_TAG_OFF); @@ -648,11 +622,7 @@ static void kasan_strings(struct kunit *test) * str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT. * See https://bugzilla.kernel.org/show_bug.cgi?id=206337 for details. */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { - kunit_info(test, - "str* functions are not instrumented with CONFIG_AMD_MEM_ENCRYPT"); - return; - } + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_AMD_MEM_ENCRYPT); ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); @@ -713,10 +683,7 @@ static void kasan_bitops_generic(struct kunit *test) long *bits; /* This test is specifically crafted for the generic mode. 
*/ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_GENERIC required\n"); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_GENERIC); /* * Allocate 1 more byte, which causes kzalloc to round up to 16 bytes; @@ -744,11 +711,8 @@ static void kasan_bitops_tags(struct kunit *test) { long *bits; - /* This test is specifically crafted for the tag-based mode. */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC)) { - kunit_info(test, "CONFIG_KASAN_SW_TAGS required\n"); - return; - } + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); /* Allocation size will be rounded to up granule size, which is 16. */ bits = kzalloc(sizeof(*bits), GFP_KERNEL); @@ -777,10 +741,7 @@ static void vmalloc_oob(struct kunit *test) { void *area; - if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - kunit_info(test, "CONFIG_KASAN_VMALLOC is not enabled."); - return; - } + KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_VMALLOC); /* * We have to be careful not to hit the guard page. From 573a48092313dec7b254d9dbcc2db62167f00456 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:21 -0800 Subject: [PATCH 090/172] kasan: add match-all tag tests Add 3 new tests for tag-based KASAN modes: 1. Check that the match-all pointer tag is not assigned randomly. 2. Check that 0xff works as a match-all pointer tag. 3. Check that there are no match-all memory tags. Note that test #3 causes a significant number (255) of KASAN reports to be printed during execution for the SW_TAGS mode. [arnd@arndb.de: export kasan_poison] Link: https://lkml.kernel.org/r/20210125112831.2156212-1-arnd@kernel.org [akpm@linux-foundation.org: s/EXPORT_SYMBOL_GPL/EXPORT_SYMBOL/, per Andrey] Link: https://linux-review.googlesource.com/id/I78f1375efafa162b37f3abcb2c5bc2f3955dfd8e Link: https://lkml.kernel.org/r/da841a5408e2204bf25f3b23f70540a65844e8a4.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Signed-off-by: Arnd Bergmann Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 6 ++++ mm/kasan/shadow.c | 1 + 3 files changed, 99 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 714ea27fcc3e..c344fe506ffc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -754,6 +755,94 @@ static void vmalloc_oob(struct kunit *test) vfree(area); } +/* + * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, + * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based + * modes.
+ */ +static void match_all_not_assigned(struct kunit *test) +{ + char *ptr; + struct page *pages; + int i, size, order; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + for (i = 0; i < 256; i++) { + size = (get_random_int() % 1024) + 1; + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + kfree(ptr); + } + + for (i = 0; i < 256; i++) { + order = (get_random_int() % 4) + 1; + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); + KUNIT_EXPECT_LT(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + free_pages((unsigned long)ptr, order); + } +} + +/* Check that 0xff works as a match-all pointer tag for tag-based modes. */ +static void match_all_ptr_tag(struct kunit *test) +{ + char *ptr; + u8 tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + /* Backup the assigned tag. */ + tag = get_tag(ptr); + KUNIT_EXPECT_NE(test, tag, (u8)KASAN_TAG_KERNEL); + + /* Reset the tag to 0xff.*/ + ptr = set_tag(ptr, KASAN_TAG_KERNEL); + + /* This access shouldn't trigger a KASAN report. */ + *ptr = 0; + + /* Recover the pointer tag and free. */ + ptr = set_tag(ptr, tag); + kfree(ptr); +} + +/* Check that there are no match-all memory tags for tag-based modes. */ +static void match_all_mem_tag(struct kunit *test) +{ + char *ptr; + int tag; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr = kmalloc(128, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + KUNIT_EXPECT_NE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_KERNEL); + + /* For each possible tag value not matching the pointer tag. */ + for (tag = KASAN_TAG_MIN; tag <= KASAN_TAG_KERNEL; tag++) { + if (tag == get_tag(ptr)) + continue; + + /* Mark the first memory granule with the chosen memory tag. */ + kasan_poison(ptr, KASAN_GRANULE_SIZE, (u8)tag); + + /* This access must cause a KASAN report. */ + KUNIT_EXPECT_KASAN_FAIL(test, *ptr = 0); + } + + /* Recover the memory tag and free. 
*/ + kasan_poison(ptr, KASAN_GRANULE_SIZE, get_tag(ptr)); + kfree(ptr); +} + static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_right), KUNIT_CASE(kmalloc_oob_left), @@ -793,6 +882,9 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(match_all_not_assigned), + KUNIT_CASE(match_all_ptr_tag), + KUNIT_CASE(match_all_mem_tag), {} }; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3810e75b8eea..12932186a7f9 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -36,6 +36,12 @@ extern bool kasan_flag_panic __ro_after_init; #define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ #define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ +#ifdef CONFIG_KASAN_HW_TAGS +#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ +#else +#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ +#endif + #ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 38958eb0d653..80adc85d0393 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -89,6 +89,7 @@ void kasan_poison(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } +EXPORT_SYMBOL(kasan_poison); void kasan_unpoison(const void *address, size_t size) { From f05842cfb9ae25b5e78c618429c4716d9e4d5fc8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:26 -0800 Subject: [PATCH 091/172] kasan, arm64: allow using KUnit tests with HW_TAGS mode On a high level, this patch allows running KUnit KASAN tests with the hardware tag-based KASAN mode. Internally, this change reenables tag checking at the end of each KASAN test that triggers a tag fault and leads to tag checking being disabled. Also simplify the is_write calculation in report_tag_fault. With this patch, KASAN tests are still failing for the hardware tag-based mode; fixes come in the next few patches.
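In outline, the resulting interaction between the tests and MTE looks like this sketch (condensed from the hunks below, not a verbatim copy):

	/* In the KUnit init hook: tell the fault handler to report every fault. */
	kasan_set_tagging_report_once(false);

	/* After each expression that was expected to fault: */
	if (fail_data.report_found)
		kasan_enable_tagging();	/* the fault auto-disabled tag checking */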
[andreyknvl@google.com: export HW_TAGS symbols for KUnit tests] Link: https://lkml.kernel.org/r/e7eeb252da408b08f0c81b950a55fb852f92000b.1613155970.git.andreyknvl@google.com Link: https://linux-review.googlesource.com/id/Id94dc9eccd33b23cda4950be408c27f879e474c8 Link: https://lkml.kernel.org/r/51b23112cf3fd62b8f8e9df81026fa2b15870501.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Catalin Marinas Reviewed-by: Vincenzo Frascino Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Marco Elver Cc: Peter Collingbourne Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/memory.h | 1 + arch/arm64/include/asm/mte-kasan.h | 12 +++++++++ arch/arm64/kernel/mte.c | 12 +++++++++ arch/arm64/mm/fault.c | 20 +++++++++----- lib/Kconfig.kasan | 4 +-- lib/test_kasan.c | 42 +++++++++++++++++++++--------- mm/kasan/hw_tags.c | 16 ++++++++++++ mm/kasan/kasan.h | 21 +++++++++++++++ 8 files changed, 107 insertions(+), 21 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index bc09af26c1b8..c759faf7a1ff 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -244,6 +244,7 @@ static inline const void *__tag_set(const void *addr, u8 tag) #ifdef CONFIG_KASAN_HW_TAGS #define arch_enable_tagging() mte_enable_kernel() +#define arch_set_tagging_report_once(state) mte_set_report_once(state) #define arch_init_tags(max_tag) mte_init_tags(max_tag) #define arch_get_random_tag() mte_get_random_tag() #define arch_get_mem_tag(addr) mte_get_mem_tag(addr) diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 26349a4b5e2e..3748d5bb88c0 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -32,6 +32,9 @@ void *mte_set_mem_tag_range(void *addr, size_t size, u8 tag); void mte_enable_kernel(void); void mte_init_tags(u64 max_tag); +void mte_set_report_once(bool state); +bool mte_report_once(void); + #else /* CONFIG_ARM64_MTE */ static inline u8 mte_get_ptr_tag(void *ptr) @@ -60,6 +63,15 @@ static inline void mte_init_tags(u64 max_tag) { } +static inline void mte_set_report_once(bool state) +{ +} + +static inline bool mte_report_once(void) +{ + return false; +} + #endif /* CONFIG_ARM64_MTE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 80b62fe49dcf..2cfc850809ce 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -25,6 +25,8 @@ u64 gcr_kernel_excl __ro_after_init; +static bool report_fault_once = true; + static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap) { pte_t old_pte = READ_ONCE(*ptep); @@ -158,6 +160,16 @@ void mte_enable_kernel(void) isb(); } +void mte_set_report_once(bool state) +{ + WRITE_ONCE(report_fault_once, state); +} + +bool mte_report_once(void) +{ + return READ_ONCE(report_fault_once); +} + static void update_sctlr_el1_tcf0(u64 tcf0) { /* ISB required for the kernel uaccess routines */ diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2e339f0bd958..dc9f96442edc 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -302,12 +302,24 @@ static void die_kernel_fault(const char *msg, unsigned long addr, static void report_tag_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - bool is_write = ((esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT) != 0; + static bool reported; + bool is_write; + + if 
(READ_ONCE(reported)) + return; + + /* + * This is used for KASAN tests and assumes that no MTE faults + * happened before running the tests. + */ + if (mte_report_once()) + WRITE_ONCE(reported, true); /* * SAS bits aren't set for all faults reported in EL1, so we can't * find out access size. */ + is_write = !!(esr & ESR_ELx_WNR); kasan_report(addr, 0, is_write, regs->pc); } #else @@ -319,12 +331,8 @@ static inline void report_tag_fault(unsigned long addr, unsigned int esr, static void do_tag_recovery(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - static bool reported; - if (!READ_ONCE(reported)) { - report_tag_fault(addr, esr, regs); - WRITE_ONCE(reported, true); - } + report_tag_fault(addr, esr, regs); /* * Disable MTE Tag Checking on the local CPU for the current EL. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f5fa4ba126bf..3091432acb0a 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -190,11 +190,11 @@ config KASAN_KUNIT_TEST kernel debugging features like KASAN. For more information on KUnit and unit tests in general, please refer - to the KUnit documentation in Documentation/dev-tools/kunit + to the KUnit documentation in Documentation/dev-tools/kunit. config TEST_KASAN_MODULE tristate "KUnit-incompatible tests of KASAN bug detection capabilities" - depends on m && KASAN + depends on m && KASAN && !KASAN_HW_TAGS help This is a part of the KASAN test suite that is incompatible with KUnit. Currently includes tests that do bad copy_from/to_user diff --git a/lib/test_kasan.c b/lib/test_kasan.c index c344fe506ffc..502709db41c0 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -41,16 +41,20 @@ static bool multishot; /* * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the - * first detected bug and panic the kernel if panic_on_warn is enabled. + * first detected bug and panic the kernel if panic_on_warn is enabled. For + * hardware tag-based KASAN also allow tag checking to be reenabled for each + * test, see the comment for KUNIT_EXPECT_KASAN_FAIL(). */ static int kasan_test_init(struct kunit *test) { multishot = kasan_save_enable_multi_shot(); + kasan_set_tagging_report_once(false); return 0; } static void kasan_test_exit(struct kunit *test) { + kasan_set_tagging_report_once(true); kasan_restore_multi_shot(multishot); } @@ -59,19 +63,31 @@ static void kasan_test_exit(struct kunit *test) * KASAN report; causes a test failure otherwise. This relies on a KUnit * resource named "kasan_data". Do not use this name for KUnit resources * outside of KASAN tests. + * + * For hardware tag-based KASAN, when a tag fault happens, tag checking is + * normally auto-disabled. When this happens, this test handler reenables + * tag checking. As tag checking can be only disabled or enabled per CPU, this + * handler disables migration (preemption). 
*/ -#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ - fail_data.report_expected = true; \ - fail_data.report_found = false; \ - kunit_add_named_resource(test, \ - NULL, \ - NULL, \ - &resource, \ - "kasan_data", &fail_data); \ - expression; \ - KUNIT_EXPECT_EQ(test, \ - fail_data.report_expected, \ - fail_data.report_found); \ +#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \ + migrate_disable(); \ + fail_data.report_expected = true; \ + fail_data.report_found = false; \ + kunit_add_named_resource(test, \ + NULL, \ + NULL, \ + &resource, \ + "kasan_data", &fail_data); \ + expression; \ + KUNIT_EXPECT_EQ(test, \ + fail_data.report_expected, \ + fail_data.report_found); \ + if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ + if (fail_data.report_found) \ + kasan_enable_tagging(); \ + migrate_enable(); \ + } \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index d558799b25b3..b31aeef505dd 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -185,3 +185,19 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[0]; } + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_set_tagging_report_once(bool state) +{ + hw_set_tagging_report_once(state); +} +EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); + +void kasan_enable_tagging(void) +{ + hw_enable_tagging(); +} +EXPORT_SYMBOL_GPL(kasan_enable_tagging); + +#endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 12932186a7f9..1298b79f9518 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -280,6 +280,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_init_tags #define arch_init_tags(max_tag) #endif +#ifndef arch_set_tagging_report_once +#define arch_set_tagging_report_once(state) +#endif #ifndef arch_get_random_tag #define arch_get_random_tag() (0xFF) #endif @@ -292,12 +295,30 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging() arch_enable_tagging() #define hw_init_tags(max_tag) arch_init_tags(max_tag) +#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#else /* CONFIG_KASAN_HW_TAGS */ + +#define hw_enable_tagging() +#define hw_set_tagging_report_once(state) + #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_set_tagging_report_once(bool state); +void kasan_enable_tagging(void); + +#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_set_tagging_report_once(bool state) { } +static inline void kasan_enable_tagging(void) { } + +#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ + #ifdef CONFIG_KASAN_SW_TAGS u8 kasan_random_tag(void); #elif defined(CONFIG_KASAN_HW_TAGS) From 5d92bdffd2d53f98de683229c0ad7d028703fdba Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:29 -0800 Subject: [PATCH 092/172] kasan: rename CONFIG_TEST_KASAN_MODULE Rename CONFIG_TEST_KASAN_MODULE to CONFIG_KASAN_MODULE_TEST. This naming is more consistent with the existing CONFIG_KASAN_KUNIT_TEST. 
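For reference, a config fragment enabling both suites could look like the following sketch (CONFIG_KASAN_MODULE_TEST depends on a modular build, hence the m; the KUnit suite also needs CONFIG_KUNIT):

	CONFIG_KUNIT=y
	CONFIG_KASAN=y
	CONFIG_KASAN_KUNIT_TEST=y
	CONFIG_KASAN_MODULE_TEST=m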
Link: https://linux-review.googlesource.com/id/Id347dfa5fe8788b7a1a189863e039f409da0ae5f Link: https://lkml.kernel.org/r/f08250246683981bcf8a094fbba7c361995624d2.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/dev-tools/kasan.rst | 8 ++++---- lib/Kconfig.kasan | 2 +- lib/Makefile | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 0ae0efe82e8e..cde14aeefca7 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -358,17 +358,17 @@ unmapped. This will require changes in arch-specific code. This allows ``VMAP_STACK`` support on x86, and can simplify support of architectures that do not have a fixed module region. -CONFIG_KASAN_KUNIT_TEST & CONFIG_TEST_KASAN_MODULE --------------------------------------------------- +CONFIG_KASAN_KUNIT_TEST and CONFIG_KASAN_MODULE_TEST +---------------------------------------------------- -KASAN tests consist on two parts: +KASAN tests consist of two parts: 1. Tests that are integrated with the KUnit Test Framework. Enabled with ``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified automatically in a few different ways, see the instructions below. 2. Tests that are currently incompatible with KUnit. Enabled with -``CONFIG_TEST_KASAN_MODULE`` and can only be run as a module. These tests can +``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can only be verified manually, by loading the kernel module and inspecting the kernel log for KASAN reports. diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 3091432acb0a..624ae1df7984 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -192,7 +192,7 @@ config KASAN_KUNIT_TEST For more information on KUnit and unit tests in general, please refer to the KUnit documentation in Documentation/dev-tools/kunit. -config TEST_KASAN_MODULE +config KASAN_MODULE_TEST tristate "KUnit-incompatible tests of KASAN bug detection capabilities" depends on m && KASAN && !KASAN_HW_TAGS help diff --git a/lib/Makefile b/lib/Makefile index fb7d946bb8c3..b5307d3eec1a 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -66,7 +66,7 @@ obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) -obj-$(CONFIG_TEST_KASAN_MODULE) += test_kasan_module.o +obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) From 2e4bde6a1e3a3feb8511685b8c97be668728eefb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:34 -0800 Subject: [PATCH 093/172] kasan: add compiler barriers to KUNIT_EXPECT_KASAN_FAIL It might not be obvious to the compiler that the expression must be executed between writing and reading to fail_data. In this case, the compiler might reorder or optimize away some of the accesses, and the tests will fail. Add compiler barriers around the expression in KUNIT_EXPECT_KASAN_FAIL and use READ/WRITE_ONCE() for accessing fail_data fields. 
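A minimal sketch of the hazard, with a hypothetical pointer name rather than the real macro body: without the barriers, the compiler may move the flag accesses across the faulting expression, and the comparison can then observe a stale value.

	WRITE_ONCE(fail_data.report_found, false);
	barrier();			/* keep the store before the access */
	*(volatile char *)ptr = 0;	/* expression expected to fault */
	barrier();			/* keep the check after the access */
	KUNIT_EXPECT_EQ(test, true, READ_ONCE(fail_data.report_found));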
Link: https://linux-review.googlesource.com/id/I046079f48641a1d36fe627fc8827a9249102fd50 Link: https://lkml.kernel.org/r/6f11596f367d8ae8f71d800351e9a5d91eda19f6.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 17 ++++++++++++----- mm/kasan/report.c | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 502709db41c0..603fd7937b94 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -68,23 +68,30 @@ static void kasan_test_exit(struct kunit *test) * normally auto-disabled. When this happens, this test handler reenables * tag checking. As tag checking can be only disabled or enabled per CPU, this * handler disables migration (preemption). + * + * Since the compiler doesn't see that the expression can change the fail_data + * fields, it can reorder or optimize away the accesses to those fields. + * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the + * expression to prevent that. */ #define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) \ migrate_disable(); \ - fail_data.report_expected = true; \ - fail_data.report_found = false; \ + WRITE_ONCE(fail_data.report_expected, true); \ + WRITE_ONCE(fail_data.report_found, false); \ kunit_add_named_resource(test, \ NULL, \ NULL, \ &resource, \ "kasan_data", &fail_data); \ + barrier(); \ expression; \ + barrier(); \ KUNIT_EXPECT_EQ(test, \ - fail_data.report_expected, \ - fail_data.report_found); \ + READ_ONCE(fail_data.report_expected), \ + READ_ONCE(fail_data.report_found)); \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS)) { \ - if (fail_data.report_found) \ + if (READ_ONCE(fail_data.report_found)) \ kasan_enable_tagging(); \ migrate_enable(); \ } \ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index e93d7973792e..234f35a84f19 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -331,7 +331,7 @@ static void kasan_update_kunit_status(struct kunit *cur_test) } kasan_data = (struct kunit_kasan_expectation *)resource->data; - kasan_data->report_found = true; + WRITE_ONCE(kasan_data->report_found, true); kunit_put_resource(resource); } #endif /* IS_ENABLED(CONFIG_KUNIT) */ From 1b1df4c4e2576f6b9c5b1f5f1fc9435e3f6c6b47 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:38 -0800 Subject: [PATCH 094/172] kasan: adapt kmalloc_uaf2 test to HW_TAGS mode In the kmalloc_uaf2() test, the pointers to the two allocated memory blocks might happen to be the same, and the test will fail. With the software tag-based mode, the probability of that is 1/254, so it's hard to observe the failure. For the hardware tag-based mode though, the probability is 1/14, which is quite noticeable. Allow up to 16 attempts at generating different tags for the tag-based modes.
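Back of the envelope, assuming tags are assigned independently and uniformly over the 14 usable hardware tags:

	P(one attempt collides)		~= 1/14
	P(all 16 attempts collide)	~= (1/14)^16 ~= 5e-19

so the retry loop effectively never exhausts its attempts.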
Link: https://linux-review.googlesource.com/id/Ibfa458ef2804ff465d8eb07434a300bf36388d55 Link: https://lkml.kernel.org/r/9cd5cf2f633dcbf55cab801cd26845d2b075cec7.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 603fd7937b94..9a227d7e06d6 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -382,7 +382,9 @@ static void kmalloc_uaf2(struct kunit *test) { char *ptr1, *ptr2; size_t size = 43; + int counter = 0; +again: ptr1 = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); @@ -391,6 +393,15 @@ static void kmalloc_uaf2(struct kunit *test) ptr2 = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* + * For tag-based KASAN ptr1 and ptr2 tags might happen to be the same. + * Allow up to 16 attempts at generating different tags. + */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && ptr1 == ptr2 && counter++ < 16) { + kfree(ptr2); + goto again; + } + KUNIT_EXPECT_KASAN_FAIL(test, ptr1[40] = 'x'); KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); From e66e1799a76621003e5b04c9c057826a2152e103 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:42 -0800 Subject: [PATCH 095/172] kasan: fix memory corruption in kasan_bitops_tags test Since the hardware tag-based KASAN mode might not have a redzone that comes after an allocated object (when kasan.mode=prod is enabled), the kasan_bitops_tags() test ends up corrupting the next object in memory. Change the test so it always accesses the redzone that lies within the allocated object's boundaries. Link: https://linux-review.googlesource.com/id/I67f51d1ee48f0a8d0fe2658c2a39e4879fe0832a Link: https://lkml.kernel.org/r/7d452ce4ae35bb1988d2c9244dfea56cf2cc9315.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 9a227d7e06d6..e59f185b8075 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -749,13 +749,13 @@ static void kasan_bitops_tags(struct kunit *test) /* This test is specifically crafted for tag-based modes. */ KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); - /* Allocation size will be rounded to up granule size, which is 16. */ - bits = kzalloc(sizeof(*bits), GFP_KERNEL); + /* kmalloc-64 cache will be used and the last 16 bytes will be the redzone. */ + bits = kzalloc(48, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bits); - /* Do the accesses past the 16 allocated bytes. */ - kasan_bitops_modify(test, BITS_PER_LONG, &bits[1]); - kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, &bits[1]); + /* Do the accesses past the 48 allocated bytes, but within the redzone.
*/ + kasan_bitops_modify(test, BITS_PER_LONG, (void *)bits + 48); + kasan_bitops_test_and_modify(test, BITS_PER_LONG + BITS_PER_BYTE, (void *)bits + 48); kfree(bits); } From 027b37b552f326aa94ef06c7ea77088b16c41e6e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:46 -0800 Subject: [PATCH 096/172] kasan: move _RET_IP_ to inline wrappers Generic mm functions that call KASAN annotations that might report a bug pass _RET_IP_ to them as an argument. This allows KASAN to include the name of the function that called the mm function in its report's header. Now that KASAN has inline wrappers for all of its annotations, move _RET_IP_ to those wrappers to simplify annotation call sites. Link: https://linux-review.googlesource.com/id/I8fb3c06d49671305ee184175a39591bc26647a67 Link: https://lkml.kernel.org/r/5c1490eddf20b436b8c4eeea83fce47687d5e4a4.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 20 +++++++++----------- mm/mempool.c | 2 +- mm/slab.c | 2 +- mm/slub.c | 4 ++-- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 0aea9e2a2a01..a7254186558a 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -185,19 +185,18 @@ static __always_inline void * __must_check kasan_init_slab_obj( } bool __kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip); -static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) +static __always_inline bool kasan_slab_free(struct kmem_cache *s, void *object) { if (kasan_enabled()) - return __kasan_slab_free(s, object, ip); + return __kasan_slab_free(s, object, _RET_IP_); return false; } void __kasan_slab_free_mempool(void *ptr, unsigned long ip); -static __always_inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) +static __always_inline void kasan_slab_free_mempool(void *ptr) { if (kasan_enabled()) - __kasan_slab_free_mempool(ptr, ip); + __kasan_slab_free_mempool(ptr, _RET_IP_); } void * __must_check __kasan_slab_alloc(struct kmem_cache *s, @@ -241,10 +240,10 @@ static __always_inline void * __must_check kasan_krealloc(const void *object, } void __kasan_kfree_large(void *ptr, unsigned long ip); -static __always_inline void kasan_kfree_large(void *ptr, unsigned long ip) +static __always_inline void kasan_kfree_large(void *ptr) { if (kasan_enabled()) - __kasan_kfree_large(ptr, ip); + __kasan_kfree_large(ptr, _RET_IP_); } bool kasan_save_enable_multi_shot(void); @@ -277,12 +276,11 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache, { return (void *)object; } -static inline bool kasan_slab_free(struct kmem_cache *s, void *object, - unsigned long ip) +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) { return false; } -static inline void kasan_slab_free_mempool(void *ptr, unsigned long ip) {} +static inline void kasan_slab_free_mempool(void *ptr) {} static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { @@ -302,7 +300,7 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, { return (void *)object; } -static inline void kasan_kfree_large(void *ptr, unsigned long ip) {} +static 
inline void kasan_kfree_large(void *ptr) {} #endif /* CONFIG_KASAN */ diff --git a/mm/mempool.c b/mm/mempool.c index 624ed51b060f..79959fac27d7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_slab_free_mempool(element, _RET_IP_); + kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/slab.c b/mm/slab.c index 9c9e0c255c4b..35c68d99d460 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3420,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, memset(objp, 0, cachep->object_size); /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp, _RET_IP_)) + if (kasan_slab_free(cachep, objp)) return; /* Use KCSAN to help debug racy use-after-free. */ diff --git a/mm/slub.c b/mm/slub.c index 046d7194acaf..b2833ce85c92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1528,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x, _RET_IP_); + kasan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) @@ -1558,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x, _RET_IP_); + return kasan_slab_free(s, x); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, From 611806b4bf8dd97a4f3d73f5cf3c2c7730c51eb2 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:50 -0800 Subject: [PATCH 097/172] kasan: fix bug detection via ksize for HW_TAGS mode The currently existing kasan_check_read/write() annotations are intended to be used for kernel modules that have KASAN compiler instrumentation disabled. Thus, they are only relevant for the software KASAN modes that rely on compiler instrumentation. However there's another use case for these annotations: ksize() checks that the object passed to it is indeed accessible before unpoisoning the whole object. This is currently done via __kasan_check_read(), which is compiled away for the hardware tag-based mode that doesn't rely on compiler instrumentation. This leads to KASAN missing some memory corruptions. Provide another annotation called kasan_check_byte() that is available for all KASAN modes. For the implementation, rename and reuse kasan_check_invalid_free(). Use this new annotation in ksize(). To avoid having ksize() as the top frame in the reported stack trace, pass _RET_IP_ to __kasan_check_byte(). Also add a new ksize_uaf() test that checks that a use-after-free is detected via ksize() itself, and via plain accesses that happen later.
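At the call site this results in the following pattern, shown here as a condensed view of the mm/slab_common.c hunk below (ksize() still unpoisons the whole object afterwards):

	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
		return 0;	/* invalid object: a KASAN report was printed */
	size = __ksize(objp);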
Link: https://linux-review.googlesource.com/id/Iaabf771881d0f9ce1b969f2a62938e99d3308ec5 Link: https://lkml.kernel.org/r/f32ad74a60b28d8402482a38476f02bb7600f620.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan-checks.h | 6 ++++++ include/linux/kasan.h | 17 +++++++++++++++++ lib/test_kasan.c | 20 ++++++++++++++++++++ mm/kasan/common.c | 11 ++++++++++- mm/kasan/generic.c | 4 ++-- mm/kasan/kasan.h | 10 +++++----- mm/kasan/sw_tags.c | 6 +++--- mm/slab_common.c | 16 +++++++++------- 8 files changed, 72 insertions(+), 18 deletions(-) diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index ca5e89fb10d3..3d6d22a25bdc 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h @@ -4,6 +4,12 @@ #include +/* + * The annotations present in this file are only relevant for the software + * KASAN modes that rely on compiler instrumentation, and will be optimized + * away for the hardware tag-based KASAN mode. Use kasan_check_byte() instead. + */ + /* * __kasan_check_*: Always available when KASAN is enabled. This may be used * even in compilation units that selectively disable KASAN, but must use KASAN diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a7254186558a..7eaf2d9effb4 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -246,6 +246,19 @@ static __always_inline void kasan_kfree_large(void *ptr) __kasan_kfree_large(ptr, _RET_IP_); } +/* + * Unlike kasan_check_read/write(), kasan_check_byte() is performed even for + * the hardware tag-based mode that doesn't rely on compiler instrumentation. + */ +bool __kasan_check_byte(const void *addr, unsigned long ip); +static __always_inline bool kasan_check_byte(const void *addr) +{ + if (kasan_enabled()) + return __kasan_check_byte(addr, _RET_IP_); + return true; +} + + bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); @@ -301,6 +314,10 @@ static inline void *kasan_krealloc(const void *object, size_t new_size, return (void *)object; } static inline void kasan_kfree_large(void *ptr) {} +static inline bool kasan_check_byte(const void *address) +{ + return true; +} #endif /* CONFIG_KASAN */ diff --git a/lib/test_kasan.c b/lib/test_kasan.c index e59f185b8075..3f771fabd0ec 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -496,6 +496,7 @@ static void kasan_global_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } +/* Check that ksize() makes the whole object accessible. */ static void ksize_unpoisons_memory(struct kunit *test) { char *ptr; @@ -514,6 +515,24 @@ static void ksize_unpoisons_memory(struct kunit *test) kfree(ptr); } +/* + * Check that a use-after-free is detected by ksize() and via normal accesses + * after it. 
+ */ +static void ksize_uaf(struct kunit *test) +{ + char *ptr; + int size = 128 - KASAN_GRANULE_SIZE; + + ptr = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + kfree(ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *ptr); + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = *(ptr + size)); +} + static void kasan_stack_oob(struct kunit *test) { char stack_array[10]; @@ -907,6 +926,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_alloca_oob_left), KUNIT_CASE(kasan_alloca_oob_right), KUNIT_CASE(ksize_unpoisons_memory), + KUNIT_CASE(ksize_uaf), KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), KUNIT_CASE(kasan_memchr), diff --git a/mm/kasan/common.c b/mm/kasan/common.c index eedc3e0fe365..b18189ef3a92 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -345,7 +345,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; - if (kasan_check_invalid_free(tagged_object)) { + if (!kasan_byte_accessible(tagged_object)) { kasan_report_invalid_free(tagged_object, ip); return true; } @@ -490,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); /* The object will be poisoned by kasan_free_pages(). */ } + +bool __kasan_check_byte(const void *address, unsigned long ip) +{ + if (!kasan_byte_accessible(address)) { + kasan_report((unsigned long)address, 1, false, ip); + return false; + } + return true; +} diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index acab8862dc67..3f17a1218055 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -185,11 +185,11 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return check_region_inline(addr, size, write, ret_ip); } -bool kasan_check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); - return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE; + return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } void kasan_cache_shrink(struct kmem_cache *cache) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1298b79f9518..cc14b6e6c14c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -341,20 +341,20 @@ static inline void kasan_unpoison(const void *address, size_t size) round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } -static inline bool kasan_check_invalid_free(void *addr) +static inline bool kasan_byte_accessible(const void *addr) { u8 ptr_tag = get_tag(addr); - u8 mem_tag = hw_get_mem_tag(addr); + u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag == KASAN_TAG_INVALID) || - (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag); + return (mem_tag != KASAN_TAG_INVALID) && + (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); } #else /* CONFIG_KASAN_HW_TAGS */ void kasan_poison(const void *address, size_t size, u8 value); void kasan_unpoison(const void *address, size_t size); -bool kasan_check_invalid_free(void *addr); +bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index cc271fceb5d5..94c2d33be333 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -118,13 +118,13 @@ bool kasan_check_range(unsigned long addr, size_t size, bool write, return true; } -bool kasan_check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); u8 
shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); - return (shadow_byte == KASAN_TAG_INVALID) || - (tag != KASAN_TAG_KERNEL && tag != shadow_byte); + return (shadow_byte != KASAN_TAG_INVALID) && + (tag == KASAN_TAG_KERNEL || tag == shadow_byte); } #define DEFINE_HWASAN_LOAD_STORE(size) \ diff --git a/mm/slab_common.c b/mm/slab_common.c index 5be7825ad3ce..7c8298c17145 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1218,19 +1218,21 @@ size_t ksize(const void *objp) size_t size; /* - * We need to check that the pointed to object is valid, and only then - * unpoison the shadow memory below. We use __kasan_check_read(), to - * generate a more useful report at the time ksize() is called (rather - * than later where behaviour is undefined due to potential - * use-after-free or double-free). + * We need to first check that the pointer to the object is valid, and + * only then unpoison the memory. The report printed from ksize() is + * more useful than when it's printed later, when the behaviour could + * be undefined due to a potential use-after-free or double-free. * - * If the pointed to memory is invalid we return 0, to avoid users of + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). + * + * If the pointed to memory is invalid, we return 0 to avoid users of + * ksize() writing to and potentially corrupting the memory region. * * We want to perform the check before __ksize(), to avoid potentially * crashing in __ksize() due to accessing invalid metadata. */ - if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1)) + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; size = __ksize(objp); From 858bdeb046f6dc7a79039d577d03e4d2b39272b7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:55 -0800 Subject: [PATCH 098/172] kasan: add proper page allocator tests The currently existing page allocator tests rely on the kmalloc fallback with large sizes that is only present for SLUB. Add proper tests that use alloc/free_pages(). Link: https://linux-review.googlesource.com/id/Ia173d5a1b215fe6b2548d814ef0f4433cf983570 Link: https://lkml.kernel.org/r/a2648930e55ff75b8e700f2e0d905c2b55a67483.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 51 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 3f771fabd0ec..acbc7d54d067 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -147,6 +147,12 @@ static void kmalloc_node_oob_right(struct kunit *test) kfree(ptr); } +/* + * These kmalloc_pagealloc_* tests try allocating a memory chunk that doesn't + * fit into a slab cache and therefore is allocated via the page allocator + * fallback. Since this kind of fallback is only implemented for SLUB, these + * tests are limited to that allocator.
+ */ static void kmalloc_pagealloc_oob_right(struct kunit *test) { char *ptr; @@ -154,14 +160,11 @@ static void kmalloc_pagealloc_oob_right(struct kunit *test) KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_SLUB); - /* - * Allocate a chunk that does not fit into a SLUB cache to trigger - * the page allocator fallback. - */ ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + OOB_TAG_OFF] = 0); + kfree(ptr); } @@ -174,8 +177,8 @@ static void kmalloc_pagealloc_uaf(struct kunit *test) ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - kfree(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); } @@ -192,6 +195,42 @@ static void kmalloc_pagealloc_invalid_free(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree(ptr + 1)); } +static void pagealloc_oob_right(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + size_t size = (1UL << (PAGE_SHIFT + order)); + + /* + * With generic KASAN page allocations have no redzones, thus + * out-of-bounds detection is not guaranteed. + * See https://bugzilla.kernel.org/show_bug.cgi?id=210503. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 0); + free_pages((unsigned long)ptr, order); +} + +static void pagealloc_uaf(struct kunit *test) +{ + char *ptr; + struct page *pages; + size_t order = 4; + + pages = alloc_pages(GFP_KERNEL, order); + ptr = page_address(pages); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + free_pages((unsigned long)ptr, order); + + KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = 0); +} + static void kmalloc_large_oob_right(struct kunit *test) { char *ptr; @@ -903,6 +942,8 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_pagealloc_oob_right), KUNIT_CASE(kmalloc_pagealloc_uaf), KUNIT_CASE(kmalloc_pagealloc_invalid_free), + KUNIT_CASE(pagealloc_oob_right), + KUNIT_CASE(pagealloc_uaf), KUNIT_CASE(kmalloc_large_oob_right), KUNIT_CASE(kmalloc_oob_krealloc_more), KUNIT_CASE(kmalloc_oob_krealloc_less), From 115161354d0e0af6fc07dcbbf0fc4e7574d32cd6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:05:59 -0800 Subject: [PATCH 099/172] kasan: add a test for kmem_cache_alloc/free_bulk Add a test for kmem_cache_alloc/free_bulk to make sure there are no false-positives when these functions are used. 
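The bulk API exercised by the new test is used roughly as follows (a sketch; kmem_cache_alloc_bulk() returns the number of objects allocated, 0 on failure):

	void *objs[10];
	int n;

	n = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(objs), objs);
	if (!n)
		return;			/* nothing was allocated */
	/* ... use objs[0..n-1] ... */
	kmem_cache_free_bulk(cache, n, objs);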
Link: https://linux-review.googlesource.com/id/I2a8bf797aecf81baeac61380c567308f319e263d Link: https://lkml.kernel.org/r/418122ebe4600771ac81e9ca6eab6740cf8dcfa1.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index acbc7d54d067..b04729b61d1d 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -479,10 +479,11 @@ static void kmem_cache_oob(struct kunit *test) { char *p; size_t size = 200; - struct kmem_cache *cache = kmem_cache_create("test_cache", - size, 0, - 0, NULL); + struct kmem_cache *cache; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + p = kmem_cache_alloc(cache, GFP_KERNEL); if (!p) { kunit_err(test, "Allocation failed: %s\n", __func__); @@ -491,11 +492,12 @@ static void kmem_cache_oob(struct kunit *test) } KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); + kmem_cache_free(cache, p); kmem_cache_destroy(cache); } -static void memcg_accounted_kmem_cache(struct kunit *test) +static void kmem_cache_accounted(struct kunit *test) { int i; char *p; @@ -522,6 +524,31 @@ free_cache: kmem_cache_destroy(cache); } +static void kmem_cache_bulk(struct kunit *test) +{ + struct kmem_cache *cache; + size_t size = 200; + char *p[10]; + bool ret; + int i; + + cache = kmem_cache_create("test_cache", size, 0, 0, NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + ret = kmem_cache_alloc_bulk(cache, GFP_KERNEL, ARRAY_SIZE(p), (void **)&p); + if (!ret) { + kunit_err(test, "Allocation failed: %s\n", __func__); + kmem_cache_destroy(cache); + return; + } + + for (i = 0; i < ARRAY_SIZE(p); i++) + p[i][0] = p[i][size - 1] = 42; + + kmem_cache_free_bulk(cache, ARRAY_SIZE(p), (void **)&p); + kmem_cache_destroy(cache); +} + static char global_array[10]; static void kasan_global_oob(struct kunit *test) @@ -961,7 +988,8 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kfree_via_page), KUNIT_CASE(kfree_via_phys), KUNIT_CASE(kmem_cache_oob), - KUNIT_CASE(memcg_accounted_kmem_cache), + KUNIT_CASE(kmem_cache_accounted), + KUNIT_CASE(kmem_cache_bulk), KUNIT_CASE(kasan_global_oob), KUNIT_CASE(kasan_stack_oob), KUNIT_CASE(kasan_alloca_oob_left), From d82dc3a40d12c6eea15c18d24c0bdbc887d0e7c6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 24 Feb 2021 12:06:02 -0800 Subject: [PATCH 100/172] kasan: don't run tests when KASAN is not enabled Don't run KASAN tests when it's disabled with kasan.mode=off to avoid corrupting kernel memory. 
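The guard relies on kasan_enabled(), which for the hardware tag-based mode is a static-key check reflecting the kasan.mode= boot parameter; conceptually (a sketch of the include/linux/kasan.h side, assuming CONFIG_KASAN_HW_TAGS):

	DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled);

	static __always_inline bool kasan_enabled(void)
	{
		return static_branch_likely(&kasan_flag_enabled);
	}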
Link: https://linux-review.googlesource.com/id/I6447af436a69a94bfc35477f6bf4e2122948355e Link: https://lkml.kernel.org/r/25bd4fb5cae7b421d806a1f33fb633edd313f0c7.1610733117.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Reviewed-by: Alexander Potapenko Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Peter Collingbourne Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_kasan.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index b04729b61d1d..25576303897b 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -47,6 +47,11 @@ static bool multishot; */ static int kasan_test_init(struct kunit *test) { + if (!kasan_enabled()) { + kunit_err(test, "can't run KASAN tests with KASAN disabled"); + return -1; + } + multishot = kasan_save_enable_multi_shot(); kasan_set_tagging_report_once(false); return 0; From 93f503c3fcd168a43e4a6c875fe2cfafaf8439dc Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:10 -0800 Subject: [PATCH 101/172] mm: fix prototype warning from kernel test robot Patch series "mm: clean up names and parameters of memmap_init_xxxx functions", v5. This patchset corrects the inappropriate names of the memmap_init_xxx functions and simplifies the parameters of the functions in the code flow. It also fixes a prototype warning reported by lkp. This patch (of 5): The kernel test robot, calling make with 'W=1', triggers a warning like the one below for the memmap_init_zone() function. mm/page_alloc.c:6259:23: warning: no previous prototype for 'memmap_init_zone' [-Wmissing-prototypes] 6259 | void __meminit __weak memmap_init_zone(unsigned long size, int nid, | ^~~~~~~~~~~~~~~~ Fix it by adding the function declaration in include/linux/mm.h. Since memmap_init_zone() has a generic version marked '__weak', the declaration in the ia64 header file can simply be removed.
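The warning class being fixed is easy to reproduce: with W=1 (-Wmissing-prototypes) the compiler flags any non-static function definition for which it has not seen a declaration. A contrived example with hypothetical names:

	/* foo.c: warning: no previous prototype for 'foo' */
	int foo(int x)
	{
		return x + 1;
	}

	/* fix: declare it in a header included by foo.c */
	int foo(int x);

The same reasoning applies here: because the generic __weak definition is global, it needs one shared declaration in include/linux/mm.h.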
Link: https://lkml.kernel.org/r/20210122135956.5946-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20210122135956.5946-2-bhe@redhat.com Signed-off-by: Baoquan He Reported-by: kernel test robot Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/pgtable.h | 6 ------ include/linux/mm.h | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 779b6972aa84..9b4efe89e62d 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -517,12 +517,6 @@ extern struct page *zero_page_memmap_ptr; __changed; \ }) #endif - -# ifdef CONFIG_VIRTUAL_MEM_MAP - /* arch mem_map init routine is needed due to holes in a virtual mem_map */ - extern void memmap_init (unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn); -# endif /* CONFIG_VIRTUAL_MEM_MAP */ # endif /* !__ASSEMBLY__ */ /* diff --git a/include/linux/mm.h b/include/linux/mm.h index 7deb7168104d..1c1eabdf112c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2408,6 +2408,8 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); +extern void memmap_init(unsigned long size, int nid, + unsigned long zone, unsigned long range_start_pfn); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); From ab28cb6e1e5e59eb8bf3ad399133617414301d3a Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:14 -0800 Subject: [PATCH 102/172] mm: rename memmap_init() and memmap_init_zone() The current memmap_init_zone() only handles a memory region inside one zone, while it is actually memmap_init() that does the memmap init of a whole zone. Rename both of them accordingly.
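After the rename, the two helpers have clearly separated roles; as the diffs below show, their signatures are roughly:

	/* init the memmap of a single pfn range inside one zone */
	void memmap_init_range(unsigned long size, int nid, unsigned long zone,
			       unsigned long start_pfn, unsigned long zone_end_pfn,
			       enum meminit_context context,
			       struct vmem_altmap *altmap, int migratetype);

	/* walk all memory ranges of one zone, calling memmap_init_range() */
	void memmap_init_zone(unsigned long size, int nid, unsigned long zone,
			      unsigned long range_start_pfn);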
Link: https://lkml.kernel.org/r/20210122135956.5946-3-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 6 +++--- include/linux/mm.h | 4 ++-- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 8 ++++---- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b19f47a5a305..39b782f62d7d 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -536,18 +536,18 @@ virtual_memmap_init(u64 start, u64 end, void *arg) / sizeof(struct page)); if (map_start < map_end) - memmap_init_zone((unsigned long)(map_end - map_start), + memmap_init_range((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), page_to_pfn(map_end), MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); return 0; } void __meminit -memmap_init (unsigned long size, int nid, unsigned long zone, +memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, start_pfn + size, + memmap_init_range(size, nid, zone, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; diff --git a/include/linux/mm.h b/include/linux/mm.h index 1c1eabdf112c..56d0eeae0ce3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2405,10 +2405,10 @@ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, +extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); -extern void memmap_init(unsigned long size, int nid, +extern void memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long range_start_pfn); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f9d57b9be8c7..ddcb1cd24c60 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, + memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 069561aadc7b..519cf52963eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6121,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. 
*/ -void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype) @@ -6258,7 +6258,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -void __meminit __weak memmap_init(unsigned long size, int nid, +void __meminit __weak memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long range_start_pfn) { @@ -6272,7 +6272,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, + memmap_init_range(size, nid, zone, start_pfn, range_end_pfn, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } @@ -6982,7 +6982,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init(size, nid, j, zone_start_pfn); + memmap_init_zone(size, nid, j, zone_start_pfn); } } From 3256ff83c566235e812498ee1dc806c45a5d5af7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:17 -0800 Subject: [PATCH 103/172] mm: simplify parameter of function memmap_init_zone() As David suggested, simply passing 'struct zone *zone' is enough; all the needed information can easily be derived from the 'struct zone *'. Link: https://lkml.kernel.org/r/20210122135956.5946-4-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: David Hildenbrand Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 12 +++++++----- include/linux/mm.h | 3 +-- mm/page_alloc.c | 24 +++++++++++------------- 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 39b782f62d7d..16d0d7d22657 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -542,12 +542,14 @@ virtual_memmap_init(u64 start, u64 end, void *arg) return 0; } -void __meminit -memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn) +void __meminit memmap_init_zone(struct zone *zone) { + int nid = zone_to_nid(zone), zone_id = zone_idx(zone); + unsigned long start_pfn = zone->zone_start_pfn; + unsigned long size = zone->spanned_pages; + if (!vmem_map) { - memmap_init_range(size, nid, zone, start_pfn, start_pfn + size, + memmap_init_range(size, nid, zone_id, start_pfn, start_pfn + size, MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } else { struct page *start; @@ -557,7 +559,7 @@ memmap_init_zone(unsigned long size, int nid, unsigned long zone, args.start = start; args.end = start + size; args.nid = nid; - args.zone = zone; + args.zone = zone_id; efi_memmap_walk(virtual_memmap_init, &args); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 56d0eeae0ce3..2601a5cce0ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2408,8 +2408,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_range(unsigned long, int, unsigned long, unsigned long, unsigned long, enum meminit_context, struct vmem_altmap *, int migratetype); -extern void memmap_init_zone(unsigned long size, int nid, - unsigned long zone, unsigned long range_start_pfn); +extern void memmap_init_zone(struct zone *zone); extern void setup_per_zone_wmarks(void); extern int __meminit
init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 519cf52963eb..aa04c54ad47c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6258,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -void __meminit __weak memmap_init_zone(unsigned long size, int nid, - unsigned long zone, - unsigned long range_start_pfn) +void __meminit __weak memmap_init_zone(struct zone *zone) { + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; + int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); unsigned long start_pfn, end_pfn; - unsigned long range_end_pfn = range_start_pfn + size; - int i; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); - end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); - if (end_pfn > start_pfn) { - size = end_pfn - start_pfn; - memmap_init_range(size, nid, zone, start_pfn, range_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); - } + if (end_pfn > start_pfn) + memmap_init_range(end_pfn - start_pfn, nid, + zone_id, start_pfn, zone_end_pfn, + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } @@ -6982,7 +6980,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init_zone(size, nid, j, zone_start_pfn); + memmap_init_zone(zone); } } From 7010a6eca49fc4a5a50f491342f08ddaa087ff07 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:20 -0800 Subject: [PATCH 104/172] mm: simplify parameter of setup_usemap() Parameter 'zone' has got needed information, let's remove other unnecessary parameters. 
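The effect is visible from the prototypes alone: everything setup_usemap() needs can be derived from the zone itself. A sketch based on the diff below:

	/* before */
	static void __ref setup_usemap(struct pglist_data *pgdat,
				       struct zone *zone,
				       unsigned long zone_start_pfn,
				       unsigned long zonesize);

	/* after: node via zone_to_nid(zone), range via zone->zone_start_pfn
	 * and zone->spanned_pages */
	static void __ref setup_usemap(struct zone *zone);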
Link: https://lkml.kernel.org/r/20210122135956.5946-5-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa04c54ad47c..56a59f5b4fc2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6765,25 +6765,22 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __ref setup_usemap(struct pglist_data *pgdat, - struct zone *zone, - unsigned long zone_start_pfn, - unsigned long zonesize) +static void __ref setup_usemap(struct zone *zone) { - unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); + unsigned long usemapsize = usemap_size(zone->zone_start_pfn, + zone->spanned_pages); zone->pageblock_flags = NULL; if (usemapsize) { zone->pageblock_flags = memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, - pgdat->node_id); + zone_to_nid(zone)); if (!zone->pageblock_flags) panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", - usemapsize, zone->name, pgdat->node_id); + usemapsize, zone->name, zone_to_nid(zone)); } } #else -static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, - unsigned long zone_start_pfn, unsigned long zonesize) {} +static inline void setup_usemap(struct zone *zone) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -6978,7 +6975,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); + setup_usemap(zone); init_currently_empty_zone(zone, zone_start_pfn, size); memmap_init_zone(zone); } From 9699ee7b2984c612ec3b45c1f7b963daebec3d6c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 24 Feb 2021 12:06:24 -0800 Subject: [PATCH 105/172] mm: remove unneeded local variable in free_area_init_core Local variable 'zone_start_pfn' is not needed since there's only one call site in free_area_init_core(). Let's remove it and pass zone->zone_start_pfn directly to init_currently_empty_zone(). Link: https://lkml.kernel.org/r/20210122135956.5946-6-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Mike Rapoport Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 56a59f5b4fc2..35e230cadb94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6927,7 +6927,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, freesize, memmap_pages; - unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; freesize = zone->present_pages; @@ -6976,7 +6975,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat) set_pageblock_order(); setup_usemap(zone); - init_currently_empty_zone(zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone->zone_start_pfn, size); memmap_init_zone(zone); } } From b3880c690beb7f3abf50f333bd8f3ea7040aaf89 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 24 Feb 2021 12:06:28 -0800 Subject: [PATCH 106/172] video: fbdev: acornfb: remove free_unused_pages() Patch series "mm: simplify free_highmem_page() and free_reserved_page()". 
Let's simplify and unify free_highmem_page() and free_reserved_page(). This patch (of 2): This function is never used and it is one of the last remaining user of __free_reserved_page(). Let's just drop it. Link: https://lkml.kernel.org/r/20210126182113.19892-1-david@redhat.com Link: https://lkml.kernel.org/r/20210126182113.19892-2-david@redhat.com Fixes: ffd29195ed720188 ("drivers/video/acornfb.c: remove dead code") Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Cc: "Gustavo A. R. Silva" Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/fbdev/acornfb.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c index bcc92aecf666..1b72edc01cfb 100644 --- a/drivers/video/fbdev/acornfb.c +++ b/drivers/video/fbdev/acornfb.c @@ -921,40 +921,6 @@ static int acornfb_detect_monitortype(void) return 4; } -/* - * This enables the unused memory to be freed on older Acorn machines. - * We are freeing memory on behalf of the architecture initialisation - * code here. - */ -static inline void -free_unused_pages(unsigned int virtual_start, unsigned int virtual_end) -{ - int mb_freed = 0; - - /* - * Align addresses - */ - virtual_start = PAGE_ALIGN(virtual_start); - virtual_end = PAGE_ALIGN(virtual_end); - - while (virtual_start < virtual_end) { - struct page *page; - - /* - * Clear page reserved bit, - * set count to 1, and free - * the page. - */ - page = virt_to_page(virtual_start); - __free_reserved_page(page); - - virtual_start += PAGE_SIZE; - mb_freed += PAGE_SIZE / 1024; - } - - printk("acornfb: freed %dK memory\n", mb_freed); -} - static int acornfb_probe(struct platform_device *dev) { unsigned long size; From a0cd7a7c4bc004587d1f4785a320f58e72d880eb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 24 Feb 2021 12:06:32 -0800 Subject: [PATCH 107/172] mm: simplify free_highmem_page() and free_reserved_page() adjust_managed_page_count() as called by free_reserved_page() properly handles pages in a highmem zone, so we can reuse it for free_highmem_page(). We can now get rid of totalhigh_pages_inc() and simplify free_reserved_page(). Link: https://lkml.kernel.org/r/20210126182113.19892-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Anshuman Khandual Cc: Thomas Gleixner Cc: "Peter Zijlstra (Intel)" Cc: Mike Rapoport Cc: Michal Hocko Cc: Wei Yang Cc: "Gustavo A. R. 
Silva" Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/highmem-internal.h | 5 ----- include/linux/mm.h | 16 ++-------------- mm/page_alloc.c | 11 ----------- 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 1bbe96dc8be6..7902c7d8b55f 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -127,11 +127,6 @@ static inline unsigned long totalhigh_pages(void) return (unsigned long)atomic_long_read(&_totalhigh_pages); } -static inline void totalhigh_pages_inc(void) -{ - atomic_long_inc(&_totalhigh_pages); -} - static inline void totalhigh_pages_add(long count) { atomic_long_add(count, &_totalhigh_pages); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2601a5cce0ef..76cab132c295 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2310,32 +2310,20 @@ extern void free_initmem(void); extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); -#ifdef CONFIG_HIGHMEM -/* - * Free a highmem page into the buddy system, adjusting totalhigh_pages - * and totalram_pages. - */ -extern void free_highmem_page(struct page *page); -#endif - extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ -static inline void __free_reserved_page(struct page *page) +static inline void free_reserved_page(struct page *page) { ClearPageReserved(page); init_page_count(page); __free_page(page); -} - -static inline void free_reserved_page(struct page *page) -{ - __free_reserved_page(page); adjust_managed_page_count(page, 1); } +#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35e230cadb94..ddccc59f2f72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7691,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -#ifdef CONFIG_HIGHMEM -void free_highmem_page(struct page *page) -{ - __free_reserved_page(page); - totalram_pages_inc(); - atomic_long_inc(&page_zone(page)->managed_pages); - totalhigh_pages_inc(); -} -#endif - - void __init mem_init_print_info(const char *str) { unsigned long physpages, codesize, datasize, rosize, bss_size; From 3b2ebeaf98a028d5dd4ec63095855ef507920276 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 24 Feb 2021 12:06:35 -0800 Subject: [PATCH 108/172] mm/gfp: add kernel-doc for gfp_t The generated html will link to the definition of the gfp_t automatically once we define it. Move the one-paragraph overview of GFP flags from the documentation directory into gfp.h and pull gfp.h into the documentation. This generates warnings with clang (https://lkml.kernel.org/r/20210219195509.GA59987@24bbad8f3778), so use a #if 0 to hide it from the compiler for now. 
Link: https://lkml.kernel.org/r/20210215204909.3824509-1-willy@infradead.org Link: https://lkml.kernel.org/r/20210220003049.GZ2858050@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport Cc: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/mm-api.rst | 7 ++----- include/linux/gfp.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 2adffb3f7914..201b5423303b 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -19,11 +19,8 @@ User Space Memory Access Memory Allocation Controls ========================== -Functions which need to allocate memory often use GFP flags to express -how that memory should be allocated. The GFP acronym stands for "get -free pages", the underlying memory allocation function. Not every GFP -flag is allowed to every function which may allocate memory. Most -users will want to use a plain ``GFP_KERNEL``. +.. kernel-doc:: include/linux/gfp.h + :internal: .. kernel-doc:: include/linux/gfp.h :doc: Page mobility and placement hints diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 80544d5c08e7..220cd553a9e7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,20 @@ #include #include +/* The typedef is in types.h but we want the documentation here */ +#if 0 +/** + * typedef gfp_t - Memory allocation flags. + * + * GFP flags are commonly used throughout Linux to indicate how memory + * should be allocated. The GFP acronym stands for get_free_pages(), + * the underlying memory allocation function. Not every GFP flag is + * supported by every function which may allocate memory. Most users + * will want to use a plain ``GFP_KERNEL``. + */ +typedef unsigned int __bitwise gfp_t; +#endif + struct vm_area_struct; /* From 30c9cf49270423f8cb0d2c152486e248f375cccb Mon Sep 17 00:00:00 2001 From: Aili Yao Date: Wed, 24 Feb 2021 12:06:39 -0800 Subject: [PATCH 109/172] mm,hwpoison: send SIGBUS to PF_MCE_EARLY processes on action required events When an uncorrected memory error is triggered by a process that accessed the poisoned address, it is an Action Required case only for the process that triggered it; for other processes sharing the same page, the error is Action Optional. Usually killing the current process is sufficient, and other processes sharing the same page will be signaled when they actually touch the poisoned page. But there is another scenario: other processes sharing the same page may want to be signaled early by setting PF_MCE_EARLY. In that case, we should put them on the kill list and signal BUS_MCEERR_AO to them. So in this patch, task_early_kill() checks the current process if force_early is set; if the task is not current, the code falls back to find_early_kill_thread() to check whether there is a PF_MCE_EARLY process that cares about the error. In kill_proc(), BUS_MCEERR_AR is sent only to current; other processes on the kill list are signaled with BUS_MCEERR_AO.
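For completeness, a process opts in to these early BUS_MCEERR_AO signals from userspace with prctl(). A minimal consumer sketch (error handling elided):

	#include <signal.h>
	#include <unistd.h>
	#include <sys/prctl.h>

	static void bus_handler(int sig, siginfo_t *si, void *uc)
	{
		/*
		 * BUS_MCEERR_AO: poison observed elsewhere, page not yet
		 * touched here; BUS_MCEERR_AR: this thread touched it.
		 */
		if (si->si_code == BUS_MCEERR_AO)
			write(STDERR_FILENO, "AO poison\n", 10);
	}

	int main(void)
	{
		struct sigaction sa = { .sa_sigaction = bus_handler,
					.sa_flags = SA_SIGINFO };

		sigaction(SIGBUS, &sa, NULL);
		/* sets PF_MCE_EARLY for this process */
		prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
		pause();
		return 0;
	}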
Link: https://lkml.kernel.org/r/20210122132424.313c8f5f.yaoaili@kingsoft.com Signed-off-by: Aili Yao Reviewed-by: Oscar Salvador Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e9481632fcd1..55c671904aac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -243,9 +243,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pfn, t->comm, t->pid); if (flags & MF_ACTION_REQUIRED) { - WARN_ON_ONCE(t != current); - ret = force_sig_mceerr(BUS_MCEERR_AR, + if (t == current) + ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, addr_lsb); + else + /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */ + ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, + addr_lsb, t); } else { /* * Don't use force here, it's convenient if the signal @@ -440,26 +444,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) * Determine whether a given process is "early kill" process which expects * to be signaled when some page under the process is hwpoisoned. * Return task_struct of the dedicated thread (main thread unless explicitly - * specified) if the process is "early kill," and otherwise returns NULL. + * specified) if the process is "early kill" and otherwise returns NULL. * - * Note that the above is true for Action Optional case, but not for Action - * Required case where SIGBUS should sent only to the current thread. + * Note that the above is true for Action Optional case. For Action Required + * case, it's only meaningful to the current thread which need to be signaled + * with SIGBUS, this error is Action Optional for other non current + * processes sharing the same error page,if the process is "early kill", the + * task_struct of the dedicated thread will also be returned. */ static struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) { if (!tsk->mm) return NULL; - if (force_early) { - /* - * Comparing ->mm here because current task might represent - * a subthread, while tsk always points to the main thread. - */ - if (tsk->mm == current->mm) - return current; - else - return NULL; - } + /* + * Comparing ->mm here because current task might represent + * a subthread, while tsk always points to the main thread. + */ + if (force_early && tsk->mm == current->mm) + return current; + return find_early_kill_thread(tsk); } From fca40573e0f742dfd81cf20b8a7c6ce0e543b8b6 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 24 Feb 2021 12:06:42 -0800 Subject: [PATCH 110/172] mm/huge_memory.c: update tlb entry if pmd is changed When set_pmd_at is called in do_huge_pmd_anonymous_page, a new tlb entry can be added by software on the MIPS platform. Add update_mmu_cache_pmd where the pmd entry is set; update_mmu_cache_pmd is defined as empty except on the arc and mips platforms, so this patch has no effect on any other platform. Link: http://lkml.kernel.org/r/1592990792-1923-2-git-send-email-maobibo@loongson.cn Signed-off-by: Bibo Mao Cc: Anshuman Khandual Cc: Daniel Silsby Cc: "Kirill A.
Shutemov" Cc: Mike Kravetz Cc: Mike Rapoport Cc: Paul Burton Cc: Ralf Baechle Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86a3015ba54f..080b471c4fbf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -636,6 +636,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); @@ -749,6 +750,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, haddr, vmf->pmd, zero_page); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); } } else { From aba677f94b7d1004a9477464d78111d9082546f9 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Wed, 24 Feb 2021 12:06:46 -0800 Subject: [PATCH 111/172] MIPS: do not call flush_tlb_all when setting pmd entry Function set_pmd_at is to set pmd entry, if tlb entry need to be flushed, there exists pmdp_huge_clear_flush alike function before set_pmd_at is called. So it is not necessary to call flush_tlb_all in this function. In these scenarios, tlb for the pmd range needs to be flushed: - privilege degrade such as wrprotect is set on the pmd entry - pmd entry is cleared - there is exception if set_pmd_at is issued by dup_mmap, since flush_tlb_mm is called for parent process, it is not necessary to flush tlb in function copy_huge_pmd. Link: http://lkml.kernel.org/r/1592990792-1923-3-git-send-email-maobibo@loongson.cn Signed-off-by: Bibo Mao Cc: Anshuman Khandual Cc: Daniel Silsby Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Mike Rapoport Cc: Paul Burton Cc: Ralf Baechle Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index bd4b0656add3..61891af25019 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -45,7 +45,6 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { *pmdp = pmd; - flush_tlb_all(); } #endif /* defined(CONFIG_TRANSPARENT_HUGEPAGE) */ diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index 183ff9f9c026..7536f7804c44 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -100,7 +100,6 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { *pmdp = pmd; - flush_tlb_all(); } void __init pagetable_init(void) From cc2205a67dec5a700227a693fc113441e73e4641 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:06:50 -0800 Subject: [PATCH 112/172] mm/hugetlb: fix potential double free in hugetlb_register_node() error path In hugetlb_sysfs_add_hstate(), we would do kobject_put() on hstate_kobjs when failed to create sysfs group but forget to set hstate_kobjs to NULL. Then in hugetlb_register_node() error path, we may free it again via hugetlb_unregister_node(). 
Link: https://lkml.kernel.org/r/20210107123249.36964-1-linmiaohe@huawei.com Fixes: a3437870160c ("hugetlb: new sysfs interface") Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 905a7d549b00..dd56e205e7ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2988,8 +2988,10 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); - if (retval) + if (retval) { kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + } return retval; } From a1ba9da8f0f9a37d900ff7eff66482cf7de8015e Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Wed, 24 Feb 2021 12:06:54 -0800 Subject: [PATCH 113/172] mm/hugetlb.c: fix unnecessary address expansion of pmd sharing The current code would unnecessarily expand the address range. Consider one example: with (start, end) = (1G-2M, 3G+2M) and (vm_start, vm_end) = (1G-4M, 3G+4M), the expected adjustment is to keep (1G-2M, 3G+2M) without expanding it, but the current result is (1G-4M, 3G+4M). Actually, the ranges (1G-4M, 1G) and (3G, 3G+4M) would never be involved in pmd sharing. After this patch, we check that the vma spans at least one PUD-aligned size and that the start,end range overlaps the aligned range of the vma. With the above example, the aligned vma range is (1G, 3G), so if the (start, end) range is within (1G-4M, 1G), or within (3G, 3G+4M), then neither start nor end is adjusted. Otherwise, start may be adjusted downwards or end upwards without exceeding (vm_start, vm_end). Mike: : The 'adjusted range' is used for calls to mmu notifiers and cache(tlb) : flushing. Since the current code unnecessarily expands the range in some : cases, more entries than necessary would be flushed. This would/could : result in performance degradation. However, this is highly dependent on : the user runtime. Is there a combination of vma layout and calls to : actually hit this issue? If the issue is hit, will those entries : unnecessarily flushed be used again and need to be unnecessarily reloaded? Link: https://lkml.kernel.org/r/20210104081631.2921415-1-lixinhai.lxh@gmail.com Fixes: 75802ca66354 ("mm/hugetlb: fix calculation of adjust_range_if_pmd_sharing_possible") Signed-off-by: Li Xinhai Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd56e205e7ab..66edef56a070 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5288,21 +5288,23 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { - unsigned long a_start, a_end; + unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), + v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - if (!(vma->vm_flags & VM_MAYSHARE)) + /* + * vma need span at least one aligned PUD size and the start,end range + * must at least partialy within it.
+ */ + if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || + (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ - a_start = ALIGN_DOWN(*start, PUD_SIZE); - a_end = ALIGN(*end, PUD_SIZE); + if (*start > v_start) + *start = ALIGN_DOWN(*start, PUD_SIZE); - /* - * Intersect the range with the vma range, since pmd sharing won't be - * across vma after all - */ - *start = max(vma->vm_start, a_start); - *end = min(vma->vm_end, a_end); + if (*end < v_end) + *end = ALIGN(*end, PUD_SIZE); } /* From 0aa7f3544aaa02a7df5095dc1bc338bcd73b7872 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:06:57 -0800 Subject: [PATCH 114/172] mm/hugetlb: avoid unnecessary hugetlb_acct_memory() call When reservation accounting remains unchanged, hugetlb_acct_memory() does nothing except take and release hugetlb_lock. We should avoid this unnecessary hugetlb_lock lock/unlock cycle, which happens on 'most' hugetlb munmap operations, by checking delta against 0 at the beginning of hugetlb_acct_memory(). Link: https://lkml.kernel.org/r/20210115092013.61012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 66edef56a070..0173f9f2fcc6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3591,6 +3591,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; + if (!delta) + return 0; + spin_lock(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page From c78a7f3639932c48b4e1d329fc80fd26aa1a2fa3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:01 -0800 Subject: [PATCH 115/172] mm/hugetlb: use helper huge_page_order and pages_per_huge_page Since commit a5516438959d ("hugetlb: modular state for hugetlb page size"), we can use huge_page_order to access hstate->order and pages_per_huge_page to fetch the pages per huge page. But gather_bootmem_prealloc() forgot to use them. Link: https://lkml.kernel.org/r/20210114114435.40075-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0173f9f2fcc6..9b7fa72fa8fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2476,7 +2476,7 @@ static void __init gather_bootmem_prealloc(void) struct hstate *h = m->hstate; WARN_ON(page_count(page) != 1); - prep_compound_huge_page(page, h->order); + prep_compound_huge_page(page, huge_page_order(h)); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ @@ -2488,7 +2488,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative.
*/ if (hstate_is_gigantic(h)) - adjust_managed_page_count(page, 1 << h->order); + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } } From 1d88433bb00853bed0c776b6ad9156855c127da0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:05 -0800 Subject: [PATCH 116/172] mm/hugetlb: fix use after free when subpool max_hpages accounting is not enabled If a hugetlbfs filesystem is created with the min_size option and without the size option, used_hpages is always 0 and might lead to release subpool prematurely because it indicates no pages are used now while there might be. In order to fix this issue, we should check used_hpages == 0 iff max_hpages accounting is enabled. As max_hpages accounting should be enabled in most common case, this is not worth a Cc stable. [mike.kravetz@oracle.com: new changelog] Link: https://lkml.kernel.org/r/20210126115510.53374-1-linmiaohe@huawei.com Signed-off-by: Hongxiang Lou Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9b7fa72fa8fd..e8d39b0fe4bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -97,16 +97,26 @@ static inline void ClearPageHugeFreed(struct page *head) /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static inline bool subpool_is_free(struct hugepage_subpool *spool) +{ + if (spool->count) + return false; + if (spool->max_hpages != -1) + return spool->used_hpages == 0; + if (spool->min_hpages != -1) + return spool->rsv_hpages == spool->min_hpages; + + return true; +} + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { - bool free = (spool->count == 0) && (spool->used_hpages == 0); - spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool * remain, give up any reservations based on minimum size and * free the subpool */ - if (free) { + if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); From c93b0a99260741a4fe39c0a8b73f45f34a5b7868 Mon Sep 17 00:00:00 2001 From: Jiapeng Zhong Date: Wed, 24 Feb 2021 12:07:09 -0800 Subject: [PATCH 117/172] mm/hugetlb: simplify the calculation of variables Fix the following coccicheck warnings: mm/hugetlb.c:3372:20-22: WARNING !A || A && B is equivalent to !A || B. Link: https://lkml.kernel.org/r/1611643468-52233-1-git-send-email-abaci-bugfix@linux.alibaba.com Signed-off-by: Jiapeng Zhong Reported-by: Abaci Robot Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e8d39b0fe4bd..6920c71d7e0d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3420,8 +3420,7 @@ static unsigned int allowed_mems_nr(struct hstate *h) mpol_allowed = policy_nodemask_current(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || - (mpol_allowed && node_isset(node, *mpol_allowed))) + if (!mpol_allowed || node_isset(node, *mpol_allowed)) nr += array[node]; } From 0fa5bc4023c188082024833b3deffd5543b93bc9 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 24 Feb 2021 12:07:12 -0800 Subject: [PATCH 118/172] mm/hugetlb: grab head page refcount once for group of subpages Patch series "mm/hugetlb: follow_hugetlb_page() improvements", v2. 
While looking at ZONE_DEVICE struct page reuse, particularly the last patch[0], I found two possible improvements for follow_hugetlb_page(), which is solely used for get_user_pages()/pin_user_pages(). The first patch batches page refcount updates while the second tidies up storing the subpages/vmas. Both together bring the cost of the slow gup() variant from ~87.6k usecs down to ~5.8k usecs. libhugetlbfs tests seem to pass, as do gup_test benchmarks with hugetlbfs vmas. This patch (of 2): Once follow_hugetlb_page() locks the pmd/pud, it checks all N subpages in a huge page and grabs a reference for each one. Similar to gup-fast, have follow_hugetlb_page() grab the head page refcount only after counting all the subpages that are part of the just faulted huge page. Consequently we reduce the number of atomics necessary to pin said huge page, which improves non-fast gup() considerably: - 16G with 1G huge page size gup_test -f /mnt/huge/file -m 16384 -r 10 -L -S -n 512 -w PIN_LONGTERM_BENCHMARK: ~87.6k us -> ~12.8k us Link: https://lkml.kernel.org/r/20210128182632.24562-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20210128182632.24562-2-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ mm/gup.c | 5 ++--- mm/hugetlb.c | 43 ++++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 76cab132c295..77e64e3eac80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1187,6 +1187,9 @@ static inline void get_page(struct page *page) } bool __must_check try_grab_page(struct page *page, unsigned int flags); +__maybe_unused struct page *try_grab_compound_head(struct page *page, int refs, + unsigned int flags); + static inline __must_check bool try_get_page(struct page *page) { diff --git a/mm/gup.c b/mm/gup.c index e4c224cd9661..e40579624f10 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -static __maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, - unsigned int flags) +__maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6920c71d7e0d..d4fc5db6bc32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4796,7 +4796,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - int err = -EFAULT; + int err = -EFAULT, refs; while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -4916,26 +4916,11 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, continue; } + refs = 0; + same_page: - if (pages) { + if (pages) pages[i] = mem_map_offset(page, pfn_offset); - /* - * try_grab_page() should always succeed here, because: - * a) we hold the ptl lock, and b) we've just checked - * that the huge page is present in the page tables. If - * the huge page is present, then the tail pages must - * also be present. The ptl prevents the head page and - * tail pages from being rearranged in any way.
So this - * page must be available at this point, unless the page - * refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { - spin_unlock(ptl); - remainder = 0; - err = -ENOMEM; - break; - } - } if (vmas) vmas[i] = vma; @@ -4944,6 +4929,7 @@ same_page: ++pfn_offset; --remainder; ++i; + ++refs; if (vaddr < vma->vm_end && remainder && pfn_offset < pages_per_huge_page(h)) { /* @@ -4951,6 +4937,25 @@ same_page: * of this compound page. */ goto same_page; + } else if (pages) { + /* + * try_grab_compound_head() should always succeed here, + * because: a) we hold the ptl lock, and b) we've just + * checked that the huge page is present in the page + * tables. If the huge page is present, then the tail + * pages must also be present. The ptl prevents the + * head page and tail pages from being rearranged in + * any way. So this page must be available at this + * point, unless the page refcount overflowed: + */ + if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1], + refs, + flags))) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } } spin_unlock(ptl); } From 82e5d378b0e4736899e7f8f9f0f03138228f9a45 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 24 Feb 2021 12:07:16 -0800 Subject: [PATCH 119/172] mm/hugetlb: refactor subpage recording For a given hugepage backing a VA, there's a rather inefficient loop which is solely responsible for storing subpages in the GUP @pages/@vmas arrays. For each subpage we check whether it's within range or the size of @pages, and keep incrementing @pfn_offset and a couple of other variables per subpage iteration. Simplify this logic and minimize the cost of each iteration to just storing the output page/vma. Instead of incrementing @refs iteratively, we do it through pre-calculation of @refs and only with a tight loop for storing pinned subpages/vmas. Additionally, retain the existing behaviour of using mem_map_offset() when recording the subpages for configurations that don't have a contiguous mem_map.
pinning consequently improves bringing us close to {pin,get}_user_pages_fast: - 16G with 1G huge page size gup_test -f /mnt/huge/file -m 16384 -r 30 -L -S -n 512 -w PIN_LONGTERM_BENCHMARK: ~12.8k us -> ~5.8k us PIN_FAST_BENCHMARK: ~3.7k us Link: https://lkml.kernel.org/r/20210128182632.24562-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d4fc5db6bc32..cf6653879bb4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4787,6 +4787,20 @@ out_release_nounlock: goto out; } +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages, + struct vm_area_struct **vmas) +{ + int nr; + + for (nr = 0; nr < refs; nr++) { + if (likely(pages)) + pages[nr] = mem_map_offset(page, nr); + if (vmas) + vmas[nr] = vma; + } +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -4916,28 +4930,16 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, continue; } - refs = 0; + refs = min3(pages_per_huge_page(h) - pfn_offset, + (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder); -same_page: - if (pages) - pages[i] = mem_map_offset(page, pfn_offset); + if (pages || vmas) + record_subpages_vmas(mem_map_offset(page, pfn_offset), + vma, refs, + likely(pages) ? pages + i : NULL, + vmas ? vmas + i : NULL); - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++pfn_offset; - --remainder; - ++i; - ++refs; - if (vaddr < vma->vm_end && remainder && - pfn_offset < pages_per_huge_page(h)) { - /* - * We use pfn_offset to avoid touching the pageframes - * of this compound page. - */ - goto same_page; - } else if (pages) { + if (pages) { /* * try_grab_compound_head() should always succeed here, * because: a) we hold the ptl lock, and b) we've just @@ -4948,7 +4950,7 @@ same_page: * any way. So this page must be available at this * point, unless the page refcount overflowed: */ - if (WARN_ON_ONCE(!try_grab_compound_head(pages[i-1], + if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], refs, flags))) { spin_unlock(ptl); @@ -4957,6 +4959,11 @@ same_page: break; } } + + vaddr += (refs << PAGE_SHIFT); + remainder -= refs; + i += refs; + spin_unlock(ptl); } *nr_pages = remainder; From 6c26d3108393211ecfd44d89404cfb744027bafd Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:19 -0800 Subject: [PATCH 120/172] mm/hugetlb: fix some comment typos Fix typos sasitfy to satisfy, reservtion to reservation, hugegpage to hugepage and uniprocesor to uniprocessor in comments. Link: https://lkml.kernel.org/r/20210128112028.64831-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Souptick Joarder Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b5807f23caf8..ac3cb239a7ec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -37,7 +37,7 @@ struct hugepage_subpool { struct hstate *hstate; long min_hpages; /* Minimum huge pages or -1 if no minimum. */ long rsv_hpages; /* Pages reserved against global pool to */ - /* sasitfy minimum size. 
*/ + /* satisfy minimum size. */ }; struct resv_map { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf6653879bb4..fc895e25920c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1434,7 +1434,7 @@ static void __free_huge_page(struct page *page) * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the - * reservtion, do not call hugepage_subpool_put_pages() as this will + * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { @@ -3707,7 +3707,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the - * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) @@ -4491,7 +4491,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) } #else /* - * For uniprocesor systems we always use a single mutex, so just + * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) From 5291c09b3edb657f23c1939750c702ba2d74932f Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 24 Feb 2021 12:07:22 -0800 Subject: [PATCH 121/172] mm/hugetlb: remove redundant check in preparing and destroying gigantic page Gigantic page is a compound page and its order is more than 1. Thus it must be available for hpage_pincount. Let's remove the redundant check for gigantic page. Link: https://lkml.kernel.org/r/20210202112002.73170-1-yanfei.xu@windriver.com Signed-off-by: Yanfei Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fc895e25920c..527e2b89e209 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1234,8 +1234,7 @@ static void destroy_compound_gigantic_page(struct page *page, struct page *p = page + 1; atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { clear_compound_head(p); @@ -1563,9 +1562,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); - - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); } /* From 578b7725d4bde8eca23218278d1d8103dd0c3dde Mon Sep 17 00:00:00 2001 From: Zhiyuan Dai Date: Wed, 24 Feb 2021 12:07:26 -0800 Subject: [PATCH 122/172] mm/hugetlb.c: fix typos in comments Fix typo in comment. 
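As background for the check removed in the gigantic-page patch above: hpage_pincount_available() in this era of the tree reduces to roughly the following, and a gigantic page, being a compound page of order well above 1, always satisfies it:

	static inline bool hpage_pincount_available(struct page *page)
	{
		/*
		 * The pincount is stored in the 3rd struct page of the
		 * compound page, so order must be at least 2.
		 */
		page = compound_head(page);
		return PageCompound(page) && compound_order(page) > 1;
	}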
Link: https://lkml.kernel.org/r/1612256106-9436-1-git-send-email-daizhiyuan@phytium.com.cn Signed-off-by: Zhiyuan Dai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 527e2b89e209..9d3a1416fc2f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4028,7 +4028,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, /* * This is called when the original mapper is failing to COW a MAP_PRIVATE - * mappping it owns the reserve page for. The intention is to unmap the page + * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ From 2efeb8da992b955fa7705259e4b2f5937979deff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:29 -0800 Subject: [PATCH 123/172] mm/huge_memory.c: remove unused return value of set_huge_zero_page() The return value of set_huge_zero_page() is always ignored, so drop it. Link: https://lkml.kernel.org/r/20210203084816.46307-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 080b471c4fbf..4cdcc4da36e8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -691,20 +691,19 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) } /* Caller must hold page table lock. */ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; if (!pmd_none(*pmd)) - return false; + return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); - return true; } vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) From bae84953815793f68ddd8edeadd3f4e32676a2c8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 24 Feb 2021 12:07:32 -0800 Subject: [PATCH 124/172] mm/pmem: avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled Differentiate between the hardware not supporting hugepages and the user disabling THP via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'. For the devdax namespace, the kernel handles the above via the supported_alignment attribute, failing to initialize the namespace if the namespace align value is not supported on the platform. For the fsdax namespace, the kernel will continue to initialize the namespace. This can result in the kernel creating a huge pte entry even though the hardware doesn't support it. We do want hugepage support with pmem even if the end-user disabled THP via the sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence, differentiate between hardware/firmware lacking support and a user-controlled disable of THP, and prevent a huge fault only when the hardware lacks hugepage support. Link: https://lkml.kernel.org/r/20210205023956.417587-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Dan Williams Cc: "Kirill A.
Shutemov" Cc: Jan Kara Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 15 +++++++++------ mm/huge_memory.c | 6 +++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a19f35f836b..ba973efcd369 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -78,6 +78,7 @@ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, } enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, @@ -123,6 +124,13 @@ extern unsigned long transparent_hugepage_flags; */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { + + /* + * If the hardware/firmware marked hugepage support disabled. + */ + if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) + return false; + if (vma->vm_flags & VM_NOHUGEPAGE) return false; @@ -134,12 +142,7 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; - /* - * For dax vmas, try to always use hugepage mappings. If the kernel does - * not support hugepages, fsdax mappings will fallback to PAGE_SIZE - * mappings, and device-dax namespaces, that try to guarantee a given - * mapping size, will fail to enable - */ + if (vma_is_dax(vma)) return true; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4cdcc4da36e8..d77605c30f2e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -386,7 +386,11 @@ static int __init hugepage_init(void) struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; + /* + * Hardware doesn't support hugepages, hence disable + * DAX PMD support. + */ + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; return -EINVAL; } From 8938494c8567ebd9ebf2a230e1707ee1f9805342 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:36 -0800 Subject: [PATCH 125/172] hugetlb_cgroup: use helper pages_per_huge_page() in hugetlb_cgroup We could use helper function pages_per_huge_page() to get the number of pages in a hstate to simplify the code slightly. 
Link: https://lkml.kernel.org/r/20210205084513.29624-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9182848dda3e..f68b51fcda3d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -113,7 +113,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, rsvd_parent); limit = round_down(PAGE_COUNTER_MAX, - 1 << huge_page_order(&hstates[idx])); + pages_per_huge_page(&hstates[idx])); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), @@ -460,7 +460,7 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, - 1 << huge_page_order(&hstates[idx])); + pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: @@ -507,7 +507,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret; idx = MEMFILE_IDX(of_cft(of)->private); - nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx])); + nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: From 07e51edf839ab85187acf013384ceecbbba40b0b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:39 -0800 Subject: [PATCH 126/172] mm/hugetlb: use helper function range_in_vma() in page_table_shareable() We could use helper function range_in_vma() to check whether the vma is in the desired range to simplify the code. Link: https://lkml.kernel.org/r/20210204112949.43051-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9d3a1416fc2f..4c956e64a10c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5282,7 +5282,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - sbase < svma->vm_start || svma->vm_end < s_end) + !range_in_vma(svma, sbase, s_end)) return 0; return saddr; From 3f1b0162f6f6ae8a9012819b07d433bd0ec37d25 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:43 -0800 Subject: [PATCH 127/172] mm/hugetlb: remove unnecessary VM_BUG_ON_PAGE on putback_active_hugepage() All callers know they are operating on a hugetlb head page. So this VM_BUG_ON_PAGE can not catch anything useful. 
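A quick sketch of the range_in_vma() helper adopted in patch 126 above, as a userspace model with simplified types (the real helper lives in include/linux/mm.h):

#include <stdbool.h>
#include <stdio.h>

struct vm_area_struct { unsigned long vm_start, vm_end; };

/* true if [start, end] lies entirely within the vma */
static bool range_in_vma(const struct vm_area_struct *vma,
			 unsigned long start, unsigned long end)
{
	return vma && start >= vma->vm_start && end <= vma->vm_end;
}

int main(void)
{
	struct vm_area_struct vma = { 0x1000, 0x9000 };

	printf("%d\n", range_in_vma(&vma, 0x2000, 0x4000)); /* 1 */
	printf("%d\n", range_in_vma(&vma, 0x0800, 0x4000)); /* 0 */
	return 0;
}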
Link: https://lkml.kernel.org/r/20210209071151.44731-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Oscar Salvador Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c956e64a10c..ddfd1f99c884 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5622,7 +5622,6 @@ unlock: void putback_active_hugepage(struct page *page) { - VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); set_page_huge_active(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); From aca78307bfdaf3f99e040616f41aab7f8a566dfc Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:07:46 -0800 Subject: [PATCH 128/172] mm/hugetlb: use helper huge_page_size() to get hugepage size We can use helper huge_page_size() to get the hugepage size directly to simplify the code slightly. [linmiaohe@huawei.com: use helper huge_page_size() to get hugepage size] Link: https://lkml.kernel.org/r/20210209021803.49211-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20210208082450.15716-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ddfd1f99c884..6786b313e6ac 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3248,7 +3248,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); @@ -3523,7 +3523,7 @@ void hugetlb_report_meminfo(struct seq_file *m) for_each_hstate(h) { unsigned long count = h->nr_huge_pages; - total += (PAGE_SIZE << huge_page_order(h)) * count; + total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, @@ -3536,10 +3536,10 @@ void hugetlb_report_meminfo(struct seq_file *m) h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, - (PAGE_SIZE << huge_page_order(h)) / 1024); + huge_page_size(h) / SZ_1K); } - seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); + seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) @@ -3573,7 +3573,7 @@ void hugetlb_show_meminfo(void) h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) @@ -3696,9 +3696,7 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { - struct hstate *hstate = hstate_vma(vma); - - return 1UL << huge_page_shift(hstate); + return huge_page_size(hstate_vma(vma)); } /* From dbfee5aee7e54f83d96ceb8e3e80717fac62ad63 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:07:50 -0800 Subject: [PATCH 129/172] hugetlb: fix update_and_free_page contig page struct assumption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit page structs are not guaranteed to be contiguous for gigantic pages. 
The routine update_and_free_page can encounter a gigantic page, yet it assumes page structs are contiguous when setting page flags in subpages. If update_and_free_page encounters non-contiguous page structs, we can see “BUG: Bad page state in process …” errors. Non-contiguous page structs are generally not an issue. However, they can exist with a specific kernel configuration and hotplug operations. For example: Configure the kernel with CONFIG_SPARSEMEM and !CONFIG_SPARSEMEM_VMEMMAP. Then, hotplug add memory for the area where the gigantic page will be allocated. Zi Yan outlined steps to reproduce here [1]. [1] https://lore.kernel.org/linux-mm/16F7C58B-4D79-41C5-9B64-A1A1628F4AF2@nvidia.com/ Link: https://lkml.kernel.org/r/20210217184926.33567-1-mike.kravetz@oracle.com Fixes: 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") Signed-off-by: Zi Yan Signed-off-by: Mike Kravetz Cc: Zi Yan Cc: Davidlohr Bueso Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Joao Martins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6786b313e6ac..3f6a50373d4f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1321,14 +1321,16 @@ static inline void destroy_compound_gigantic_page(struct page *page, static void update_and_free_page(struct hstate *h, struct page *page) { int i; + struct page *subpage = page; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + for (i = 0; i < pages_per_huge_page(h); + i++, subpage = mem_map_next(subpage, page, i)) { + subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | 1 << PG_writeback); From 3272cfc2525b3a2810a59312d7a1e6f04a0ca3ef Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:07:54 -0800 Subject: [PATCH 130/172] hugetlb: fix copy_huge_page_from_user contig page struct assumption page structs are not guaranteed to be contiguous for gigantic pages. The routine copy_huge_page_from_user can encounter gigantic pages, yet it assumes page structs are contiguous when copying pages from user space. Since page structs for the target gigantic page are not contiguous, the data copied from user space could overwrite other pages not associated with the gigantic page and cause data corruption. Non-contiguous page structs are generally not an issue. However, they can exist with a specific kernel configuration and hotplug operations. For example: Configure the kernel with CONFIG_SPARSEMEM and !CONFIG_SPARSEMEM_VMEMMAP. Then, hotplug add memory for the area where the gigantic page will be allocated. Link: https://lkml.kernel.org/r/20210217184926.33567-2-mike.kravetz@oracle.com Fixes: 8fb5debc5fcd ("userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support") Signed-off-by: Mike Kravetz Cc: Zi Yan Cc: Davidlohr Bueso Cc: "Kirill A . 
Shutemov" Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Oscar Salvador Cc: Joao Martins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ef4e10c60b5f..784249f3307b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5177,17 +5177,19 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + struct page *subpage = dst_page; - for (i = 0; i < pages_per_huge_page; i++) { + for (i = 0; i < pages_per_huge_page; + i++, subpage = mem_map_next(subpage, dst_page, i)) { if (allow_pagefault) - page_kaddr = kmap(dst_page + i); + page_kaddr = kmap(subpage); else - page_kaddr = kmap_atomic(dst_page + i); + page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, (const void __user *)(src + i * PAGE_SIZE), PAGE_SIZE); if (allow_pagefault) - kunmap(dst_page + i); + kunmap(subpage); else kunmap_atomic(page_kaddr); From 7ecc956551f8a66618f71838c790a9b0b4f9ca10 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Wed, 24 Feb 2021 12:07:58 -0800 Subject: [PATCH 131/172] mm/hugetlb: suppress wrong warning info when alloc gigantic page If hugetlb_cma is enabled, it will skip boot time allocation when allocating gigantic page, that doesn't means allocation failure, so suppress this warning info. Link: https://lkml.kernel.org/r/20210219123909.13130-1-chenwandun@huawei.com Fixes: cf11e85fc08c ("mm: hugetlb: optionally allocate gigantic hugepages using cma") Signed-off-by: Chen Wandun Reviewed-by: Mike Kravetz Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3f6a50373d4f..8a1d1f85e21e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2529,7 +2529,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (hugetlb_cma_size) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); - break; + goto free; } if (!alloc_bootmem_huge_page(h)) break; @@ -2547,7 +2547,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } - +free: kfree(node_alloc_noretry); } From c2135f7c570bc274035834848d9bf46ea89ba763 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 24 Feb 2021 12:08:01 -0800 Subject: [PATCH 132/172] mm/vmscan: __isolate_lru_page_prepare() cleanup The function just returns 2 results, so using a 'switch' to deal with its result is unnecessary. Also simplify it to a bool func as Vlastimil suggested. Also remove 'goto' by reusing list_move(), and take Matthew Wilcox's suggestion to update comments in function. 
Link: https://lkml.kernel.org/r/728874d7-2d93-4049-68c1-dcc3b2d52ccd@linux.alibaba.com Signed-off-by: Alex Shi Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Yu Zhao Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/compaction.c | 2 +- mm/vmscan.c | 70 ++++++++++++++++++++------------------------ 3 files changed, 34 insertions(+), 40 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0d64a2bc7e00..32f665b1ee85 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -356,7 +356,7 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); -extern int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode); +extern bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, diff --git a/mm/compaction.c b/mm/compaction.c index 190ccdaa6c19..8f52532675a9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -988,7 +988,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(!get_page_unless_zero(page))) goto isolate_fail; - if (__isolate_lru_page_prepare(page, isolate_mode) != 0) + if (!__isolate_lru_page_prepare(page, isolate_mode)) goto isolate_fail_put; /* Try isolate the page */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b1b574ad199d..04509994aed4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1539,19 +1539,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * page: page to consider * mode: one of the LRU isolation modes defined above * - * returns 0 on success, -ve errno on failure. + * returns true on success, false on failure. */ -int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) +bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) { - int ret = -EBUSY; - /* Only take pages on the LRU. */ if (!PageLRU(page)) - return ret; + return false; /* Compaction should not handle unevictable pages but CMA can do so */ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) - return ret; + return false; /* * To minimise LRU disruption, the caller can indicate that it only @@ -1564,7 +1562,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) - return ret; + return false; if (PageDirty(page)) { struct address_space *mapping; @@ -1580,20 +1578,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) * from the page cache. */ if (!trylock_page(page)) - return ret; + return false; mapping = page_mapping(page); migrate_dirty = !mapping || mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) - return ret; + return false; } } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) - return ret; + return false; - return 0; + return true; } /* @@ -1677,35 +1675,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * only when the page is being freed somewhere else. 
*/ scan += nr_pages; - switch (__isolate_lru_page_prepare(page, mode)) { - case 0: - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - if (unlikely(!get_page_unless_zero(page))) - goto busy; - - if (!TestClearPageLRU(page)) { - /* - * This page may in other isolation path, - * but we still hold lru_lock. - */ - put_page(page); - goto busy; - } - - nr_taken += nr_pages; - nr_zone_taken[page_zonenum(page)] += nr_pages; - list_move(&page->lru, dst); - break; - - default: -busy: - /* else it is being freed elsewhere */ + if (!__isolate_lru_page_prepare(page, mode)) { + /* It is being freed elsewhere */ list_move(&page->lru, src); + continue; } + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) { + list_move(&page->lru, src); + continue; + } + + if (!TestClearPageLRU(page)) { + /* Another thread is already isolating this page */ + put_page(page); + list_move(&page->lru, src); + continue; + } + + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; + list_move(&page->lru, dst); } /* From 725cac1c7e345c2e35a2de2db57233af279b851f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:08:06 -0800 Subject: [PATCH 133/172] mm/workingset.c: avoid unnecessary max_nodes estimation in count_shadow_nodes() If list_lru_shrink_count is 0, we always return SHRINK_EMPTY regardless of the value of max_nodes. So we can return early if nodes == 0 to save some cpu cycles of approximating a reasonable limit for the nodes. Link: https://lkml.kernel.org/r/20210123073825.46709-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Andrew Morton Acked-by: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index 10e96de945b3..7db8f3dad13c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -461,6 +461,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); + if (!nodes) + return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes @@ -503,9 +505,6 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, max_nodes = pages >> (XA_CHUNK_SHIFT - 3); - if (!nodes) - return SHRINK_EMPTY; - if (nodes <= max_nodes) return 0; return nodes - max_nodes; From 42895ea73bcd37c4a79e4c9f681ab8b82243c7f7 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:09 -0800 Subject: [PATCH 134/172] mm/vmscan.c: use add_page_to_lru_list() Patch series "mm: lru related cleanups", v2. The cleanups are intended to reduce the verbosity in lru list operations and make them less error-prone. 
A typical example would be how the patches change __activate_page(): static void __activate_page(struct page *page, struct lruvec *lruvec) { if (!PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru); + del_page_from_lru_list(page, lruvec); SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); There are a few more places like __activate_page() and they are unnecessarily repetitive in terms of figuring out which list a page should be added onto or deleted from. And with the duplicated code removed, they are easier to read, IMO. Patch 1 to 5 basically cover the above. Patch 6 and 7 make code more robust by improving bug reporting. Patch 8, 9 and 10 take care of some dangling helpers left in header files. This patch (of 10): There is add_page_to_lru_list(), and move_pages_to_lru() should reuse it, not duplicate it. Link: https://lkml.kernel.org/r/20210122220600.906146-1-yuzhao@google.com Link: https://lore.kernel.org/linux-mm/20201207220949.830352-2-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-2-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Cc: Hugh Dickins Cc: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 04509994aed4..19875660e8f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1823,7 +1823,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; - enum lru_list lru; while (!list_empty(list)) { page = lru_to_page(list); @@ -1868,11 +1867,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * inhibits memcg migration). */ VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); - lru = page_lru(page); + add_page_to_lru_list(page, lruvec, page_lru(page)); nr_pages = thp_nr_pages(page); - - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_add(&page->lru, &lruvec->lists[lru]); nr_moved += nr_pages; if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); From f90d8191ac864df33b1898bc7edc54eaa24e22bc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:13 -0800 Subject: [PATCH 135/172] include/linux/mm_inline.h: shuffle lru list addition and deletion functions These functions will call page_lru() in the following patches. Move them below page_lru() to avoid the forward declaration. 
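The motivation is plain C scoping: a static inline function must be visible before use, so defining page_lru() first lets the list helpers call it without a prototype. A generic sketch:

#include <stdio.h>

static int classify(int x) /* defined first ... */
{
	return x < 0 ? 0 : 1;
}

static void helper(int x) /* ... so its caller needs no forward declaration */
{
	printf("bucket %d\n", classify(x));
}

int main(void)
{
	helper(-5);
	helper(7);
	return 0;
}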
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-3-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-3-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Vlastimil Babka Reviewed-by: Miaohe Lin Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8fc71e9d7bb0..2889741f450a 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -45,27 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, #endif } -static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add(&page->lru, &lruvec->lists[lru]); -} - -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); - list_add_tail(&page->lru, &lruvec->lists[lru]); -} - -static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) -{ - list_del(&page->lru); - update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); -} - /** * page_lru_base_type - which LRU list type should a page be on? * @page: the page to test @@ -125,4 +104,25 @@ static __always_inline enum lru_list page_lru(struct page *page) } return lru; } + +static __always_inline void add_page_to_lru_list(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add(&page->lru, &lruvec->lists[lru]); +} + +static __always_inline void add_page_to_lru_list_tail(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); + list_add_tail(&page->lru, &lruvec->lists[lru]); +} + +static __always_inline void del_page_from_lru_list(struct page *page, + struct lruvec *lruvec, enum lru_list lru) +{ + list_del(&page->lru); + update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); +} #endif From 3a9c9788a3149d9745b7eb2eae811e57ef3b127c Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:17 -0800 Subject: [PATCH 136/172] mm: don't pass "enum lru_list" to lru list addition functions The "enum lru_list" parameter to add_page_to_lru_list() and add_page_to_lru_list_tail() is redundant in the sense that it can be extracted from the "struct page" parameter by page_lru(). A caveat is that we need to make sure PageActive() or PageUnevictable() is correctly set or cleared before calling these two functions. And they are indeed. 
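A userspace model of the API change (simplified flags, not the real struct page): the list index is computed from the page's own state inside the helper, so a caller can no longer pass a stale or mismatched index.

#include <stdio.h>

enum lru_list { LRU_INACTIVE, LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU };

struct page { int active, unevictable; };

static enum lru_list page_lru(const struct page *p)
{
	if (p->unevictable)
		return LRU_UNEVICTABLE;
	return p->active ? LRU_ACTIVE : LRU_INACTIVE;
}

static void add_page_to_lru_list(const struct page *p, int counts[NR_LRU])
{
	counts[page_lru(p)]++; /* index derived here, not passed in */
}

int main(void)
{
	int counts[NR_LRU] = { 0 };
	struct page a = { .active = 1 }, b = { 0 };

	add_page_to_lru_list(&a, counts);
	add_page_to_lru_list(&b, counts);
	printf("inactive=%d active=%d\n", counts[LRU_INACTIVE], counts[LRU_ACTIVE]);
	return 0;
}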
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-4-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-4-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 8 ++++++-- mm/swap.c | 15 +++++++-------- mm/vmscan.c | 6 ++---- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 2889741f450a..130ba3201d3f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -106,15 +106,19 @@ static __always_inline enum lru_list page_lru(struct page *page) } static __always_inline void add_page_to_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { + enum lru_list lru = page_lru(page); + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); } static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { + enum lru_list lru = page_lru(page); + update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page)); list_add_tail(&page->lru, &lruvec->lists[lru]); } diff --git a/mm/swap.c b/mm/swap.c index 2cca7141470c..22fad34b9c18 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -231,7 +231,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) if (!PageUnevictable(page)) { del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); - add_page_to_lru_list_tail(page, lruvec, page_lru(page)); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); } } @@ -313,8 +313,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec) del_page_from_lru_list(page, lruvec, lru); SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); __count_vm_events(PGACTIVATE, nr_pages); @@ -543,14 +542,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. 
*/ - add_page_to_lru_list_tail(page, lruvec, lru); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, nr_pages); } @@ -570,7 +569,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, @@ -595,7 +594,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) * anonymous pages */ ClearPageSwapBacked(page); - add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, @@ -1005,7 +1004,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); trace_mm_lru_insertion(page, lru); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 19875660e8f8..09e4f97488c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1867,7 +1867,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * inhibits memcg migration). */ VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); - add_page_to_lru_list(page, lruvec, page_lru(page)); + add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); nr_moved += nr_pages; if (PageActive(page)) @@ -4282,12 +4282,10 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - VM_BUG_ON_PAGE(PageActive(page), page); ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } SetPageLRU(page); From 861404536a3af3c39f1b10959a40def3d8efa2dd Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:21 -0800 Subject: [PATCH 137/172] mm/swap.c: don't pass "enum lru_list" to trace_mm_lru_insertion() The parameter is redundant in the sense that it can be extracted from the "struct page" parameter by page_lru() correctly. 
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-5-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-5-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/pagemap.h | 11 ++++------- mm/swap.c | 5 +---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 8fd1babae761..e1735fe7c76a 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -27,24 +27,21 @@ TRACE_EVENT(mm_lru_insertion, - TP_PROTO( - struct page *page, - int lru - ), + TP_PROTO(struct page *page), - TP_ARGS(page, lru), + TP_ARGS(page), TP_STRUCT__entry( __field(struct page *, page ) __field(unsigned long, pfn ) - __field(int, lru ) + __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->page = page; __entry->pfn = page_to_pfn(page); - __entry->lru = lru; + __entry->lru = page_lru(page); __entry->flags = trace_pagemap_flags(page); ), diff --git a/mm/swap.c b/mm/swap.c index 22fad34b9c18..a26e3a4fe17b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -957,7 +957,6 @@ EXPORT_SYMBOL(__pagevec_release); static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) { - enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); int nr_pages = thp_nr_pages(page); @@ -993,11 +992,9 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) smp_mb__after_atomic(); if (page_evictable(page)) { - lru = page_lru(page); if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { - lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) @@ -1005,7 +1002,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) } add_page_to_lru_list(page, lruvec); - trace_mm_lru_insertion(page, lru); + trace_mm_lru_insertion(page); } /* From 46ae6b2cc2a47904a368d238425531ea91f3a2a5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:25 -0800 Subject: [PATCH 138/172] mm/swap.c: don't pass "enum lru_list" to del_page_from_lru_list() The parameter is redundant in the sense that it can be potentially extracted from the "struct page" parameter by page_lru(). We need to make sure that existing PageActive() or PageUnevictable() remains until the function returns. A few places don't conform, and simple reordering fixes them. This patch may have left page_off_lru() seemingly odd, and we'll take care of it in the next patch. 
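The caveat in the message is worth a sketch (same simplified model as above, illustrative only): once the delete helper derives the list from page flags, the flags must survive until it returns, so clearing happens afterwards.

#include <stdio.h>

enum lru_list { LRU_INACTIVE, LRU_ACTIVE, NR_LRU };

struct page { int active; };

static enum lru_list page_lru(const struct page *p)
{
	return p->active ? LRU_ACTIVE : LRU_INACTIVE;
}

static void del_page_from_lru_list(const struct page *p, int counts[NR_LRU])
{
	counts[page_lru(p)]--; /* relies on the flags still being intact */
}

int main(void)
{
	int counts[NR_LRU] = { 0, 1 }; /* one page currently on the active list */
	struct page p = { .active = 1 };

	del_page_from_lru_list(&p, counts); /* delete first ... */
	p.active = 0;                       /* ... clear the flag only after */
	printf("active count now %d\n", counts[LRU_ACTIVE]); /* 0 */
	return 0;
}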
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-6-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-6-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 5 +++-- mm/compaction.c | 2 +- mm/mlock.c | 3 +-- mm/swap.c | 26 ++++++++++---------------- mm/vmscan.c | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 130ba3201d3f..ffacc6273678 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -124,9 +124,10 @@ static __always_inline void add_page_to_lru_list_tail(struct page *page, } static __always_inline void del_page_from_lru_list(struct page *page, - struct lruvec *lruvec, enum lru_list lru) + struct lruvec *lruvec) { list_del(&page->lru); - update_lru_size(lruvec, lru, page_zonenum(page), -thp_nr_pages(page)); + update_lru_size(lruvec, page_lru(page), page_zonenum(page), + -thp_nr_pages(page)); } #endif diff --git a/mm/compaction.c b/mm/compaction.c index 8f52532675a9..3aef10c61d0e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1034,7 +1034,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, low_pfn += compound_nr(page) - 1; /* Successfully isolated */ - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), thp_nr_pages(page)); diff --git a/mm/mlock.c b/mm/mlock.c index 55b3b3672977..73960bb3464d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) */ if (TestClearPageLRU(page)) { lruvec = relock_page_lruvec_irq(page, lruvec); - del_page_from_lru_list(page, lruvec, - page_lru(page)); + del_page_from_lru_list(page, lruvec); continue; } else __munlock_isolation_failed(page); diff --git a/mm/swap.c b/mm/swap.c index a26e3a4fe17b..ff910f636df2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -85,7 +85,8 @@ static void __page_cache_release(struct page *page) lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + page_off_lru(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); @@ -229,7 +230,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { if (!PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); @@ -308,10 +309,9 @@ void lru_note_cost_page(struct page *page) static void __activate_page(struct page *page, struct lruvec *lruvec) { if (!PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru); + del_page_from_lru_list(page, lruvec); SetPageActive(page); add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); @@ -518,8 +518,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, */ static void lru_deactivate_file_fn(struct page *page, struct 
lruvec *lruvec) { - int lru; - bool active; + bool active = PageActive(page); int nr_pages = thp_nr_pages(page); if (PageUnevictable(page)) @@ -529,10 +528,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) if (page_mapped(page)) return; - active = PageActive(page); - lru = page_lru_base_type(page); - - del_page_from_lru_list(page, lruvec, lru + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); @@ -563,10 +559,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { if (PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); add_page_to_lru_list(page, lruvec); @@ -581,11 +576,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) { if (PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - bool active = PageActive(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, - LRU_INACTIVE_ANON + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); /* @@ -919,7 +912,8 @@ void release_pages(struct page **pages, int nr) VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + page_off_lru(page); } __ClearPageWaiters(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 09e4f97488c9..7c65d47e6612 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1766,7 +1766,7 @@ int isolate_lru_page(struct page *page) get_page(page); lruvec = lock_page_lruvec_irq(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -4283,8 +4283,8 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { VM_BUG_ON_PAGE(PageActive(page), page); + del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); - del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } From 875601796267214f286d3581fe74f2805d060fe8 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:28 -0800 Subject: [PATCH 139/172] mm: add __clear_page_lru_flags() to replace page_off_lru() Similar to page_off_lru(), the new function does non-atomic clearing of PageLRU() in addition to PageActive() and PageUnevictable(), on a page that has no references left. If PageActive() and PageUnevictable() are both set, refuse to clear either and leave them to bad_page(). This is a behavior change that is meant to help debug. 
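The policy change can be modeled in a few lines (simplified flags; the real code uses non-atomic page flag ops): an Active+Unevictable combination is left alone so the error path can still report it.

#include <stdio.h>

struct page { int lru, active, unevictable; };

static void __clear_page_lru_flags(struct page *p)
{
	p->lru = 0;

	/* invalid combination: keep the evidence for bad_page() */
	if (p->active && p->unevictable)
		return;

	p->active = 0;
	p->unevictable = 0;
}

int main(void)
{
	struct page ok  = { .lru = 1, .active = 1 };
	struct page bad = { .lru = 1, .active = 1, .unevictable = 1 };

	__clear_page_lru_flags(&ok);
	__clear_page_lru_flags(&bad);
	printf("ok: active=%d; bad: active=%d unevictable=%d\n",
	       ok.active, bad.active, bad.unevictable); /* 0; 1 1 */
	return 0;
}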
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-7-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-7-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 28 ++++++++++------------------ mm/swap.c | 6 ++---- mm/vmscan.c | 3 +-- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ffacc6273678..ef3fd79222e5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -61,27 +61,19 @@ static inline enum lru_list page_lru_base_type(struct page *page) } /** - * page_off_lru - which LRU list was page on? clearing its lru flags. - * @page: the page to test - * - * Returns the LRU list a page was on, as an index into the array of LRU - * lists; and clears its Unevictable or Active flags, ready for freeing. + * __clear_page_lru_flags - clear page lru flags before releasing a page + * @page: the page that was on lru and now has a zero reference */ -static __always_inline enum lru_list page_off_lru(struct page *page) +static __always_inline void __clear_page_lru_flags(struct page *page) { - enum lru_list lru; + __ClearPageLRU(page); - if (PageUnevictable(page)) { - __ClearPageUnevictable(page); - lru = LRU_UNEVICTABLE; - } else { - lru = page_lru_base_type(page); - if (PageActive(page)) { - __ClearPageActive(page); - lru += LRU_ACTIVE; - } - } - return lru; + /* this shouldn't happen, so leave the flags to bad_page() */ + if (PageActive(page) && PageUnevictable(page)) + return; + + __ClearPageActive(page); + __ClearPageUnevictable(page); } /** diff --git a/mm/swap.c b/mm/swap.c index ff910f636df2..1455fa56a0ee 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -84,9 +84,8 @@ static void __page_cache_release(struct page *page) lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); del_page_from_lru_list(page, lruvec); - page_off_lru(page); + __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); @@ -911,9 +910,8 @@ void release_pages(struct page **pages, int nr) lock_batch = 0; VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); del_page_from_lru_list(page, lruvec); - page_off_lru(page); + __clear_page_lru_flags(page); } __ClearPageWaiters(page); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7c65d47e6612..452dd3818aa3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1849,8 +1849,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); if (unlikely(put_page_testzero(page))) { - __ClearPageLRU(page); - __ClearPageActive(page); + __clear_page_lru_flags(page); if (unlikely(PageCompound(page))) { spin_unlock_irq(&lruvec->lru_lock); From bc7112719e1e80e4208eef3fc9bd8d2b6c263e7d Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:32 -0800 Subject: [PATCH 140/172] mm: VM_BUG_ON lru page flags Move scattered VM_BUG_ONs to two essential places that cover all lru list additions and deletions. 
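The idea generalizes: assert once in the shared helper that every path funnels through, instead of at each call site. A tiny sketch, with assert() standing in for VM_BUG_ON_PAGE:

#include <assert.h>
#include <stdio.h>

struct page { int on_lru; };

static void del_page_from_lru_list(struct page *p)
{
	assert(p->on_lru); /* one check covers every deletion path */
	p->on_lru = 0;
}

int main(void)
{
	struct page p = { .on_lru = 1 };

	del_page_from_lru_list(&p);
	printf("on_lru=%d\n", p.on_lru);
	return 0;
}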
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-8-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-8-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 4 ++++ mm/swap.c | 2 -- mm/vmscan.c | 1 - 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ef3fd79222e5..6d907a4dd6ad 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -66,6 +66,8 @@ static inline enum lru_list page_lru_base_type(struct page *page) */ static __always_inline void __clear_page_lru_flags(struct page *page) { + VM_BUG_ON_PAGE(!PageLRU(page), page); + __ClearPageLRU(page); /* this shouldn't happen, so leave the flags to bad_page() */ @@ -87,6 +89,8 @@ static __always_inline enum lru_list page_lru(struct page *page) { enum lru_list lru; + VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); + if (PageUnevictable(page)) lru = LRU_UNEVICTABLE; else { diff --git a/mm/swap.c b/mm/swap.c index 1455fa56a0ee..ab3258afcbeb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -83,7 +83,6 @@ static void __page_cache_release(struct page *page) unsigned long flags; lruvec = lock_page_lruvec_irqsave(page, &flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); @@ -909,7 +908,6 @@ void release_pages(struct page **pages, int nr) if (prev_lruvec != lruvec) lock_batch = 0; - VM_BUG_ON_PAGE(!PageLRU(page), page); del_page_from_lru_list(page, lruvec); __clear_page_lru_flags(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 452dd3818aa3..348a90096550 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4281,7 +4281,6 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { - VM_BUG_ON_PAGE(PageActive(page), page); del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); add_page_to_lru_list(page, lruvec); From c1770e34f3e7640887d8129fc05d13fe17101301 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:36 -0800 Subject: [PATCH 141/172] include/linux/mm_inline.h: fold page_lru_base_type() into its sole caller We've removed all other references to this function. Link: https://lore.kernel.org/linux-mm/20201207220949.830352-9-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-9-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6d907a4dd6ad..7183c7a03f09 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -45,21 +45,6 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, #endif } -/** - * page_lru_base_type - which LRU list type should a page be on? - * @page: the page to test - * - * Used for LRU list index arithmetic. - * - * Returns the base LRU type - file or anon - @page should be on. 
- */ -static inline enum lru_list page_lru_base_type(struct page *page) -{ - if (page_is_file_lru(page)) - return LRU_INACTIVE_FILE; - return LRU_INACTIVE_ANON; -} - /** * __clear_page_lru_flags - clear page lru flags before releasing a page * @page: the page that was on lru and now has a zero reference @@ -92,12 +77,12 @@ static __always_inline enum lru_list page_lru(struct page *page) VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page); if (PageUnevictable(page)) - lru = LRU_UNEVICTABLE; - else { - lru = page_lru_base_type(page); - if (PageActive(page)) - lru += LRU_ACTIVE; - } + return LRU_UNEVICTABLE; + + lru = page_is_file_lru(page) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; + if (PageActive(page)) + lru += LRU_ACTIVE; + return lru; } From 289ccba18af436f2b65ec69b2be1b086ec9f24a4 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:40 -0800 Subject: [PATCH 142/172] include/linux/mm_inline.h: fold __update_lru_size() into its sole caller All other references to the function were removed after commit a892cb6b977f ("mm/vmscan.c: use update_lru_size() in update_lru_sizes()"). Link: https://lore.kernel.org/linux-mm/20201207220949.830352-10-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-10-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 7183c7a03f09..355ea1ee32bd 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -24,7 +24,7 @@ static inline int page_is_file_lru(struct page *page) return !PageSwapBacked(page); } -static __always_inline void __update_lru_size(struct lruvec *lruvec, +static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, int nr_pages) { @@ -33,13 +33,6 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); -} - -static __always_inline void update_lru_size(struct lruvec *lruvec, - enum lru_list lru, enum zone_type zid, - int nr_pages) -{ - __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif From 2091339d59e7808e9b39a79f48e3d17ef7389b97 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 24 Feb 2021 12:08:44 -0800 Subject: [PATCH 143/172] mm/vmscan.c: make lruvec_lru_size() static All other references to the function were removed after commit b910718a948a ("mm: vmscan: detect file thrashing at the reclaim root"). 
Link: https://lore.kernel.org/linux-mm/20201207220949.830352-11-yuzhao@google.com/ Link: https://lkml.kernel.org/r/20210122220600.906146-11-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: Alex Shi Cc: Hugh Dickins Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Michal Hocko Cc: Roman Gushchin Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 -- mm/vmscan.c | 3 ++- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc99e9241846..9198b7ade85f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -892,8 +892,6 @@ static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) #endif } -extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); - #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else diff --git a/mm/vmscan.c b/mm/vmscan.c index 348a90096550..fea6b43bc1f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) */ -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) +static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) { unsigned long size = 0; int zid; From aeddcee6c17bd8cf80675495d39c4daceaf5b506 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 24 Feb 2021 12:08:47 -0800 Subject: [PATCH 144/172] mm: workingset: clarify eviction order and distance calculation The premise of the refault distance is that it can be seen as a deficit of the inactive list space, so that if the inactive list would have had (R - E) more slots, the page would not have been evicted but promoted to the active list instead. However, the way the code is ordered right now sets us off by one, so the real number of slots would be (R - E) + 1. I stumbled upon this when trying to understand the code and it puzzled me that the comments did not match what the code did. This is not an issue in practice since evictions and refaults tend to happen in a number large enough that being off-by-one does not have any impact - and since the compiler and CPUs are free to rearrange the execution sequence anyway. But as Johannes says, it is better to re-arrange the code in the proper order since it would otherwise be misleading to somebody who is actively reading and trying to understand the logic of the code - as happened to me.
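The off-by-one is easiest to see with plain counters (a simplified model of the nonresident age; E is the eviction stamp, R the age observed at refault):

#include <stdio.h>

static unsigned long nonresident_age;

static unsigned long evict_wrong(void)
{
	nonresident_age++;      /* age bumped first ... */
	return nonresident_age; /* ... so the stamp already counts this slot */
}

static unsigned long evict_right(void)
{
	unsigned long eviction = nonresident_age; /* stamp first */

	nonresident_age++;
	return eviction;
}

int main(void)
{
	unsigned long e1, e2, refault = 105; /* age at refault time */

	nonresident_age = 100;
	e1 = evict_wrong();
	nonresident_age = 100;
	e2 = evict_right();
	printf("distance wrong=%lu right=%lu\n", refault - e1, refault - e2); /* 4 vs 5 */
	return 0;
}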
Link: https://lkml.kernel.org/r/20210201060651.3781-1-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/workingset.c b/mm/workingset.c index 7db8f3dad13c..cd39902c1062 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -263,10 +263,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(!PageLocked(page), page); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } From d6995da311221a05c8aef3bda2629e5cb14c7302 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:08:51 -0800 Subject: [PATCH 145/172] hugetlb: use page.private for hugetlb specific page flags Patch series "create hugetlb flags to consolidate state", v3. While discussing a series of hugetlb fixes in [1], it became evident that the hugetlb specific page state information is stored in a somewhat haphazard manner. Code dealing with state information would be easier to read, understand and maintain if this information was stored in a consistent manner. This series uses page.private of the hugetlb head page for storing a set of hugetlb specific page flags. Routines are provided for test, set and clear of the flags. [1] https://lore.kernel.org/r/20210106084739.63318-1-songmuchun@bytedance.com This patch (of 4): As hugetlbfs evolved, state information about hugetlb pages was added. One 'convenient' way of doing this was to use available fields in tail pages. Over time, it has become difficult to know the meaning or contents of fields simply by looking at a small bit of code. Sometimes, the naming is just confusing. For example: The PagePrivate flag indicates a huge page reservation was consumed and needs to be restored if an error is encountered and the page is freed before it is instantiated. The page.private field contains the pointer to a subpool if the page is associated with one. In an effort to make the code more readable, use page.private to contain hugetlb specific page flags. These flags will have test, set and clear functions similar to those used for 'normal' page flags. More importantly, an enum of flag values will be created with names that actually reflect their purpose. In this patch, - Create infrastructure for hugetlb specific page flag functions - Move the subpool pointer to page[1].private to make way for flags, and create routines with meaningful names to modify the subpool field - Use the new HPageRestoreReserve flag instead of PagePrivate Conversion of other state information will happen in subsequent patches.
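The flag machinery is compact enough to model in userspace (plain bitops on an unsigned long standing in for page.private; the kernel versions use atomic test_bit/set_bit/clear_bit):

#include <stdio.h>

struct page { unsigned long private; };

enum hugetlb_page_flags { HPG_restore_reserve = 0, __NR_HPAGEFLAGS };

/* one macro generates the test/set/clear trio for each flag */
#define HPAGEFLAG(uname, flname)					\
static int HPage##uname(const struct page *p)				\
	{ return !!(p->private & (1UL << HPG_##flname)); }		\
static void SetHPage##uname(struct page *p)				\
	{ p->private |= 1UL << HPG_##flname; }				\
static void ClearHPage##uname(struct page *p)				\
	{ p->private &= ~(1UL << HPG_##flname); }

HPAGEFLAG(RestoreReserve, restore_reserve)

int main(void)
{
	struct page head = { 0 };

	SetHPageRestoreReserve(&head);
	printf("restore_reserve=%d\n", HPageRestoreReserve(&head)); /* 1 */
	ClearHPageRestoreReserve(&head);
	printf("restore_reserve=%d\n", HPageRestoreReserve(&head)); /* 0 */
	return 0;
}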
Link: https://lkml.kernel.org/r/20210122195231.324857-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20210122195231.324857-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 ++------ include/linux/hugetlb.h | 68 +++++++++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 48 +++++++++++++++-------------- 3 files changed, 96 insertions(+), 32 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b7a72f577aab..907f6405e805 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -973,15 +973,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - /* - * page_private is subpool pointer in hugetlb pages. Transfer to - * new page. PagePrivate is not associated with page_private for - * hugetlb pages and can not be set here as only page_huge_active - * pages can be migrated. - */ - if (page_private(page)) { - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); + if (hugetlb_page_subpool(page)) { + hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); + hugetlb_set_page_subpool(page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ac3cb239a7ec..249c65e1d8ca 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -472,6 +472,60 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); #endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +/* + * huegtlb page specific state flags. These flags are located in page.private + * of the hugetlb head page. Functions created via the below macros should be + * used to manipulate these flags. + * + * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at + * allocation time. Cleared when page is fully instantiated. Free + * routine checks flag to restore a reservation on error paths. + */ +enum hugetlb_page_flags { + HPG_restore_reserve = 0, + __NR_HPAGEFLAGS, +}; + +/* + * Macros to create test, set and clear function definitions for + * hugetlb specific page flags. 
+ */ +#ifdef CONFIG_HUGETLB_PAGE +#define TESTHPAGEFLAG(uname, flname) \ +static inline int HPage##uname(struct page *page) \ + { return test_bit(HPG_##flname, &(page->private)); } + +#define SETHPAGEFLAG(uname, flname) \ +static inline void SetHPage##uname(struct page *page) \ + { set_bit(HPG_##flname, &(page->private)); } + +#define CLEARHPAGEFLAG(uname, flname) \ +static inline void ClearHPage##uname(struct page *page) \ + { clear_bit(HPG_##flname, &(page->private)); } +#else +#define TESTHPAGEFLAG(uname, flname) \ +static inline int HPage##uname(struct page *page) \ + { return 0; } + +#define SETHPAGEFLAG(uname, flname) \ +static inline void SetHPage##uname(struct page *page) \ + { } + +#define CLEARHPAGEFLAG(uname, flname) \ +static inline void ClearHPage##uname(struct page *page) \ + { } +#endif + +#define HPAGEFLAG(uname, flname) \ + TESTHPAGEFLAG(uname, flname) \ + SETHPAGEFLAG(uname, flname) \ + CLEARHPAGEFLAG(uname, flname) \ + +/* + * Create functions associated with hugetlb page flags + */ +HPAGEFLAG(RestoreReserve, restore_reserve) + #ifdef CONFIG_HUGETLB_PAGE #define HSTATE_NAME_LEN 32 @@ -531,6 +585,20 @@ extern unsigned int default_hstate_idx; #define default_hstate (hstates[default_hstate_idx]) +/* + * hugetlb page subpool pointer located in hpage[1].private + */ +static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage) +{ + return (struct hugepage_subpool *)(hpage+1)->private; +} + +static inline void hugetlb_set_page_subpool(struct page *hpage, + struct hugepage_subpool *subpool) +{ + set_page_private(hpage+1, (unsigned long)subpool); +} + static inline struct hstate *hstate_file(struct file *f) { return hstate_inode(file_inode(f)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8a1d1f85e21e..f5e85dabb7a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1143,7 +1143,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } @@ -1418,20 +1418,19 @@ static void __free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct hugepage_subpool *spool = - (struct hugepage_subpool *)page_private(page); + struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); - set_page_private(page, 0); + hugetlb_set_page_subpool(page, NULL); page->mapping = NULL; - restore_reserve = PagePrivate(page); - ClearPagePrivate(page); + restore_reserve = HPageRestoreReserve(page); + ClearHPageRestoreReserve(page); /* - * If PagePrivate() was set on page, page allocation consumed a + * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the @@ -2263,24 +2262,24 @@ static long vma_add_reservation(struct hstate *h, * This routine is called to restore a reservation on error paths. In the * specific error paths, a huge page was allocated (via alloc_huge_page) * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set PagePrivate - * in the newly allocated page. 
When the page is freed via free_huge_page, - * the global reservation count will be incremented if PagePrivate is set. - * However, free_huge_page can not adjust the reserve map. Adjust the - * reserve map here to be consistent with global reserve count adjustments - * to be made by free_huge_page. + * alloc_huge_page would have consumed the reservation and set + * HPageRestoreReserve in the newly allocated page. When the page is freed + * via free_huge_page, the global reservation count will be incremented if + * HPageRestoreReserve is set. However, free_huge_page can not adjust the + * reserve map. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. */ static void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { - if (unlikely(PagePrivate(page))) { + if (unlikely(HPageRestoreReserve(page))) { long rc = vma_needs_reservation(h, vma, address); if (unlikely(rc < 0)) { /* * Rare out of memory condition in reserve map - * manipulation. Clear PagePrivate so that + * manipulation. Clear HPageRestoreReserve so that * global reserve count will not be incremented * by free_huge_page. This will make it appear * as though the reservation for this page was @@ -2289,7 +2288,7 @@ static void restore_reserve_on_error(struct hstate *h, * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else if (rc) { rc = vma_add_reservation(h, vma, address); if (unlikely(rc < 0)) @@ -2297,7 +2296,7 @@ static void restore_reserve_on_error(struct hstate *h, * See above comment about rare out of * memory condition. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else vma_end_reservation(h, vma, address); } @@ -2378,7 +2377,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } spin_lock(&hugetlb_lock); @@ -2396,7 +2395,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); - set_page_private(page, (unsigned long)spool); + hugetlb_set_page_subpool(page, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -3170,6 +3169,9 @@ static int __init hugetlb_init(void) { int i; + BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < + __NR_HPAGEFLAGS); + if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); @@ -4207,7 +4209,7 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearPagePrivate(new_page); + ClearHPageRestoreReserve(new_page); /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -4274,7 +4276,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, if (err) return err; - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); /* * set page dirty so that it will not be removed from cache/file @@ -4436,7 +4438,7 @@ retry: goto backout; if (anon_rmap) { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); @@ -4750,7 +4752,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct 
*dst_mm, if (vm_shared) { page_dup_rmap(page, true); } else { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } From 8f251a3d5ce3bdea73bd045ed35db64f32e0d0d9 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:08:56 -0800 Subject: [PATCH 146/172] hugetlb: convert page_huge_active() to HPageMigratable flag Use the new hugetlb page specific flag HPageMigratable to replace the page_huge_active interfaces. By its name, page_huge_active implied that a huge page was on the active list. However, that is not really what code checking the flag wanted to know. It really wanted to determine if the huge page could be migrated. This happens when the page is actually added to the page cache and/or task page table. This is the reasoning behind the name change. The VM_BUG_ON_PAGE() calls in the *_huge_active() interfaces are not really necessary as we KNOW the page is a hugetlb page. Therefore, they are removed. The routine page_huge_active checked for PageHeadHuge before testing the active bit. This is unnecessary in the case where we hold a reference or lock and know it is a hugetlb head page. page_huge_active is also called without holding a reference or lock (scan_movable_pages), and can race with code freeing the page. The extra check in page_huge_active shortened the race window, but did not prevent the race. Offline code calling scan_movable_pages already deals with these races, so removing the check is acceptable. Add comment to racy code. [songmuchun@bytedance.com: remove set_page_huge_active() declaration from include/linux/hugetlb.h] Link: https://lkml.kernel.org/r/CAMZfGtUda+KoAZscU0718TN61cSFwp4zy=y2oZ=+6Z2TAZZwng@mail.gmail.com Link: https://lkml.kernel.org/r/20210122195231.324857-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/linux/hugetlb.h | 7 ++++-- include/linux/page-flags.h | 6 ----- mm/hugetlb.c | 45 ++++++++++---------------------------- mm/memory_hotplug.c | 9 +++++++- 5 files changed, 25 insertions(+), 44 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 907f6405e805..5a8ed6bd4f87 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -735,7 +735,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); - set_page_huge_active(page); + SetHPageMigratable(page); /* * unlock_page because locked by add_to_page_cache() * put_page() due to reference from alloc_huge_page() diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 249c65e1d8ca..77f2a032fe32 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -480,9 +480,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. + * HPG_migratable - Set after a newly allocated page is added to the page + * cache and/or page tables. Indicates the page is a candidate for + * migration.
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, + HPG_migratable, __NR_HPAGEFLAGS, }; @@ -525,6 +529,7 @@ static inline void ClearHPage##uname(struct page *page) \ * Create functions associated with hugetlb page flags */ HPAGEFLAG(RestoreReserve, restore_reserve) +HPAGEFLAG(Migratable, migratable) #ifdef CONFIG_HUGETLB_PAGE @@ -838,8 +843,6 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif -void set_page_huge_active(struct page *page); - #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index ec5d0290e0ee..db914477057b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -592,15 +592,9 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); -bool page_huge_active(struct page *page); #else TESTPAGEFLAG_FALSE(Huge) TESTPAGEFLAG_FALSE(HeadHuge) - -static inline bool page_huge_active(struct page *page) -{ - return 0; -} #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f5e85dabb7a3..727c09713627 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1364,30 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -/* - * Test to determine whether the hugepage is "active/in-use" (i.e. being linked - * to hstate->hugepage_activelist.) - * - * This function can be called for tail pages, but never returns true for them. - */ -bool page_huge_active(struct page *page) -{ - return PageHeadHuge(page) && PagePrivate(&page[1]); -} - -/* never called for tail page */ -void set_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - SetPagePrivate(&page[1]); -} - -static void clear_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - ClearPagePrivate(&page[1]); -} - /* * Internal hugetlb specific page flag. Do not use outside of the hugetlb * code @@ -1449,7 +1425,7 @@ static void __free_huge_page(struct page *page) } spin_lock(&hugetlb_lock); - clear_page_huge_active(page); + ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), @@ -4218,7 +4194,7 @@ retry_avoidcopy: make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); - set_page_huge_active(new_page); + SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -4455,12 +4431,12 @@ retry: spin_unlock(ptl); /* - * Only make newly allocated pages active. Existing pages found - * in the pagecache could be !page_huge_active() if they have been - * isolated for migration. + * Only set HPageMigratable in newly allocated pages. Existing pages + * found in the pagecache may not have HPageMigratable set if they have + * been isolated for migration.
*/ if (new_page) - set_page_huge_active(page); + SetHPageMigratable(page); unlock_page(page); out: @@ -4771,7 +4747,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - set_page_huge_active(page); + SetHPageMigratable(page); if (vm_shared) unlock_page(page); ret = 0; @@ -5610,12 +5586,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list) bool ret = true; spin_lock(&hugetlb_lock); - if (!PageHeadHuge(page) || !page_huge_active(page) || + if (!PageHeadHuge(page) || + !HPageMigratable(page) || !get_page_unless_zero(page)) { ret = false; goto unlock; } - clear_page_huge_active(page); + ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: spin_unlock(&hugetlb_lock); @@ -5625,7 +5602,7 @@ unlock: void putback_active_hugepage(struct page *page) { spin_lock(&hugetlb_lock); - set_page_huge_active(page); + SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ddcb1cd24c60..abe43c1ae920 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (!PageHuge(page)) continue; head = compound_head(page); - if (page_huge_active(head)) + /* + * This test is racy as we hold no reference or lock. The + * hugetlb page could have been freed and head is no longer + * a hugetlb page before the following check. In such unlikely + * cases false positives and negatives are possible. Calling + * code must deal with these scenarios. + */ + if (HPageMigratable(head)) goto found; skip = compound_nr(head) - (page - head); pfn += skip - 1; From 9157c31186c358c5750dea50ac5705d61d7fc917 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:00 -0800 Subject: [PATCH 147/172] hugetlb: convert PageHugeTemporary() to HPageTemporary flag Use the new hugetlb specific HPageTemporary flag to replace the PageHugeTemporary() interfaces. PageHugeTemporary does contain a PageHuge() check. However, this interface is only used within hugetlb code where we know we are dealing with a hugetlb page. Therefore, the check can be eliminated. Link: https://lkml.kernel.org/r/20210122195231.324857-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++++ mm/hugetlb.c | 36 +++++++----------------------------- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 77f2a032fe32..e97498a21010 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -483,10 +483,15 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. + * HPG_temporary - Set on a page that is temporarily allocated from the buddy + * allocator. Typically used for migration target pages when no pages + * are available in the pool. The hugetlb free page path will + * immediately free pages with this flag set to the buddy allocator.
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, + HPG_temporary, __NR_HPAGEFLAGS, }; @@ -530,6 +535,7 @@ static inline void ClearHPage##uname(struct page *page) \ */ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) +HPAGEFLAG(Temporary, temporary) #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 727c09713627..4d7d4535a313 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1364,28 +1364,6 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -/* - * Internal hugetlb specific page flag. Do not use outside of the hugetlb - * code - */ -static inline bool PageHugeTemporary(struct page *page) -{ - if (!PageHuge(page)) - return false; - - return (unsigned long)page[2].mapping == -1U; -} - -static inline void SetPageHugeTemporary(struct page *page) -{ - page[2].mapping = (void *)-1U; -} - -static inline void ClearPageHugeTemporary(struct page *page) -{ - page[2].mapping = NULL; -} - static void __free_huge_page(struct page *page) { /* @@ -1433,9 +1411,9 @@ static void __free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (PageHugeTemporary(page)) { + if (HPageTemporary(page)) { list_del(&page->lru); - ClearPageHugeTemporary(page); + ClearHPageTemporary(page); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ @@ -1869,7 +1847,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetPageHugeTemporary(page); + SetHPageTemporary(page); spin_unlock(&hugetlb_lock); put_page(page); return NULL; @@ -1900,7 +1878,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetPageHugeTemporary(page); + SetHPageTemporary(page); return page; } @@ -5625,12 +5603,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. */ - if (PageHugeTemporary(newpage)) { + if (HPageTemporary(newpage)) { int old_nid = page_to_nid(oldpage); int new_nid = page_to_nid(newpage); - SetPageHugeTemporary(oldpage); - ClearPageHugeTemporary(newpage); + SetHPageTemporary(oldpage); + ClearHPageTemporary(newpage); spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { From 6c037149014027d50175da5be4ae4531374dcbe0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:04 -0800 Subject: [PATCH 148/172] hugetlb: convert PageHugeFreed to HPageFreed flag Use new hugetlb specific HPageFreed flag to replace the PageHugeFreed interfaces. Link: https://lkml.kernel.org/r/20210122195231.324857-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 23 ++++------------------- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e97498a21010..eb2682fd97b6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -487,11 +487,13 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * allocator. 
Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. + * HPG_freed - Set when page is on the free lists. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, HPG_migratable, HPG_temporary, + HPG_freed, __NR_HPAGEFLAGS, }; @@ -536,6 +538,7 @@ static inline void ClearHPage##uname(struct page *page) \ HPAGEFLAG(RestoreReserve, restore_reserve) HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) +HPAGEFLAG(Freed, freed) #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4d7d4535a313..5377e1dad044 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -79,21 +79,6 @@ DEFINE_SPINLOCK(hugetlb_lock); static int num_fault_mutexes; struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; -static inline bool PageHugeFreed(struct page *head) -{ - return page_private(head + 4) == -1UL; -} - -static inline void SetPageHugeFreed(struct page *head) -{ - set_page_private(head + 4, -1UL); -} - -static inline void ClearPageHugeFreed(struct page *head) -{ - set_page_private(head + 4, 0); -} - /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); @@ -1053,7 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetPageHugeFreed(page); + SetHPageFreed(page); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1070,7 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); - ClearPageHugeFreed(page); + ClearHPageFreed(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -1485,7 +1470,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; - ClearPageHugeFreed(page); + ClearHPageFreed(page); spin_unlock(&hugetlb_lock); } @@ -1756,7 +1741,7 @@ retry: * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!PageHugeFreed(head))) { + if (unlikely(!HPageFreed(head))) { spin_unlock(&hugetlb_lock); cond_resched(); From d95c0337774b1dc74d271e7475a96fe8838332ea Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:08 -0800 Subject: [PATCH 149/172] include/linux/hugetlb.h: add synchronization information for new hugetlb specific flags Add comments, no functional change. Link: https://lkml.kernel.org/r/62a80585-2a73-10cc-4a2d-5721540d4ad2@oracle.com Signed-off-by: Mike Kravetz Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index eb2682fd97b6..3a541c6cf5c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -480,14 +480,24 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at * allocation time. Cleared when page is fully instantiated. Free * routine checks flag to restore a reservation on error paths. + * Synchronization: Examined or modified by code that knows it has + * the only reference to page. i.e. After allocation but before use + * or when the page is being freed. 
* HPG_migratable - Set after a newly allocated page is added to the page * cache and/or page tables. Indicates the page is a candidate for * migration. + * Synchronization: Initially set after new page allocation with no + * locking. When examined and modified during migration processing + * (isolate, migrate, putback) the hugetlb_lock is held. * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. + * Synchronization: Can be set after huge page allocation from buddy when + * code knows it has the only reference. All other examinations and + * modifications require hugetlb_lock. * HPG_freed - Set when page is on the free lists. + * Synchronization: hugetlb_lock held for examination and modification. */ enum hugetlb_page_flags { HPG_restore_reserve = 0, From ff5461176213d5fd5cfb7e981f9add4d856e415a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:11 -0800 Subject: [PATCH 150/172] hugetlb: fix uninitialized subpool pointer Gerald Schaefer reported a panic on s390 in hugepage_subpool_put_pages() with linux-next 5.12.0-20210222. Call trace: hugepage_subpool_put_pages.part.0+0x2c/0x138 __free_huge_page+0xce/0x310 alloc_pool_huge_page+0x102/0x120 set_max_huge_pages+0x13e/0x350 hugetlb_sysctl_handler_common+0xd8/0x110 hugetlb_sysctl_handler+0x48/0x58 proc_sys_call_handler+0x138/0x238 new_sync_write+0x10e/0x198 vfs_write.part.0+0x12c/0x238 ksys_write+0x68/0xf8 do_syscall+0x82/0xd0 __do_syscall+0xb4/0xc8 system_call+0x72/0x98 This is a result of the change which moved the hugetlb page subpool pointer from page->private to page[1]->private. When new pages are allocated from the buddy allocator, the private field of the head page will be cleared, but the private field of subpages is not modified. Therefore, old values may remain. Fix by initializing the hugetlb page subpool pointer in prep_new_huge_page(). Link: https://lkml.kernel.org/r/20210223215544.313871-1-mike.kravetz@oracle.com Fixes: f1280272ae4d ("hugetlb: use page.private for hugetlb specific page flags") Signed-off-by: Mike Kravetz Reported-by: Gerald Schaefer Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Gerald Schaefer Cc: Muchun Song Cc: Heiko Carstens Cc: Sven Schnelle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5377e1dad044..8e5f494291c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1465,6 +1465,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); spin_lock(&hugetlb_lock); From 519983645a9f2ec339cabfa0c6ef7b09be985dd0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 24 Feb 2021 12:09:15 -0800 Subject: [PATCH 151/172] mm/vmscan: restore zone_reclaim_mode ABI I went to go add a new RECLAIM_* mode for the zone_reclaim_mode sysctl. Like a good kernel developer, I also went to go update the documentation. I noticed that the bits in the documentation didn't match the bits in the #defines. The VM never explicitly checks the RECLAIM_ZONE bit. The bit is, however, implicitly checked when checking 'node_reclaim_mode==0'.
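A condensed sketch of that implicit check (the guard below is illustrative; the exact condition in mm/vmscan.c and the page allocator differs slightly):

	/* RECLAIM_ZONE (bit 0) is never tested on its own; it only
	 * matters because setting it makes the whole mask non-zero: */
	if (node_reclaim_mode == 0)
		return NODE_RECLAIM_NOSCAN;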
The RECLAIM_ZONE #define was removed in a cleanup. That, by itself, is fine. But, when the bit was removed (bit 0), the _other_ bit locations also got changed. That's not OK because the bit values are documented to mean one specific thing. Users surely do not expect the meaning to change from kernel to kernel. The end result is that if someone had a script that did: sysctl vm.zone_reclaim_mode=1 it would have gone from enabling node reclaim for clean unmapped pages to writing out pages during node reclaim after the commit in question. That's not great. Put the bits back the way they were and add a comment so something like this is a bit harder to do again. Update the documentation to make it clear that the first bit is ignored. Link: https://lkml.kernel.org/r/20210219172555.FF0CDF23@viggo.jf.intel.com Signed-off-by: Dave Hansen Fixes: 648b5cf368e0 ("mm/vmscan: remove unused RECLAIM_OFF/RECLAIM_ZONE") Reviewed-by: Ben Widawsky Reviewed-by: Oscar Salvador Acked-by: David Rientjes Acked-by: Christoph Lameter Cc: Alex Shi Cc: Daniel Wagner Cc: "Tobin C. Harding" Cc: Christoph Lameter Cc: Andrew Morton Cc: Huang Ying Cc: Dan Williams Cc: Qian Cai Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 10 +++++----- mm/vmscan.c | 9 +++++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index e35a3f2fb006..586cd4b86428 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -983,11 +983,11 @@ that benefit from having their data cached, zone_reclaim_mode should be left disabled as the caching effect is likely to be more important than data locality. -zone_reclaim may be enabled if it's known that the workload is partitioned -such that each partition fits within a NUMA node and that accessing remote -memory would cause a measurable performance reduction. The page allocator -will then reclaim easily reusable pages (those page cache pages that are -currently not used) before allocating off node pages. +Consider enabling one or more zone_reclaim mode bits if it's known that the +workload is partitioned such that each partition fits within a NUMA node +and that accessing remote memory would cause a measurable performance +reduction. The page allocator will take additional actions before +allocating off node pages. Allowing zone reclaim to write out pages stops processes that are writing large amounts of data from dirtying pages on other nodes. Zone diff --git a/mm/vmscan.c b/mm/vmscan.c index fea6b43bc1f9..562e87cbd7a1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4085,8 +4085,13 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for NODE_RECLAIM.
This determines the fraction of pages From 70ad3196a68b0857b49811da7a94ad4f5a8e75bb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:09:19 -0800 Subject: [PATCH 152/172] z3fold: remove unused attribute for release_z3fold_page Since commit dcf5aedb24f8 ("z3fold: stricter locking and more careful reclaim"), release_z3fold_page() is used again. So we can drop the unused attribute safely. Link: https://lkml.kernel.org/r/20210120084008.58432-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index dacb0d70fa61..8e53cdf4d190 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -541,8 +541,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) spin_unlock(&pool->stale_lock); } -static void __attribute__((__unused__)) - release_z3fold_page(struct kref *ref) +static void release_z3fold_page(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); From c457cd96f18c7137287700c409d2ae16c6395256 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:09:22 -0800 Subject: [PATCH 153/172] z3fold: simplify the zhdr initialization code in init_z3fold_page() We can simplify the zhdr initialization by calling memset() on the zhdr first instead of setting struct members to zero one by one. This also makes the code more compact and clear. Link: https://lkml.kernel.org/r/20210120085851.16159-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 8e53cdf4d190..c1ccf6bb0ffb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -413,16 +413,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, if (!slots) return NULL; + memset(zhdr, 0, sizeof(*zhdr)); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); - zhdr->first_chunks = 0; - zhdr->middle_chunks = 0; - zhdr->last_chunks = 0; - zhdr->first_num = 0; - zhdr->start_middle = 0; zhdr->cpu = -1; - zhdr->foreign_handles = 0; - zhdr->mapped_count = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); From d99fd5feb0ac1d56c36c760a8d922a46bd6c5521 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 24 Feb 2021 12:09:25 -0800 Subject: [PATCH 154/172] mm/compaction: remove rcu_read_lock during page compaction isolate_migratepages_block() used rcu_read_lock() with the intention of safeguarding against the mem_cgroup being destroyed concurrently; but its TestClearPageLRU already protects against that. Delete the unnecessary rcu_read_lock() and _unlock(). Hugh Dickins helped on commit log polishing, Thanks!
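To illustrate why the lock is unnecessary, the resulting sequence in isolate_migratepages_block() (reconstructed from the diff below, with added commentary) is:

	if (!TestClearPageLRU(page))
		goto isolate_fail_put;
	/* This task now owns the page's LRU state; the page pins its
	 * memcg, so looking up the lruvec needs no rcu_read_lock(). */
	lruvec = mem_cgroup_page_lruvec(page, pgdat);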
Link: https://lkml.kernel.org/r/1608614453-10739-3-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Hugh Dickins Cc: Hugh Dickins Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3aef10c61d0e..69ca3d6f9764 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -995,7 +995,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!TestClearPageLRU(page)) goto isolate_fail_put; - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); /* If we already hold the lock, we can skip some rechecking */ @@ -1005,7 +1004,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1026,8 +1024,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, SetPageLRU(page); goto isolate_fail_put; } - } else - rcu_read_unlock(); + } /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) From e2d26aa5fb393e930eb03628e8add7bd600a8b97 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:09:29 -0800 Subject: [PATCH 155/172] mm/compaction: remove duplicated VM_BUG_ON_PAGE !PageLocked The VM_BUG_ON_PAGE(!PageLocked(page), page) is also done in PageMovable. Remove this explicit one. Link: https://lkml.kernel.org/r/20210109081420.46030-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 69ca3d6f9764..b5220f275525 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -137,7 +137,6 @@ EXPORT_SYMBOL(__SetPageMovable); void __ClearPageMovable(struct page *page) { - VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageMovable(page), page); /* * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE From 40d7e2032007f9b2ea9aad7c1399cff3bef0239c Mon Sep 17 00:00:00 2001 From: Charan Teja Reddy Date: Wed, 24 Feb 2021 12:09:32 -0800 Subject: [PATCH 156/172] mm/compaction: correct deferral logic for proactive compaction should_proactive_compact_node() returns true when the sum of the weighted fragmentation scores of all the zones in the node is greater than the wmark_high of compaction, which then triggers the proactive compaction that operates on the individual zones of the node. But proactive compaction runs on a zone only when its weighted fragmentation score is greater than wmark_low (= wmark_high - 10). This means that the sum of the weighted fragmentation scores of all the zones can exceed wmark_high while the individual weighted zone scores are still less than wmark_low, resulting in an unnecessary trigger of proactive compaction that returns without doing anything. The issue with proactive compaction returning without even trying is its deferral. It is simply deferred for 1 << COMPACT_MAX_DEFER_SHIFT retries if the scores across the proactive compaction are the same, on the assumption that compaction did not make any progress, when in reality it did not even try. With a delay of 500msec between successive retries for proactive compaction, this can result in a deferral of ~30sec without even trying proactive compaction.
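For the arithmetic behind that estimate: COMPACT_MAX_DEFER_SHIFT is 6 in this kernel, so a full deferral skips 1 << 6 = 64 attempts, and at the 500 msec retry interval that is 64 * 0.5 sec = 32 sec, i.e. the quoted ~30 sec without a single compaction attempt.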
The test scenario is: compaction_proactiveness=50, thus wmark_low = 50 and wmark_high = 60. The system has 2 zones (Normal and Movable) with sizes 5GB and 6GB respectively. After opening some apps on Android, the weighted fragmentation scores of these zones are 47 and 49 respectively. Since the sum of these fragmentation scores is above wmark_high, proactive compaction is triggered; but since the individual zones' weighted fragmentation scores are below wmark_low, it returns without trying proactive compaction. As a result the weighted fragmentation scores of the zones are still 47 and 49, which makes the existing logic defer compaction, thinking that no progress was made. Fix this by checking just the zone's fragmentation score, not the weighted one, in __compact_finished(), and using the zone's weighted fragmentation score in fragmentation_score_node(). In the test case above, if the weighted average is above wmark_high, then the individual (unweighted) score of at least one zone has to be above wmark_high. This avoids the unnecessary triggers and deferrals of proactive compaction. Link: https://lkml.kernel.org/r/1610989938-31374-1-git-send-email-charante@codeaurora.org Signed-off-by: Charan Teja Reddy Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Reviewed-by: Khalid Aziz Acked-by: David Rientjes Cc: Michal Hocko Cc: Nitin Gupta Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b5220f275525..50b31e2ec6cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1922,20 +1922,28 @@ static bool kswapd_is_running(pg_data_t *pgdat) /* * A zone's fragmentation score is the external fragmentation wrt to the - * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value - * in the range [0, 100]. + * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100]. + */ +static unsigned int fragmentation_score_zone(struct zone *zone) +{ + return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); +} + +/* + * A weighted zone's fragmentation score is the external fragmentation + * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It + * returns a value in the range [0, 100]. * * The scaling factor ensures that proactive compaction focuses on larger * zones like ZONE_NORMAL, rather than smaller, specialized zones like * ZONE_DMA32. For smaller zones, the score value remains close to zero, * and thus never exceeds the high threshold for proactive compaction.
*/ -static unsigned int fragmentation_score_zone(struct zone *zone) +static unsigned int fragmentation_score_zone_weighted(struct zone *zone) { unsigned long score; - score = zone->present_pages * - extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + score = zone->present_pages * fragmentation_score_zone(zone); return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); } @@ -1955,7 +1963,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) struct zone *zone; zone = &pgdat->node_zones[zoneid]; - score += fragmentation_score_zone(zone); + score += fragmentation_score_zone_weighted(zone); } return score; From 15d28d0d11609c7a4f217b3d85e26456d9beb134 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Wed, 24 Feb 2021 12:09:36 -0800 Subject: [PATCH 157/172] mm/compaction: fix misbehaviors of fast_find_migrateblock() In fast_find_migrateblock(), the code iterates over the freelist to find a proper pageblock. But there are some misbehaviors. First, if the page we found is equal to cc->migrate_pfn, it is considered that we didn't find a suitable pageblock. Secondly, if the loop was terminated because order is less than PAGE_ALLOC_COSTLY_ORDER, it could be considered that we found a suitable one. Thirdly, if the skip bit is set on the page block and we goto continue, it doesn't check nr_scanned. Fourthly, if the page block's skip bit is set, it checks whether the page block is the last of the list, which is unnecessary. Link: https://lkml.kernel.org/r/20210128130411.6125-1-vvghjk1234@gmail.com Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: Wonhyuk Yang Acked-by: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 50b31e2ec6cb..3d9be07633d9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1698,6 +1698,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) unsigned long pfn = cc->migrate_pfn; unsigned long high_pfn; int order; + bool found_block = false; /* Skip hints are relied on to avoid repeats on the fast search */ if (cc->ignore_skip_hint) @@ -1740,7 +1741,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); for (order = cc->order - 1; order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit; order--) { struct free_area *area = &cc->zone->free_area[order]; struct list_head *freelist; @@ -1755,7 +1756,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) list_for_each_entry(freepage, freelist, lru) { unsigned long free_pfn; - nr_scanned++; + if (nr_scanned++ >= limit) { + move_freelist_tail(freelist, freepage); + break; + } + free_pfn = page_to_pfn(freepage); if (free_pfn < high_pfn) { /* @@ -1764,12 +1769,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * the list assumes an entry is deleted, not * reordered.
*/ - if (get_pageblock_skip(freepage)) { - if (list_is_last(freelist, &freepage->lru)) - break; - + if (get_pageblock_skip(freepage)) continue; - } /* Reorder so a future search skips recent pages */ move_freelist_tail(freelist, freepage); @@ -1777,15 +1778,10 @@ update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); cc->fast_search_fail = 0; + found_block = true; set_pageblock_skip(freepage); break; } - - if (nr_scanned >= limit) { - cc->fast_search_fail++; - move_freelist_tail(freelist, freepage); - break; - } } spin_unlock_irqrestore(&cc->zone->lock, flags); } @@ -1796,9 +1792,10 @@ * If fast scanning failed then use a cached entry for a page block * that had free pages as the basis for starting a linear scan. */ - if (pfn == cc->migrate_pfn) + if (!found_block) { + cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); - + } return pfn; } From 6e2b7044c199229a3d20cefbd3184968238c4184 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 24 Feb 2021 12:09:39 -0800 Subject: [PATCH 158/172] mm, compaction: make fast_isolate_freepages() stay within zone Compaction always operates on pages from a single given zone when isolating both pages to migrate and freepages. Pageblock boundaries are intersected with zone boundaries to be safe in case the zone starts or ends in the middle of a pageblock. The use of pageblock_pfn_to_page() protects against non-contiguous pageblocks. The functions fast_isolate_freepages() and fast_isolate_around() don't currently protect the fast freepage isolation thoroughly enough against these corner cases, and can result in freepage isolation operating outside of zone boundaries: - in fast_isolate_freepages() if we get a pfn from the first pageblock of a zone that starts in the middle of that pageblock, 'highest' can be a pfn outside of the zone. If we fail to isolate anything in this function, we may then call fast_isolate_around() on a pfn outside of the zone and there effectively do a set_pageblock_skip(page_to_pfn(highest)) which may currently hit a VM_BUG_ON() in some configurations - fast_isolate_around() checks only the zone end boundary and not the beginning, nor that the pageblock is contiguous (with pageblock_pfn_to_page()), so it's possible that we end up calling isolate_freepages_block() on a range of pfn's from two different zones and end up e.g. isolating freepages under the wrong zone's lock. This patch should fix the above issues.
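The core of the fix, visible in the diff below, is to clamp both pageblock boundaries to the zone and to re-derive the page via pageblock_pfn_to_page() before scanning:

	start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
	end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
	page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
	if (!page)
		return;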
Link: https://lkml.kernel.org/r/20210217173300.6394-1-vbabka@suse.cz Fixes: 5a811889de10 ("mm, compaction: use free lists to quickly locate a migration target") Signed-off-by: Vlastimil Babka Acked-by: David Rientjes Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Michal Hocko Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3d9be07633d9..e04f4476e68e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1284,7 +1284,7 @@ static void fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) { unsigned long start_pfn, end_pfn; - struct page *page = pfn_to_page(pfn); + struct page *page; /* Do not search around if there are enough pages already */ if (cc->nr_freepages >= cc->nr_migratepages) @@ -1295,8 +1295,12 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long return; /* Pageblock boundaries */ - start_pfn = pageblock_start_pfn(pfn); - end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1; + start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn); + end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)); + + page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone); + if (!page) + return; /* Scan before */ if (start_pfn != pfn) { @@ -1398,7 +1402,8 @@ fast_isolate_freepages(struct compact_control *cc) pfn = page_to_pfn(freepage); if (pfn >= highest) - highest = pageblock_start_pfn(pfn); + highest = max(pageblock_start_pfn(pfn), + cc->zone->zone_start_pfn); if (pfn >= low_pfn) { cc->fast_search_fail = 0; @@ -1468,7 +1473,8 @@ fast_isolate_freepages(struct compact_control *cc) } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { page = pageblock_pfn_to_page(min_pfn, - pageblock_end_pfn(min_pfn), + min(pageblock_end_pfn(min_pfn), + zone_end_pfn(cc->zone)), cc->zone); cc->free_pfn = min_pfn; } From bda420b985054a3badafef23807c4b4fa38a3dff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 24 Feb 2021 12:09:43 -0800 Subject: [PATCH 159/172] numa balancing: migrate on fault among multiple bound nodes Now, NUMA balancing can only optimize the page placement among the NUMA nodes if the default memory policy is used. Because the memory policy specified explicitly should take precedence. But this seems too strict in some situations. For example, on a system with 4 NUMA nodes, if the memory of an application is bound to the node 0 and 1, NUMA balancing can potentially migrate the pages between the node 0 and 1 to reduce cross-node accessing without breaking the explicit memory binding policy. So in this patch, we add the MPOL_F_NUMA_BALANCING mode flag to set_mempolicy() when mode is MPOL_BIND. With the flag specified, NUMA balancing will be enabled within the thread to optimize the page placement within the constraints of the specified memory binding policy. With the newly added flag, the NUMA balancing control mechanism becomes, - sysctl knob numa_balancing can enable/disable the NUMA balancing globally. - even if sysctl numa_balancing is enabled, the NUMA balancing will be disabled for the memory areas or applications with the explicit memory policy by default. - MPOL_F_NUMA_BALANCING can be used to enable the NUMA balancing for the applications when specifying the explicit memory policy (MPOL_BIND). Various page placement optimization based on the NUMA balancing can be done with these flags.
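A condensed sketch of the resulting set_mempolicy() validation, using the names from the mm/mempolicy.c hunk later in this patch (the error path is simplified here): the new flag is accepted only for MPOL_BIND and merely turns on the existing MPOL_F_MOF|MPOL_F_MORON machinery:

	if (flags & MPOL_F_NUMA_BALANCING) {
		if (new && new->mode == MPOL_BIND)
			new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
		else
			return -EINVAL;
	}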
As the first step, in this patch, if the memory of the application is bound to multiple nodes (MPOL_BIND), and in the hint page fault handler the accessing node is in the policy nodemask, an attempt will be made to migrate the page to the accessing node to reduce the cross-node accessing. If the newly added MPOL_F_NUMA_BALANCING flag is specified by an application on an old kernel version without its support, set_mempolicy() will return -1 and errno will be set to EINVAL. The application can use this behavior to run on both old and new kernel versions. And if the MPOL_F_NUMA_BALANCING flag is specified for a mode other than MPOL_BIND, set_mempolicy() will return -1 and errno will be set to EINVAL as before, because we don't support optimization based on NUMA balancing for these modes. In the previous version of the patch, we tried to reuse MPOL_MF_LAZY for mbind(). But that flag is tied to MPOL_MF_MOVE.*, so it does not seem to be a good API/ABI for the purpose of the patch. And because it's not clear whether it's necessary to enable NUMA balancing for a specific memory area inside an application, we only add the flag at the thread level (set_mempolicy()) instead of the memory area level (mbind()). We can do that when it becomes necessary. To test the patch, we run a test case as follows on a 4-node machine with 192 GB memory (48 GB per node). 1. Change the pmbench memory accessing benchmark to call set_mempolicy() to bind its memory to nodes 1 and 3 and enable NUMA balancing. Some related code snippets are as follows: #include <numaif.h> #include <numa.h> struct bitmask *bmp; int ret; bmp = numa_parse_nodestring("1,3"); ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING, bmp->maskp, bmp->size + 1); /* If MPOL_F_NUMA_BALANCING isn't supported, fall back to MPOL_BIND */ if (ret < 0 && errno == EINVAL) ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1); if (ret < 0) { perror("Failed to call set_mempolicy"); exit(-1); } 2. Run a memory eater on node 3 to use 40 GB memory before running pmbench. 3. Run pmbench with 64 processes; the working-set size of each process is 640 MB, so the total working-set size is 64 * 640 MB = 40 GB. The CPU and the memory (as in step 1.) of all pmbench processes are bound to nodes 1 and 3. So, after CPU usage is balanced, some pmbench processes running on the CPUs of node 3 will access the memory of node 1. 4. After the pmbench processes run for 100 seconds, kill the memory eater. Now it's possible for some pmbench processes to migrate their pages from node 1 to node 3 to reduce cross-node accessing. Test results show that, with the patch, the pages can be migrated from node 1 to node 3 after killing the memory eater, and the pmbench score can increase by about 17.5%.
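For convenience, a self-contained version of the snippet from step 1, as a sketch: link with -lnuma, and note the guarded fallback define for older uapi headers, with the value (1 << 13) taken from the include/uapi/linux/mempolicy.h hunk below:

	#include <numa.h>
	#include <numaif.h>
	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>

	#ifndef MPOL_F_NUMA_BALANCING
	#define MPOL_F_NUMA_BALANCING (1 << 13)	/* from the uapi hunk below */
	#endif

	int main(void)
	{
		struct bitmask *bmp = numa_parse_nodestring("1,3");
		int ret = set_mempolicy(MPOL_BIND | MPOL_F_NUMA_BALANCING,
					bmp->maskp, bmp->size + 1);
		/* Fall back for kernels without MPOL_F_NUMA_BALANCING */
		if (ret < 0 && errno == EINVAL)
			ret = set_mempolicy(MPOL_BIND, bmp->maskp, bmp->size + 1);
		if (ret < 0) {
			perror("set_mempolicy");
			exit(1);
		}
		return 0;
	}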
Link: https://lkml.kernel.org/r/20210120061235.148637-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Rik van Riel Cc: Johannes Weiner Cc: "Matthew Wilcox (Oracle)" Cc: Dave Hansen Cc: Andi Kleen Cc: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/mempolicy.h | 4 +++- mm/mempolicy.c | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 3354774af61e..8948467b3992 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -28,12 +28,14 @@ enum { /* Flags for set_mempolicy */ #define MPOL_F_STATIC_NODES (1 << 15) #define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_NUMA_BALANCING (1 << 13) /* Optimize with NUMA balancing if possible */ /* * MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to * either set_mempolicy() or mbind(). */ -#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES) +#define MPOL_MODE_FLAGS \ + (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES | MPOL_F_NUMA_BALANCING) /* Flags for get_mempolicy */ #define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2c3a86502053..6961238c7ef5 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } + if (flags & MPOL_F_NUMA_BALANCING) { + if (new && new->mode == MPOL_BIND) { + new->flags |= (MPOL_F_MOF | MPOL_F_MORON); + } else { + ret = -EINVAL; + mpol_put(new); + goto out; + } + } + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { mpol_put(new); @@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* Optimize placement among multiple nodes via NUMA balancing */ + if (pol->flags & MPOL_F_MORON) { + if (node_isset(thisnid, pol->v.nodes)) + break; + goto out; + } /* * allows binding to multiple nodes. From ce33135cdee6e2c2874e9d1198a6df0c5f356080 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:09:47 -0800 Subject: [PATCH 160/172] mm/mempolicy: use helper range_in_vma() in queue_pages_test_walk() The helper range_in_vma() is introduced via commit 017b1660df89 ("mm: migration: fix migration of huge PMD shared pages"). But we forgot to use it in queue_pages_test_walk(). Link: https://lkml.kernel.org/r/20210130091352.20220-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6961238c7ef5..ab51132547b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -677,7 +677,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long flags = qp->flags; /* range check first */ - VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma); + VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; From f8159c13905bba26f3e1782a521dacf7a66fc1ce Mon Sep 17 00:00:00 2001 From: Tang Yizhou Date: Wed, 24 Feb 2021 12:09:50 -0800 Subject: [PATCH 161/172] mm, oom: fix a comment in dump_task() If p is a kthread, it will be checked in oom_unkillable_task() so we can delete the corresponding comment. 
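For context, the kthread filtering that the message refers to happens in oom_unkillable_task(); at this point in the tree it reads approximately as follows (paraphrased from mm/oom_kill.c, not part of this diff):

	static bool oom_unkillable_task(struct task_struct *p)
	{
		if (is_global_init(p))
			return true;
		if (p->flags & PF_KTHREAD)
			return true;
		return false;
	}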
Link: https://lkml.kernel.org/r/20210125133006.7242-1-tangyizhou@huawei.com Signed-off-by: Tang Yizhou Acked-by: David Rientjes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c9a33ffe38b7..9efaf430cfd3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -395,9 +395,8 @@ static int dump_task(struct task_struct *p, void *arg) task = find_lock_task_mm(p); if (!task) { /* - * This is a kthread or all of p's threads have already - * detached their mm's. There's no need to report - * them; they can't be oom killed anyway. + * All of p's threads have already detached their mm's. There's + * no need to report them; they can't be oom killed anyway. */ return 0; } From 33b8f84a4ee78491a8f4f9e4c5520c9da4a10983 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:54 -0800 Subject: [PATCH 162/172] mm/hugetlb: change hugetlb_reserve_pages() to type bool While reviewing a bug in hugetlb_reserve_pages, it was noticed that all callers ignore the return value. Any failure is considered an ENOMEM error by the callers. Change the function to be of type bool. The function will return true if the reservation was successful, false otherwise. Callers currently assume a zero return code indicates success. Change the callers to look for true to indicate success. No functional change, only code cleanup. Link: https://lkml.kernel.org/r/20201221192542.15732-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Dan Carpenter Cc: Michal Hocko Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 37 ++++++++++++++----------------------- 3 files changed, 17 insertions(+), 26 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5a8ed6bd4f87..3eca85a4d940 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; - if (hugetlb_reserve_pages(inode, + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vma->vm_flags)) @@ -1493,7 +1493,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (hugetlb_reserve_pages(inode, 0, + if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a541c6cf5c3..cccd1aab69dd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -139,7 +139,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte, unsigned long dst_addr, unsigned long src_addr, struct page **pagep); -int hugetlb_reserve_pages(struct inode *inode, long from, long to, +bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8e5f494291c6..8fb42c6dd74b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5016,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, return pages << h->order; } -int hugetlb_reserve_pages(struct inode *inode, +/* Return true if 
reservation was successful, false otherwise. */ +bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long ret, chg, add = -1; + long chg, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -5031,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode, /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); - return -EINVAL; + return false; } /* @@ -5040,7 +5041,7 @@ int hugetlb_reserve_pages(struct inode *inode, * without using reserves */ if (vm_flags & VM_NORESERVE) - return 0; + return true; /* * Shared mappings base their reservation on the number of pages that @@ -5062,7 +5063,7 @@ int hugetlb_reserve_pages(struct inode *inode, /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return -ENOMEM; + return false; chg = to - from; @@ -5070,18 +5071,12 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) { - ret = chg; + if (chg < 0) goto out_err; - } - ret = hugetlb_cgroup_charge_cgroup_rsvd( - hstate_index(h), chg * pages_per_huge_page(h), &h_cg); - - if (ret < 0) { - ret = -ENOMEM; + if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), + chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - } if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs @@ -5096,19 +5091,15 @@ int hugetlb_reserve_pages(struct inode *inode, * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); - if (gbl_reserve < 0) { - ret = -ENOSPC; + if (gbl_reserve < 0) goto out_uncharge_cgroup; - } /* * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, gbl_reserve); - if (ret < 0) { + if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; - } /* * Account for the reservations made. Shared mappings record regions @@ -5126,7 +5117,6 @@ int hugetlb_reserve_pages(struct inode *inode, if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); - ret = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* @@ -5147,7 +5137,8 @@ int hugetlb_reserve_pages(struct inode *inode, hugetlb_acct_memory(h, -rsv_adjust); } } - return 0; + return true; + out_put_pages: /* put back original number of pages, chg */ (void)hugepage_subpool_put_pages(spool, chg); @@ -5163,7 +5154,7 @@ out_err: region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); - return ret; + return false; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, From a4fa34cdcd18296c097e2648fe894d28c5cf9709 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 24 Feb 2021 12:09:58 -0800 Subject: [PATCH 163/172] hugetlbfs: remove special hugetlbfs_set_page_dirty() Matthew Wilcox noticed that hugetlbfs_set_page_dirty always returns 0. Instead, it should return 1 or 0 depending on the previous state of the dirty bit. In addition, the call to compound_head is redundant as it is also performed in calling routine set_page_dirty. Replace the hugetlbfs specific routine hugetlbfs_set_page_dirty with __set_page_dirty_no_writeback as it addresses both of these issues. 
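For reference, __set_page_dirty_no_writeback() already reports the previous state of the dirty bit, which is exactly what hugetlbfs_set_page_dirty() failed to do. A condensed sketch of its mm/page-writeback.c implementation (paraphrased):

    int __set_page_dirty_no_writeback(struct page *page)
    {
            if (!PageDirty(page))
                    return !TestSetPageDirty(page); /* 1 iff we newly dirtied it */
            return 0;                               /* page was already dirty */
    }

Since the caller, set_page_dirty(), also resolves compound_head(), switching the address_space operation to this helper removes both the wrong return value and the redundant head-page lookup.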
Link: https://lkml.kernel.org/r/20201221192542.15732-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Suggested-by: Matthew Wilcox Cc: Dan Carpenter Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3eca85a4d940..dc4aceffa2c7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -952,17 +952,6 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns, return error; } -/* - * mark the head page dirty - */ -static int hugetlbfs_set_page_dirty(struct page *page) -{ - struct page *head = compound_head(page); - - SetPageDirty(head); - return 0; -} - static int hugetlbfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -1150,7 +1139,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, - .set_page_dirty = hugetlbfs_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .migratepage = hugetlbfs_migrate_page, .error_remove_page = hugetlbfs_error_remove_page, }; From d0146756a0993d3a01407b38cd87d965ccda72c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:01 -0800 Subject: [PATCH 164/172] hugetlbfs: remove useless BUG_ON(!inode) in hugetlbfs_setattr() When we reach here with inode = NULL, we should have crashed as inode has already been dereferenced via hstate_inode. So this BUG_ON(!inode) does not take effect and should be removed. Link: https://lkml.kernel.org/r/20210118110700.52506-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dc4aceffa2c7..71a00dbfa612 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -761,8 +761,6 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - BUG_ON(!inode); - error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; From 3b2275a8d83a29e579b4f96f4c431d824e5f4a16 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:04 -0800 Subject: [PATCH 165/172] hugetlbfs: use helper macro default_hstate in init_hugetlbfs_fs Since commit e5ff215941d5 ("hugetlb: multiple hstates for multiple page sizes"), we can use macro default_hstate to get the struct hstate which we use by default. But init_hugetlbfs_fs() forgot to use it. 
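For reference, the macro is just the array lookup spelled out; roughly, from include/linux/hugetlb.h:

    #define default_hstate (hstates[default_hstate_idx])

So the change below is purely cosmetic, replacing the open-coded lookup with the existing helper.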
Link: https://lkml.kernel.org/r/20210116091827.20982-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 71a00dbfa612..32c8ae4e1e48 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1543,7 +1543,7 @@ static int __init init_hugetlbfs_fs(void) goto out_free; /* default hstate mount is required */ - mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; From c7e285e31f76453bc958006ebe5311a6cca909e3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:08 -0800 Subject: [PATCH 166/172] hugetlbfs: correct obsolete function name in hugetlbfs_read_iter() Since commit 36e789144267 ("kill do_generic_mapping_read"), the function do_generic_mapping_read() was renamed to do_generic_file_read(), and commit 47c27bc46946 ("fs: pass iocb to do_generic_file_read") then renamed it to generic_file_buffered_read(). So replace do_generic_mapping_read() with generic_file_buffered_read() to keep the comment up to date. Link: https://lkml.kernel.org/r/20210118063210.47118-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32c8ae4e1e48..974eee6736d2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -310,7 +310,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * data. Its *very* similar to generic_file_buffered_read(), we can't use that * since it has PAGE_SIZE assumptions. */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) From 88ce3fef47f3f382985ecefe8f290b6ff05b4335 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:11 -0800 Subject: [PATCH 167/172] hugetlbfs: remove meaningless variable avoid_reserve The variable avoid_reserve is meaningless because its value is never changed before it is passed to alloc_huge_page(). So remove it to make clear that hugetlbfs_fallocate() never avoids reserves when allocating huge pages. Also add a comment offered by Mike Kravetz to explain this.
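For context, a sketch of the allocator entry point involved (signature paraphrased from mm/hugetlb.c of this era):

    struct page *alloc_huge_page(struct vm_area_struct *vma,
                                 unsigned long addr, int avoid_reserve);

    /* hugetlbfs_fallocate() always passed avoid_reserve == 0, so the local
     * variable carried no information; the diff below drops it and passes a
     * literal 0, allowing the allocation to consume any inode-level reserves
     * covering the range. */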
Link: https://lkml.kernel.org/r/20210120071508.9078-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 974eee6736d2..7982adc3d98d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -680,7 +680,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, */ struct page *page; unsigned long addr; - int avoid_reserve = 0; cond_resched(); @@ -716,8 +715,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, continue; } - /* Allocate page and add to page cache */ - page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + /* + * Allocate page without setting the avoid_reserve argument. + * There certainly are no reserves associated with the + * pseudo_vma. However, there could be shared mappings with + * reserves for the file at the inode level. If we fallocate + * pages in these areas, we need to consume the reserves + * to keep reservation accounting consistent. + */ + page = alloc_huge_page(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(page)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); From a25fddced835ae53d18eb4bddabd719b4cebf624 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:14 -0800 Subject: [PATCH 168/172] hugetlbfs: make hugepage size conversion more readable The calculation 1U << (h->order + PAGE_SHIFT - 10) is actually equal to (PAGE_SIZE << h->order) >> 10. So we can make it more readable by replacing it with huge_page_size(h) >> 10. Link: https://lkml.kernel.org/r/20210122083141.24548-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7982adc3d98d..46fad3d9265b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1520,8 +1520,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) put_fs_context(fc); } if (IS_ERR(mnt)) - pr_err("Cannot mount internal hugetlbfs for page size %uK", - 1U << (h->order + PAGE_SHIFT - 10)); + pr_err("Cannot mount internal hugetlbfs for page size %luK", + huge_page_size(h) >> 10); return mnt; } From 398c0da7364c907ccc662416585c19c5523cf678 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:18 -0800 Subject: [PATCH 169/172] hugetlbfs: correct some obsolete comments about inode i_mutex Since commit 9902af79c01a ("parallel lookups: actual switch to rwsem"), the inode's i_mutex was converted to i_rwsem. So replace i_mutex with i_rwsem to keep the comments up to date.
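For reference, the inode locking helpers have operated on the rwsem ever since that switch; a condensed sketch of the include/linux/fs.h wrappers (paraphrased):

    static inline void inode_lock(struct inode *inode)
    {
            down_write(&inode->i_rwsem);
    }

    static inline void inode_unlock(struct inode *inode)
    {
            up_write(&inode->i_rwsem);
    }

So code that takes inode_lock(), such as the hole-punch and setattr paths below, is in fact protected by i_rwsem, which is what the corrected comments now say.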
Link: https://lkml.kernel.org/r/20210127093111.36672-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 46fad3d9265b..f5758ba372ae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -604,7 +604,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; @@ -777,7 +777,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, if (newsize & ~huge_page_mask(h)) return -EINVAL; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; From 1935ebd3cf6c44038479bb2e7b4dd99bd492b3f2 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:21 -0800 Subject: [PATCH 170/172] hugetlbfs: fix some comment typos Fix typos reserv to reserve, minimim to minimum. No functional change intended. Link: https://lkml.kernel.org/r/20210130092351.28072-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f5758ba372ae..394da2ab08ad 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -442,15 +442,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. - * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents * page faults in the truncated range by checking i_size. i_size is * modified while holding i_mmap_rwsem. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. - * Only when releasing a page is the associated region/reserv map - * deleted. The region/reserv map for ranges without associated + * Only when releasing a page is the associated region/reserve map + * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but @@ -1343,7 +1343,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is - * specified. Any needed reservations (for minimim size) are taken + * specified. Any needed reservations (for minimum size) are taken * taken when the subpool is created. 
*/ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { From e5d319dedafd21211fd19ea28a3f50da7368d6ff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 24 Feb 2021 12:10:25 -0800 Subject: [PATCH 171/172] hugetlbfs: remove unneeded return value of hugetlb_vmtruncate() The function hugetlb_vmtruncate() has been guaranteed to always succeed since commit 7aa91e104028 ("hugetlb: allow extending ftruncate on hugetlbfs"). So remove its unneeded return value, which is always 0. Link: https://lkml.kernel.org/r/20210208084637.47789-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 394da2ab08ad..701c82c36138 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -567,7 +567,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; @@ -582,7 +582,6 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); - return 0; } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -781,9 +780,7 @@ static int hugetlbfs_setattr(struct user_namespace *mnt_userns, if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; - error = hugetlb_vmtruncate(inode, newsize); - if (error) - return error; + hugetlb_vmtruncate(inode, newsize); } setattr_copy(&init_user_ns, inode, attr); From a553e3cd2053501b658feec2be9a3b662eb1b22b Mon Sep 17 00:00:00 2001 From: Chengyang Fan Date: Wed, 24 Feb 2021 12:10:28 -0800 Subject: [PATCH 172/172] mm/migrate: remove unneeded semicolons Remove superfluous semicolons after function definitions. Link: https://lkml.kernel.org/r/20210115110131.2359683-1-cy.fan@huawei.com Signed-off-by: Chengyang Fan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 4594838a0f7c..3a389633b68f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -89,7 +89,7 @@ extern int PageMovable(struct page *page); extern void __SetPageMovable(struct page *page, struct address_space *mapping); extern void __ClearPageMovable(struct page *page); #else -static inline int PageMovable(struct page *page) { return 0; }; +static inline int PageMovable(struct page *page) { return 0; } static inline void __SetPageMovable(struct page *page, struct address_space *mapping) {