From d32311fed70d12f14e585feb4653571b1e2b0e6d Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Sep 2005 14:41:40 +1000 Subject: [PATCH 01/98] [PATCH] Introduce sg_set_buf sg_init_one is a nice tool for the block layer. However, users of struct scatterlist in other subsystems don't usually need the DMA attributes. For them it's a waste of time and space to initialise the whole struct scatterlist structure. Therefore this patch adds a new function sg_set_buf to initialise a scatterlist without zeroing the DMA attributes. Signed-off-by: Herbert Xu --- include/linux/scatterlist.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 7f717e95ae37..66ff545552f7 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -1,14 +1,23 @@ #ifndef _LINUX_SCATTERLIST_H #define _LINUX_SCATTERLIST_H -static inline void sg_init_one(struct scatterlist *sg, - u8 *buf, unsigned int buflen) -{ - memset(sg, 0, sizeof(*sg)); +#include +#include +#include +static inline void sg_set_buf(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ sg->page = virt_to_page(buf); sg->offset = offset_in_page(buf); sg->length = buflen; } +static inline void sg_init_one(struct scatterlist *sg, void *buf, + unsigned int buflen) +{ + memset(sg, 0, sizeof(*sg)); + sg_set_buf(sg, buf, buflen); +} + #endif /* _LINUX_SCATTERLIST_H */ From 378f058cc49bcda7fa63d3cd86d2f9a0a5188b1c Mon Sep 17 00:00:00 2001 From: David Hardeman Date: Sat, 17 Sep 2005 17:55:31 +1000 Subject: [PATCH 02/98] [PATCH] Use sg_set_buf/sg_init_one where applicable This patch uses sg_set_buf/sg_init_one in some places where it was duplicated. Signed-off-by: David Hardeman Cc: James Bottomley Cc: Greg KH Cc: "David S. 
Miller" Cc: Jeff Garzik Signed-off-by: Andrew Morton Signed-off-by: Herbert Xu --- crypto/hmac.c | 19 +++------- crypto/tcrypt.c | 52 ++++++++------------------- drivers/md/dm-crypt.c | 12 +++---- drivers/net/wireless/airo.c | 5 ++- drivers/scsi/arm/scsi.h | 6 ++-- drivers/scsi/libata-core.c | 10 ++---- drivers/scsi/sg.c | 5 ++- drivers/usb/misc/usbtest.c | 7 ++-- net/ipv6/addrconf.c | 10 ++---- net/sunrpc/auth_gss/gss_krb5_crypto.c | 23 ++++-------- 10 files changed, 44 insertions(+), 105 deletions(-) diff --git a/crypto/hmac.c b/crypto/hmac.c index da0456b37109..46120dee5ada 100644 --- a/crypto/hmac.c +++ b/crypto/hmac.c @@ -18,18 +18,15 @@ #include #include #include -#include +#include #include "internal.h" static void hash_key(struct crypto_tfm *tfm, u8 *key, unsigned int keylen) { struct scatterlist tmp; - tmp.page = virt_to_page(key); - tmp.offset = offset_in_page(key); - tmp.length = keylen; + sg_set_buf(&tmp, key, keylen); crypto_digest_digest(tfm, &tmp, 1, key); - } int crypto_alloc_hmac_block(struct crypto_tfm *tfm) @@ -69,9 +66,7 @@ void crypto_hmac_init(struct crypto_tfm *tfm, u8 *key, unsigned int *keylen) for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++) ipad[i] ^= 0x36; - tmp.page = virt_to_page(ipad); - tmp.offset = offset_in_page(ipad); - tmp.length = crypto_tfm_alg_blocksize(tfm); + sg_set_buf(&tmp, ipad, crypto_tfm_alg_blocksize(tfm)); crypto_digest_init(tfm); crypto_digest_update(tfm, &tmp, 1); @@ -103,16 +98,12 @@ void crypto_hmac_final(struct crypto_tfm *tfm, u8 *key, for (i = 0; i < crypto_tfm_alg_blocksize(tfm); i++) opad[i] ^= 0x5c; - tmp.page = virt_to_page(opad); - tmp.offset = offset_in_page(opad); - tmp.length = crypto_tfm_alg_blocksize(tfm); + sg_set_buf(&tmp, opad, crypto_tfm_alg_blocksize(tfm)); crypto_digest_init(tfm); crypto_digest_update(tfm, &tmp, 1); - tmp.page = virt_to_page(out); - tmp.offset = offset_in_page(out); - tmp.length = crypto_tfm_alg_digestsize(tfm); + sg_set_buf(&tmp, out, crypto_tfm_alg_digestsize(tfm)); 
crypto_digest_update(tfm, &tmp, 1); crypto_digest_final(tfm, out); diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 68639419c5bd..577a3aff3113 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include @@ -86,7 +86,6 @@ static void hexdump(unsigned char *buf, unsigned int len) static void test_hash(char *algo, struct hash_testvec *template, unsigned int tcount) { - char *p; unsigned int i, j, k, temp; struct scatterlist sg[8]; char result[64]; @@ -116,10 +115,7 @@ static void test_hash(char *algo, struct hash_testvec *template, printk("test %u:\n", i + 1); memset(result, 0, 64); - p = hash_tv[i].plaintext; - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = hash_tv[i].psize; + sg_set_buf(&sg[0], hash_tv[i].plaintext, hash_tv[i].psize); crypto_digest_init(tfm); if (tfm->crt_u.digest.dit_setkey) { @@ -154,10 +150,8 @@ static void test_hash(char *algo, struct hash_testvec *template, hash_tv[i].plaintext + temp, hash_tv[i].tap[k]); temp += hash_tv[i].tap[k]; - p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page(p); - sg[k].offset = offset_in_page(p); - sg[k].length = hash_tv[i].tap[k]; + sg_set_buf(&sg[k], &xbuf[IDX[k]], + hash_tv[i].tap[k]); } crypto_digest_digest(tfm, sg, hash_tv[i].np, result); @@ -179,7 +173,6 @@ static void test_hash(char *algo, struct hash_testvec *template, static void test_hmac(char *algo, struct hmac_testvec *template, unsigned int tcount) { - char *p; unsigned int i, j, k, temp; struct scatterlist sg[8]; char result[64]; @@ -210,11 +203,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template, printk("test %u:\n", i + 1); memset(result, 0, sizeof (result)); - p = hmac_tv[i].plaintext; klen = hmac_tv[i].ksize; - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = hmac_tv[i].psize; + sg_set_buf(&sg[0], hmac_tv[i].plaintext, hmac_tv[i].psize); crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, 1, 
result); @@ -243,10 +233,8 @@ static void test_hmac(char *algo, struct hmac_testvec *template, hmac_tv[i].plaintext + temp, hmac_tv[i].tap[k]); temp += hmac_tv[i].tap[k]; - p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page(p); - sg[k].offset = offset_in_page(p); - sg[k].length = hmac_tv[i].tap[k]; + sg_set_buf(&sg[k], &xbuf[IDX[k]], + hmac_tv[i].tap[k]); } crypto_hmac(tfm, hmac_tv[i].key, &klen, sg, @@ -270,7 +258,7 @@ static void test_cipher(char *algo, int mode, int enc, { unsigned int ret, i, j, k, temp; unsigned int tsize; - char *p, *q; + char *q; struct crypto_tfm *tfm; char *key; struct cipher_testvec *cipher_tv; @@ -330,10 +318,8 @@ static void test_cipher(char *algo, int mode, int enc, goto out; } - p = cipher_tv[i].input; - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = cipher_tv[i].ilen; + sg_set_buf(&sg[0], cipher_tv[i].input, + cipher_tv[i].ilen); if (!mode) { crypto_cipher_set_iv(tfm, cipher_tv[i].iv, @@ -389,10 +375,8 @@ static void test_cipher(char *algo, int mode, int enc, cipher_tv[i].input + temp, cipher_tv[i].tap[k]); temp += cipher_tv[i].tap[k]; - p = &xbuf[IDX[k]]; - sg[k].page = virt_to_page(p); - sg[k].offset = offset_in_page(p); - sg[k].length = cipher_tv[i].tap[k]; + sg_set_buf(&sg[k], &xbuf[IDX[k]], + cipher_tv[i].tap[k]); } if (!mode) { @@ -436,9 +420,7 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p, int bcount; int ret; - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = blen; + sg_set_buf(&sg[0], p, blen); for (start = jiffies, end = start + sec * HZ, bcount = 0; time_before(jiffies, end); bcount++) { @@ -464,9 +446,7 @@ static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p, int ret = 0; int i; - sg[0].page = virt_to_page(p); - sg[0].offset = offset_in_page(p); - sg[0].length = blen; + sg_set_buf(&sg[0], p, blen); local_bh_disable(); local_irq_disable(); @@ -709,9 +689,7 @@ static void test_crc32c(void) for (i = 0; i < NUMVEC; 
i++) { for (j = 0; j < VECSIZE; j++) test_vec[i][j] = ++b; - sg[i].page = virt_to_page(test_vec[i]); - sg[i].offset = offset_in_page(test_vec[i]); - sg[i].length = VECSIZE; + sg_set_buf(&sg[i], test_vec[i], VECSIZE); } seed = SEEDTESTVAL; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 28c1a628621f..cf6631056683 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include "dm.h" @@ -164,9 +164,7 @@ static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti, return -ENOMEM; } - sg.page = virt_to_page(cc->key); - sg.offset = offset_in_page(cc->key); - sg.length = cc->key_size; + sg_set_buf(&sg, cc->key, cc->key_size); crypto_digest_digest(hash_tfm, &sg, 1, salt); crypto_free_tfm(hash_tfm); @@ -207,14 +205,12 @@ static void crypt_iv_essiv_dtr(struct crypt_config *cc) static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector) { - struct scatterlist sg = { NULL, }; + struct scatterlist sg; memset(iv, 0, cc->iv_size); *(u64 *)iv = cpu_to_le64(sector); - sg.page = virt_to_page(iv); - sg.offset = offset_in_page(iv); - sg.length = cc->iv_size; + sg_set_buf(&sg, iv, cc->iv_size); crypto_cipher_encrypt((struct crypto_tfm *)cc->iv_gen_private, &sg, &sg, cc->iv_size); diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index 4c11699bad91..1609ce11389d 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -1590,9 +1591,7 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct aes_counter[12] = (u8)(counter >> 24); counter++; memcpy (plain, aes_counter, 16); - sg[0].page = virt_to_page(plain); - sg[0].offset = ((long) plain & ~PAGE_MASK); - sg[0].length = 16; + sg_set_buf(&sg[0], plain, 16); crypto_cipher_encrypt(tfm, sg, sg, 16); cipher = kmap(sg[0].page) + sg[0].offset; for (j=0; (j<16) && (i< 
(sizeof(context->coeff)/sizeof(context->coeff[0]))); ) { diff --git a/drivers/scsi/arm/scsi.h b/drivers/scsi/arm/scsi.h index 48e1c4d9738b..19937640e2e7 100644 --- a/drivers/scsi/arm/scsi.h +++ b/drivers/scsi/arm/scsi.h @@ -10,6 +10,8 @@ * Commonly used scsi driver functions. */ +#include + #define BELT_AND_BRACES /* @@ -22,9 +24,7 @@ static inline int copy_SCp_to_sg(struct scatterlist *sg, Scsi_Pointer *SCp, int BUG_ON(bufs + 1 > max); - sg->page = virt_to_page(SCp->ptr); - sg->offset = offset_in_page(SCp->ptr); - sg->length = SCp->this_residual; + sg_set_buf(sg, SCp->ptr, SCp->this_residual); if (bufs) memcpy(sg + 1, SCp->buffer + 1, diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index b1b1c6f01419..5ca97605ff35 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include "scsi.h" #include "scsi_priv.h" @@ -2576,19 +2577,12 @@ void ata_qc_prep(struct ata_queued_cmd *qc) void ata_sg_init_one(struct ata_queued_cmd *qc, void *buf, unsigned int buflen) { - struct scatterlist *sg; - qc->flags |= ATA_QCFLAG_SINGLE; - memset(&qc->sgent, 0, sizeof(qc->sgent)); qc->sg = &qc->sgent; qc->n_elem = 1; qc->buf_virt = buf; - - sg = qc->sg; - sg->page = virt_to_page(buf); - sg->offset = (unsigned long) buf & ~PAGE_MASK; - sg->length = buflen; + sg_init_one(qc->sg, buf, buflen); } /** diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 861e51375d70..07fee811c09e 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -49,6 +49,7 @@ static int sg_version_num = 30533; /* 2 digits for each component */ #include #include #include +#include #include "scsi.h" #include @@ -1992,9 +1993,7 @@ sg_build_indirect(Sg_scatter_hold * schp, Sg_fd * sfp, int buff_size) if (!p) break; } - sclp->page = virt_to_page(p); - sclp->offset = offset_in_page(p); - sclp->length = ret_sz; + sg_set_buf(sclp, p, ret_sz); SCSI_LOG_TIMEOUT(5, printk("sg_build_build: k=%d, a=0x%p, len=%d\n", k, 
sg_scatg2virt(sclp), ret_sz)); diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index 90a96257d6ce..2997f558159b 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include @@ -381,7 +381,6 @@ alloc_sglist (int nents, int max, int vary) sg = kmalloc (nents * sizeof *sg, SLAB_KERNEL); if (!sg) return NULL; - memset (sg, 0, nents * sizeof *sg); for (i = 0; i < nents; i++) { char *buf; @@ -394,9 +393,7 @@ alloc_sglist (int nents, int max, int vary) memset (buf, 0, size); /* kmalloc pages are always physically contiguous! */ - sg [i].page = virt_to_page (buf); - sg [i].offset = offset_in_page (buf); - sg [i].length = size; + sg_init_one(&sg[i], buf, size); if (vary) { size += vary; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a970b4727ce8..41edc14851e8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -75,7 +75,7 @@ #ifdef CONFIG_IPV6_PRIVACY #include #include -#include +#include #endif #include @@ -1217,12 +1217,8 @@ static int __ipv6_regen_rndid(struct inet6_dev *idev) struct net_device *dev; struct scatterlist sg[2]; - sg[0].page = virt_to_page(idev->entropy); - sg[0].offset = offset_in_page(idev->entropy); - sg[0].length = 8; - sg[1].page = virt_to_page(idev->work_eui64); - sg[1].offset = offset_in_page(idev->work_eui64); - sg[1].length = 8; + sg_set_buf(&sg[0], idev->entropy, 8); + sg_set_buf(&sg[1], idev->work_eui64, 8); dev = idev->dev; diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 3f3d5437f02d..e65e1f979275 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include @@ -75,9 +75,7 @@ krb5_encrypt( memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg[0].page = virt_to_page(out); - sg[0].offset = offset_in_page(out); - sg[0].length 
= length; + sg_set_buf(&sg[0], out, length); ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); @@ -117,9 +115,7 @@ krb5_decrypt( memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg[0].page = virt_to_page(out); - sg[0].offset = offset_in_page(out); - sg[0].length = length; + sg_set_buf(&sg[0], out, length); ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); @@ -132,13 +128,6 @@ out: EXPORT_SYMBOL(krb5_decrypt); -static void -buf_to_sg(struct scatterlist *sg, char *ptr, int len) { - sg->page = virt_to_page(ptr); - sg->offset = offset_in_page(ptr); - sg->length = len; -} - static int process_xdr_buf(struct xdr_buf *buf, int offset, int len, int (*actor)(struct scatterlist *, void *), void *data) @@ -152,7 +141,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len, thislen = buf->head[0].iov_len - offset; if (thislen > len) thislen = len; - buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); + sg_set_buf(sg, buf->head[0].iov_base + offset, thislen); ret = actor(sg, data); if (ret) goto out; @@ -195,7 +184,7 @@ process_xdr_buf(struct xdr_buf *buf, int offset, int len, thislen = buf->tail[0].iov_len - offset; if (thislen > len) thislen = len; - buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); + sg_set_buf(sg, buf->tail[0].iov_base + offset, thislen); ret = actor(sg, data); len -= thislen; } @@ -241,7 +230,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, goto out; crypto_digest_init(tfm); - buf_to_sg(sg, header, hdrlen); + sg_set_buf(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); process_xdr_buf(body, body_offset, body->len - body_offset, checksummer, tfm); From 6df5b9f48dd0e77fa796b9b7d3fde7cc5f1237f2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Sep 2005 22:30:11 +1000 Subject: [PATCH 03/98] [CRYPTO] Simplify one-member scatterlist expressions This patch rewrites various occurrences of &sg[0] where sg is an array of length one to simply sg. 
Signed-off-by: Herbert Xu --- crypto/tcrypt.c | 8 ++++---- drivers/net/wireless/airo.c | 4 ++-- net/sunrpc/auth_gss/gss_krb5_crypto.c | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c index 577a3aff3113..53f4ee804bdb 100644 --- a/crypto/tcrypt.c +++ b/crypto/tcrypt.c @@ -415,12 +415,12 @@ out: static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p, int blen, int sec) { - struct scatterlist sg[8]; + struct scatterlist sg[1]; unsigned long start, end; int bcount; int ret; - sg_set_buf(&sg[0], p, blen); + sg_set_buf(sg, p, blen); for (start = jiffies, end = start + sec * HZ, bcount = 0; time_before(jiffies, end); bcount++) { @@ -441,12 +441,12 @@ static int test_cipher_jiffies(struct crypto_tfm *tfm, int enc, char *p, static int test_cipher_cycles(struct crypto_tfm *tfm, int enc, char *p, int blen) { - struct scatterlist sg[8]; + struct scatterlist sg[1]; unsigned long cycles = 0; int ret = 0; int i; - sg_set_buf(&sg[0], p, blen); + sg_set_buf(sg, p, blen); local_bh_disable(); local_irq_disable(); diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index 1609ce11389d..750c0167539c 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -1591,9 +1591,9 @@ static void emmh32_setseed(emmh32_context *context, u8 *pkey, int keylen, struct aes_counter[12] = (u8)(counter >> 24); counter++; memcpy (plain, aes_counter, 16); - sg_set_buf(&sg[0], plain, 16); + sg_set_buf(sg, plain, 16); crypto_cipher_encrypt(tfm, sg, sg, 16); - cipher = kmap(sg[0].page) + sg[0].offset; + cipher = kmap(sg->page) + sg->offset; for (j=0; (j<16) && (i< (sizeof(context->coeff)/sizeof(context->coeff[0]))); ) { context->coeff[i++] = ntohl(*(u32 *)&cipher[j]); j += 4; diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index e65e1f979275..97c981fa6b8e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -75,7 
+75,7 @@ krb5_encrypt( memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg_set_buf(&sg[0], out, length); + sg_set_buf(sg, out, length); ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); @@ -115,7 +115,7 @@ krb5_decrypt( memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); memcpy(out, in, length); - sg_set_buf(&sg[0], out, length); + sg_set_buf(sg, out, length); ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); From 1b40efd772f4419fbc1a8940506424246985c333 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 3 Oct 2005 15:15:36 +1000 Subject: [PATCH 04/98] [CRYPTO] Check cra_alignmask against cra_blocksize The cipher code relies on the fact that the block size is a multiple of the required alignment. So we should check this at the time of algorithm registration. We also ensure that the block size is bounded by the page size. Signed-off-by: Herbert Xu --- crypto/api.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crypto/api.c b/crypto/api.c index 959c4e5f264f..40ae42e9b6a6 100644 --- a/crypto/api.c +++ b/crypto/api.c @@ -215,7 +215,10 @@ int crypto_register_alg(struct crypto_alg *alg) if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; - if (alg->cra_alignmask > PAGE_SIZE) + if (alg->cra_alignmask & alg->cra_blocksize) + return -EINVAL; + + if (alg->cra_blocksize > PAGE_SIZE) return -EINVAL; down_write(&crypto_alg_sem); From 0169e284f6b6b263cc7c2ed25986b96cd6fda610 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sat, 29 Oct 2005 21:25:10 -0400 Subject: [PATCH 05/98] [libata] remove ata_chk_err(), ->check_err() hook. We now depend on ->tf_read() to provide us with the contents of the Error shadow register. 
--- drivers/scsi/ahci.c | 9 --------- drivers/scsi/libata-core.c | 41 +++++++++----------------------------- drivers/scsi/sata_mv.c | 18 ----------------- drivers/scsi/sata_sil24.c | 8 -------- include/linux/libata.h | 2 -- 5 files changed, 9 insertions(+), 69 deletions(-) diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index fe8187d6f58b..03829aedfd39 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -192,7 +192,6 @@ static void ahci_port_stop(struct ata_port *ap); static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf); static void ahci_qc_prep(struct ata_queued_cmd *qc); static u8 ahci_check_status(struct ata_port *ap); -static u8 ahci_check_err(struct ata_port *ap); static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc); static void ahci_remove_one (struct pci_dev *pdev); @@ -221,7 +220,6 @@ static const struct ata_port_operations ahci_ops = { .check_status = ahci_check_status, .check_altstatus = ahci_check_status, - .check_err = ahci_check_err, .dev_select = ata_noop_dev_select, .tf_read = ahci_tf_read, @@ -458,13 +456,6 @@ static u8 ahci_check_status(struct ata_port *ap) return readl(mmio + PORT_TFDATA) & 0xFF; } -static u8 ahci_check_err(struct ata_port *ap) -{ - void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr; - - return (readl(mmio + PORT_TFDATA) >> 8) & 0xFF; -} - static void ahci_tf_read(struct ata_port *ap, struct ata_taskfile *tf) { struct ahci_port_priv *pp = ap->private_data; diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index b1b1c6f01419..d2f71a2331bb 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -371,7 +371,7 @@ static void ata_tf_read_pio(struct ata_port *ap, struct ata_taskfile *tf) struct ata_ioports *ioaddr = &ap->ioaddr; tf->command = ata_check_status(ap); - tf->feature = ata_chk_err(ap); + tf->feature = inb(ioaddr->error_addr); tf->nsect = inb(ioaddr->nsect_addr); tf->lbal = inb(ioaddr->lbal_addr); tf->lbam = 
inb(ioaddr->lbam_addr); @@ -405,7 +405,7 @@ static void ata_tf_read_mmio(struct ata_port *ap, struct ata_taskfile *tf) struct ata_ioports *ioaddr = &ap->ioaddr; tf->command = ata_check_status(ap); - tf->feature = ata_chk_err(ap); + tf->feature = readb((void __iomem *)ioaddr->error_addr); tf->nsect = readb((void __iomem *)ioaddr->nsect_addr); tf->lbal = readb((void __iomem *)ioaddr->lbal_addr); tf->lbam = readb((void __iomem *)ioaddr->lbam_addr); @@ -525,30 +525,6 @@ u8 ata_altstatus(struct ata_port *ap) } -/** - * ata_chk_err - Read device error reg - * @ap: port where the device is - * - * Reads ATA taskfile error register for - * currently-selected device and return its value. - * - * Note: may NOT be used as the check_err() entry in - * ata_port_operations. - * - * LOCKING: - * Inherited from caller. - */ -u8 ata_chk_err(struct ata_port *ap) -{ - if (ap->ops->check_err) - return ap->ops->check_err(ap); - - if (ap->flags & ATA_FLAG_MMIO) { - return readb((void __iomem *) ap->ioaddr.error_addr); - } - return inb(ap->ioaddr.error_addr); -} - /** * ata_tf_to_fis - Convert ATA taskfile to SATA FIS structure * @tf: Taskfile to convert @@ -901,8 +877,8 @@ static u8 ata_dev_try_classify(struct ata_port *ap, unsigned int device) memset(&tf, 0, sizeof(tf)); - err = ata_chk_err(ap); ap->ops->tf_read(ap, &tf); + err = tf.feature; dev->class = ATA_DEV_NONE; @@ -1139,7 +1115,6 @@ static void ata_dev_identify(struct ata_port *ap, unsigned int device) unsigned int major_version; u16 tmp; unsigned long xfer_modes; - u8 status; unsigned int using_edd; DECLARE_COMPLETION(wait); struct ata_queued_cmd *qc; @@ -1193,8 +1168,11 @@ retry: else wait_for_completion(&wait); - status = ata_chk_status(ap); - if (status & ATA_ERR) { + spin_lock_irqsave(&ap->host_set->lock, flags); + ap->ops->tf_read(ap, &qc->tf); + spin_unlock_irqrestore(&ap->host_set->lock, flags); + + if (qc->tf.command & ATA_ERR) { /* * arg! 
EDD works for all test cases, but seems to return * the ATA signature for some ATAPI devices. Until the @@ -1207,7 +1185,7 @@ retry: * to have this problem. */ if ((using_edd) && (qc->tf.command == ATA_CMD_ID_ATA)) { - u8 err = ata_chk_err(ap); + u8 err = qc->tf.feature; if (err & ATA_ABORTED) { dev->class = ATA_DEV_ATAPI; qc->cursg = 0; @@ -4873,7 +4851,6 @@ EXPORT_SYMBOL_GPL(ata_tf_to_fis); EXPORT_SYMBOL_GPL(ata_tf_from_fis); EXPORT_SYMBOL_GPL(ata_check_status); EXPORT_SYMBOL_GPL(ata_altstatus); -EXPORT_SYMBOL_GPL(ata_chk_err); EXPORT_SYMBOL_GPL(ata_exec_command); EXPORT_SYMBOL_GPL(ata_port_start); EXPORT_SYMBOL_GPL(ata_port_stop); diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c index 422e0b6f603a..dcef5fe8600b 100644 --- a/drivers/scsi/sata_mv.c +++ b/drivers/scsi/sata_mv.c @@ -258,7 +258,6 @@ struct mv_host_priv { static void mv_irq_clear(struct ata_port *ap); static u32 mv_scr_read(struct ata_port *ap, unsigned int sc_reg_in); static void mv_scr_write(struct ata_port *ap, unsigned int sc_reg_in, u32 val); -static u8 mv_check_err(struct ata_port *ap); static void mv_phy_reset(struct ata_port *ap); static void mv_host_stop(struct ata_host_set *host_set); static int mv_port_start(struct ata_port *ap); @@ -296,7 +295,6 @@ static const struct ata_port_operations mv_ops = { .tf_load = ata_tf_load, .tf_read = ata_tf_read, .check_status = ata_check_status, - .check_err = mv_check_err, .exec_command = ata_exec_command, .dev_select = ata_std_dev_select, @@ -1184,22 +1182,6 @@ static irqreturn_t mv_interrupt(int irq, void *dev_instance, return IRQ_RETVAL(handled); } -/** - * mv_check_err - Return the error shadow register to caller. - * @ap: ATA channel to manipulate - * - * Marvell requires DMA to be stopped before accessing shadow - * registers. So we do that, then return the needed register. - * - * LOCKING: - * Inherited from caller. FIXME: protect mv_stop_dma with lock? 
- */ -static u8 mv_check_err(struct ata_port *ap) -{ - mv_stop_dma(ap); /* can't read shadow regs if DMA on */ - return readb((void __iomem *) ap->ioaddr.error_addr); -} - /** * mv_phy_reset - Perform eDMA reset followed by COMRESET * @ap: ATA channel to manipulate diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c index 51855d3bac64..e18a1e2bb65e 100644 --- a/drivers/scsi/sata_sil24.c +++ b/drivers/scsi/sata_sil24.c @@ -225,7 +225,6 @@ struct sil24_host_priv { }; static u8 sil24_check_status(struct ata_port *ap); -static u8 sil24_check_err(struct ata_port *ap); static u32 sil24_scr_read(struct ata_port *ap, unsigned sc_reg); static void sil24_scr_write(struct ata_port *ap, unsigned sc_reg, u32 val); static void sil24_tf_read(struct ata_port *ap, struct ata_taskfile *tf); @@ -280,7 +279,6 @@ static const struct ata_port_operations sil24_ops = { .check_status = sil24_check_status, .check_altstatus = sil24_check_status, - .check_err = sil24_check_err, .dev_select = ata_noop_dev_select, .tf_read = sil24_tf_read, @@ -363,12 +361,6 @@ static u8 sil24_check_status(struct ata_port *ap) return pp->tf.command; } -static u8 sil24_check_err(struct ata_port *ap) -{ - struct sil24_port_priv *pp = ap->private_data; - return pp->tf.feature; -} - static int sil24_scr_map[] = { [SCR_CONTROL] = 0, [SCR_STATUS] = 1, diff --git a/include/linux/libata.h b/include/linux/libata.h index 00a8a5738858..a4cce9936a80 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -347,7 +347,6 @@ struct ata_port_operations { void (*exec_command)(struct ata_port *ap, const struct ata_taskfile *tf); u8 (*check_status)(struct ata_port *ap); u8 (*check_altstatus)(struct ata_port *ap); - u8 (*check_err)(struct ata_port *ap); void (*dev_select)(struct ata_port *ap, unsigned int device); void (*phy_reset) (struct ata_port *ap); @@ -434,7 +433,6 @@ extern void ata_noop_dev_select (struct ata_port *ap, unsigned int device); extern void ata_std_dev_select (struct ata_port *ap, 
unsigned int device); extern u8 ata_check_status(struct ata_port *ap); extern u8 ata_altstatus(struct ata_port *ap); -extern u8 ata_chk_err(struct ata_port *ap); extern void ata_exec_command(struct ata_port *ap, const struct ata_taskfile *tf); extern int ata_port_start (struct ata_port *ap); extern void ata_port_stop (struct ata_port *ap); From 930fc45a49ddebe7555cc5c837d82b9c27e65ff4 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:15:41 -0700 Subject: [PATCH 06/98] [PATCH] vmalloc_node This patch adds vmalloc_node(size, node) -> Allocate necessary memory on the specified node and get_vm_area_node(size, flags, node) and the other functions that it depends on. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 8 ++++- mm/vmalloc.c | 73 ++++++++++++++++++++++++++++++++--------- 2 files changed, 64 insertions(+), 17 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3701a0673d2c..1d5577b2b752 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -32,10 +32,14 @@ struct vm_struct { * Highlevel APIs for driver use */ extern void *vmalloc(unsigned long size); +extern void *vmalloc_node(unsigned long size, int node); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot); +extern void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, + pgprot_t prot, int node); extern void vfree(void *addr); extern void *vmap(struct page **pages, unsigned int count, @@ -48,6 +52,8 @@ extern void vunmap(void *addr); extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags); extern struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, 
unsigned long end); +extern struct vm_struct *get_vm_area_node(unsigned long size, + unsigned long flags, int node); extern struct vm_struct *remove_vm_area(void *addr); extern struct vm_struct *__remove_vm_area(void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1150229b6366..5e9120598799 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5,6 +5,7 @@ * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002 + * Numa awareness, Christoph Lameter, SGI, June 2005 */ #include @@ -158,8 +159,8 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) return err; } -struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, - unsigned long start, unsigned long end) +struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end, int node) { struct vm_struct **p, *tmp, *area; unsigned long align = 1; @@ -178,7 +179,7 @@ struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, addr = ALIGN(start, align); size = PAGE_ALIGN(size); - area = kmalloc(sizeof(*area), GFP_KERNEL); + area = kmalloc_node(sizeof(*area), GFP_KERNEL, node); if (unlikely(!area)) return NULL; @@ -231,6 +232,12 @@ out: return NULL; } +struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, + unsigned long start, unsigned long end) +{ + return __get_vm_area_node(size, flags, start, end, -1); +} + /** * get_vm_area - reserve a contingous kernel virtual area * @@ -246,6 +253,11 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); } +struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node) +{ + return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 
node); +} + /* Caller must hold vmlist_lock */ struct vm_struct *__remove_vm_area(void *addr) { @@ -342,7 +354,6 @@ void vfree(void *addr) BUG_ON(in_interrupt()); __vunmap(addr, 1); } - EXPORT_SYMBOL(vfree); /** @@ -360,7 +371,6 @@ void vunmap(void *addr) BUG_ON(in_interrupt()); __vunmap(addr, 0); } - EXPORT_SYMBOL(vunmap); /** @@ -392,10 +402,10 @@ void *vmap(struct page **pages, unsigned int count, return area->addr; } - EXPORT_SYMBOL(vmap); -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + pgprot_t prot, int node) { struct page **pages; unsigned int nr_pages, array_size, i; @@ -406,9 +416,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) - pages = __vmalloc(array_size, gfp_mask, PAGE_KERNEL); + pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); else - pages = kmalloc(array_size, (gfp_mask & ~__GFP_HIGHMEM)); + pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); area->pages = pages; if (!area->pages) { remove_vm_area(area->addr); @@ -418,7 +428,10 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) memset(area->pages, 0, array_size); for (i = 0; i < area->nr_pages; i++) { - area->pages[i] = alloc_page(gfp_mask); + if (node < 0) + area->pages[i] = alloc_page(gfp_mask); + else + area->pages[i] = alloc_pages_node(node, gfp_mask, 0); if (unlikely(!area->pages[i])) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; @@ -435,18 +448,25 @@ fail: return NULL; } +void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) +{ + return __vmalloc_area_node(area, gfp_mask, prot, -1); +} + /** - * __vmalloc - allocate virtually contiguous memory + * __vmalloc_node - allocate virtually contiguous memory * * @size: allocation size * 
@gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node) { struct vm_struct *area; @@ -454,13 +474,18 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) if (!size || (size >> PAGE_SHIFT) > num_physpages) return NULL; - area = get_vm_area(size, VM_ALLOC); + area = get_vm_area_node(size, VM_ALLOC, node); if (!area) return NULL; - return __vmalloc_area(area, gfp_mask, prot); + return __vmalloc_area_node(area, gfp_mask, prot, node); } +EXPORT_SYMBOL(__vmalloc_node); +void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +{ + return __vmalloc_node(size, gfp_mask, prot, -1); +} EXPORT_SYMBOL(__vmalloc); /** @@ -478,9 +503,26 @@ void *vmalloc(unsigned long size) { return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } - EXPORT_SYMBOL(vmalloc); +/** + * vmalloc_node - allocate memory on a specific node + * + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead.
+ */ +void *vmalloc_node(unsigned long size, int node) +{ + return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); +} +EXPORT_SYMBOL(vmalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -515,7 +557,6 @@ void *vmalloc_32(unsigned long size) { return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); } - EXPORT_SYMBOL(vmalloc_32); long vread(char *buf, char *addr, unsigned long count) From 4b8f573b5db02a3017afbba49026a6aef480174f Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 29 Oct 2005 18:15:42 -0700 Subject: [PATCH 07/98] [PATCH] TIMERS: add missing compensation for HZ == 250 Add missing compensation for (HZ == 250) != (1 << SHIFT_HZ) in second_overflow(). Signed-off-by: YOSHIFUJI Hideaki Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..6a2e5f8dc725 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -752,6 +752,15 @@ static void second_overflow(void) else time_adj += (time_adj >> 2) + (time_adj >> 5); #endif +#if HZ == 250 + /* Compensate for (HZ==250) != (1 << SHIFT_HZ). + * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14) + */ + if (time_adj < 0) + time_adj -= (-time_adj >> 6) + (-time_adj >> 7); + else + time_adj += (time_adj >> 6) + (time_adj >> 7); +#endif #if HZ == 1000 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) From 63f324cf0792ed69089b79d6921ba3aaea97af50 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Sat, 29 Oct 2005 18:15:43 -0700 Subject: [PATCH 08/98] [PATCH] fix alpha breakage barrier.h uses barrier() in non-SMP case. And doesn't include compiler.h. 
Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/barrier.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-alpha/barrier.h b/include/asm-alpha/barrier.h index 229c83fe77cb..681ff581afa5 100644 --- a/include/asm-alpha/barrier.h +++ b/include/asm-alpha/barrier.h @@ -1,6 +1,8 @@ #ifndef __BARRIER_H #define __BARRIER_H +#include + #define mb() \ __asm__ __volatile__("mb": : :"memory") From eb92f4ef320b738e41ad43476a5d05c8a20d5cc7 Mon Sep 17 00:00:00 2001 From: Rik Van Riel Date: Sat, 29 Oct 2005 18:15:44 -0700 Subject: [PATCH 09/98] [PATCH] add sem_is_read/write_locked() Add sem_is_read/write_locked functions to the read/write semaphores, along the same lines of the *_is_locked spinlock functions. The swap token tuning patch uses sem_is_read_locked; sem_is_write_locked is added for completeness. Signed-off-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/rwsem.h | 5 +++++ include/asm-i386/rwsem.h | 5 +++++ include/asm-ia64/rwsem.h | 5 +++++ include/asm-ppc/rwsem.h | 5 +++++ include/asm-ppc64/rwsem.h | 5 +++++ include/asm-s390/rwsem.h | 5 +++++ include/asm-sh/rwsem.h | 5 +++++ include/asm-sparc64/rwsem.h | 5 +++++ include/asm-x86_64/rwsem.h | 5 +++++ include/linux/rwsem-spinlock.h | 5 +++++ 10 files changed, 50 insertions(+) diff --git a/include/asm-alpha/rwsem.h b/include/asm-alpha/rwsem.h index 8e058a67c9a4..fafdd4f7010a 100644 --- a/include/asm-alpha/rwsem.h +++ b/include/asm-alpha/rwsem.h @@ -262,5 +262,10 @@ static inline long rwsem_atomic_update(long val, struct rw_semaphore *sem) #endif } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _ALPHA_RWSEM_H */ diff --git a/include/asm-i386/rwsem.h b/include/asm-i386/rwsem.h index 7625a675852f..be4ab859238e 100644 --- a/include/asm-i386/rwsem.h +++ b/include/asm-i386/rwsem.h @@ -284,5 +284,10 @@ LOCK_PREFIX "xadd %0,(%2)" return 
tmp+delta; } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _I386_RWSEM_H */ diff --git a/include/asm-ia64/rwsem.h b/include/asm-ia64/rwsem.h index e18b5ab0cb75..1327c91ea39c 100644 --- a/include/asm-ia64/rwsem.h +++ b/include/asm-ia64/rwsem.h @@ -186,4 +186,9 @@ __downgrade_write (struct rw_semaphore *sem) #define rwsem_atomic_add(delta, sem) atomic64_add(delta, (atomic64_t *)(&(sem)->count)) #define rwsem_atomic_update(delta, sem) atomic64_add_return(delta, (atomic64_t *)(&(sem)->count)) +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* _ASM_IA64_RWSEM_H */ diff --git a/include/asm-ppc/rwsem.h b/include/asm-ppc/rwsem.h index 3e738f483c11..3501ea72f88c 100644 --- a/include/asm-ppc/rwsem.h +++ b/include/asm-ppc/rwsem.h @@ -168,5 +168,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _PPC_RWSEM_XADD_H */ diff --git a/include/asm-ppc64/rwsem.h b/include/asm-ppc64/rwsem.h index bd5c2f093575..7a647fae3765 100644 --- a/include/asm-ppc64/rwsem.h +++ b/include/asm-ppc64/rwsem.h @@ -163,5 +163,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _PPC_RWSEM_XADD_H */ diff --git a/include/asm-s390/rwsem.h b/include/asm-s390/rwsem.h index 8c0cebbfc034..0422a085dd56 100644 --- a/include/asm-s390/rwsem.h +++ b/include/asm-s390/rwsem.h @@ -351,5 +351,10 @@ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) return new; } +static inline int rwsem_is_locked(struct rw_semaphore 
*sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _S390_RWSEM_H */ diff --git a/include/asm-sh/rwsem.h b/include/asm-sh/rwsem.h index 1be4337f5259..0262d3d1e5e0 100644 --- a/include/asm-sh/rwsem.h +++ b/include/asm-sh/rwsem.h @@ -166,5 +166,10 @@ static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) return atomic_add_return(delta, (atomic_t *)(&sem->count)); } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _ASM_SH_RWSEM_H */ diff --git a/include/asm-sparc64/rwsem.h b/include/asm-sparc64/rwsem.h index 4568ee4022df..cef5e8270421 100644 --- a/include/asm-sparc64/rwsem.h +++ b/include/asm-sparc64/rwsem.h @@ -56,6 +56,11 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) atomic_add(delta, (atomic_t *)(&sem->count)); } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _SPARC64_RWSEM_H */ diff --git a/include/asm-x86_64/rwsem.h b/include/asm-x86_64/rwsem.h index c002175b6e82..46077e9c1910 100644 --- a/include/asm-x86_64/rwsem.h +++ b/include/asm-x86_64/rwsem.h @@ -274,5 +274,10 @@ LOCK_PREFIX "xaddl %0,(%2)" return tmp+delta; } +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->count != 0); +} + #endif /* __KERNEL__ */ #endif /* _X8664_RWSEM_H */ diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h index b52a2af25f1f..f30f805080ae 100644 --- a/include/linux/rwsem-spinlock.h +++ b/include/linux/rwsem-spinlock.h @@ -61,5 +61,10 @@ extern void FASTCALL(__up_read(struct rw_semaphore *sem)); extern void FASTCALL(__up_write(struct rw_semaphore *sem)); extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem)); +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return (sem->activity != 0); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_RWSEM_SPINLOCK_H */ From 
fcdae29aa7a5c79f245110f6680afdc1858d3626 Mon Sep 17 00:00:00 2001 From: Rik Van Riel Date: Sat, 29 Oct 2005 18:15:46 -0700 Subject: [PATCH 10/98] [PATCH] swaptoken tuning It turns out that the original swap token implementation, by Song Jiang, only enforced the swap token while the task holding the token is handling a page fault. This patch approximates that, without adding an additional flag to the mm_struct, by checking whether the mm->mmap_sem is held for reading, like the page fault code does. This patch has the effect of automatically, and gradually, disabling the enforcement of the swap token when there is little or no paging going on, and "turning up" the intensity of the swap token code the more the task holding the token is thrashing. Thanks to Song Jiang for pointing out this aspect of the token based thrashing control concept. The new code shows a slight degradation over the old swap token code, but still a big win over running without the swap token. 2.6.12+ swap token disabled $ for i in `seq 10` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done 101.74user 23.13system 8:26.91elapsed 24%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (38597major+430315minor)pagefaults 0swaps 101.98user 24.91system 8:03.06elapsed 26%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (33939major+430457minor)pagefaults 0swaps 101.93user 22.12system 7:34.90elapsed 27%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (33166major+421267minor)pagefaults 0swaps 101.82user 22.38system 8:31.40elapsed 24%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (39338major+433262minor)pagefaults 0swaps 2.6.12+ swap token enabled, timeout 300 seconds $ for i in `seq 4` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done 102.58user 16.08system 3:41.44elapsed 53%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (19707major+285786minor)pagefaults 0swaps 102.07user 19.56system 4:00.64elapsed 50%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs 
(19012major+299259minor)pagefaults 0swaps 102.64user 18.25system 4:07.31elapsed 48%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (21990major+304831minor)pagefaults 0swaps 101.39user 19.41system 5:15.81elapsed 38%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (24850major+323321minor)pagefaults 0swaps 2.6.12+ with new swap token code, timeout 300 seconds $ for i in `seq 4` ; do /usr/bin/time ./qsbench -n 30000000 -p 3 ; done 101.87user 24.66system 5:53.20elapsed 35%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (26848major+363497minor)pagefaults 0swaps 102.83user 19.95system 4:17.25elapsed 47%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (19946major+305722minor)pagefaults 0swaps 102.09user 19.46system 5:12.57elapsed 38%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (25461major+334994minor)pagefaults 0swaps 101.67user 20.61system 4:52.97elapsed 41%CPU (0avgtext+0avgdata 0maxresident)k 0inputs+0outputs (22190major+329508minor)pagefaults 0swaps Signed-off-by: Rik Van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 6 +++++- mm/thrash.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 450f5241b5a5..1fc559e09ca8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -298,7 +298,11 @@ static int page_referenced_one(struct page *page, if (ptep_clear_flush_young(vma, address, pte)) referenced++; - if (mm != current->mm && !ignore_token && has_swap_token(mm)) + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. 
*/ + if (mm != current->mm && !ignore_token && + has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) referenced++; (*mapcount)--; diff --git a/mm/thrash.c b/mm/thrash.c index 11461f7ad830..eff3c18c33a1 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -19,7 +19,7 @@ static unsigned long swap_token_check; struct mm_struct * swap_token_mm = &init_mm; #define SWAP_TOKEN_CHECK_INTERVAL (HZ * 2) -#define SWAP_TOKEN_TIMEOUT 0 +#define SWAP_TOKEN_TIMEOUT (300 * HZ) /* * Currently disabled; Needs further code to work at HZ * 300. */ From ba56e91c940146e99ac694c4c7cd7f2b4aaa565d Mon Sep 17 00:00:00 2001 From: "Seth, Rohit" Date: Sat, 29 Oct 2005 18:15:47 -0700 Subject: [PATCH 11/98] [PATCH] mm: page_alloc: increase size of per-cpu-pages Increase the page allocator's per-cpu magazines from 1/4MB to 1/2MB. Over 100+ runs for a workload, the difference in mean is about 2%. The best results for both are almost same. Though the max variation in results with 1/2MB is only 2.2%, whereas with 1/4MB it is 12%. Signed-off-by: Rohit Seth Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94c864eac9c4..f799217dc2f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1721,29 +1721,29 @@ static int __devinit zone_batchsize(struct zone *zone) /* * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. + * size of the zone. But no more than 1/2 of a meg. * * OK, so we don't know how big the cache is. So guess. */ batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; + if (batch * PAGE_SIZE > 512 * 1024) + batch = (512 * 1024) / PAGE_SIZE; batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; /* - * Clamp the batch to a 2^n - 1 value. 
Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. + * We will be trying to allocate bigger chunks of contiguous + * memory of the order of fls(batch). This should result in + * better cache coloring. * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. + * A sanity check also to ensure that batch is still in limits. */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = (1 << fls(batch + batch/2)); + + if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) + batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); + return batch; } From e46a5e28c201f703c18b47b108bfddec44f897c4 Mon Sep 17 00:00:00 2001 From: "Seth, Rohit" Date: Sat, 29 Oct 2005 18:15:48 -0700 Subject: [PATCH 12/98] [PATCH] mm: set per-cpu-pages lower threshold to zero Set the low water mark for hot pages in pcp to zero. (akpm: for the life of me I cannot remember why we created pcp->low. Neither can Martin and the changelog is silent. Maybe it was just a brainfart, but I have this feeling that there was a reason. If not, we should remove the fields completely. We'll see.)
Signed-off-by: Rohit Seth Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f799217dc2f3..60663232fbb2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1755,7 +1755,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 2 * batch; + pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); @@ -1764,7 +1764,7 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->low = 0; pcp->high = 2 * batch; - pcp->batch = max(1UL, 1 * batch); + pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); } From dfcd3c0dc426bb75770c34b40e14f2da8845ea62 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Oct 2005 18:15:48 -0700 Subject: [PATCH 13/98] [PATCH] Convert mempolicies to nodemask_t The NUMA policy code predated nodemask_t so it used open coded bitmaps. Convert everything to nodemask_t. 
Big patch, but shouldn't have any actual behaviour changes (except I removed one unnecessary check against node_online_map and one unnecessary BUG_ON) Signed-off-by: "Andi Kleen" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 2 +- include/linux/mempolicy.h | 4 +- mm/mempolicy.c | 120 +++++++++++++++++--------------------- 3 files changed, 56 insertions(+), 70 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c7ef3e48e35b..994612bc72d0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -469,7 +469,7 @@ static int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " interleave={"); first = 1; for_each_node(n) { - if (test_bit(n, pol->v.nodes)) { + if (node_isset(n, pol->v.nodes)) { if (!first) seq_putc(m,','); else diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 58385ee1c0ac..38e60a099399 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -27,10 +27,10 @@ #include #include -#include #include #include #include +#include struct vm_area_struct; @@ -63,7 +63,7 @@ struct mempolicy { union { struct zonelist *zonelist; /* bind */ short preferred_node; /* preferred */ - DECLARE_BITMAP(nodes, MAX_NUMNODES); /* interleave */ + nodemask_t nodes; /* interleave */ /* undefined for default */ } v; }; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1d5c64df1653..8bc0be1c9efd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,23 +93,10 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; -/* Check if all specified nodes are online */ -static int nodes_online(unsigned long *nodes) -{ - DECLARE_BITMAP(online2, MAX_NUMNODES); - - bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES); - if (bitmap_empty(online2, MAX_NUMNODES)) - set_bit(0, online2); - if (!bitmap_subset(nodes, online2, MAX_NUMNODES)) - return -EINVAL; - return 0; -} - /* Do sanity checking on a policy */ -static int mpol_check_policy(int mode, unsigned long *nodes) 
+static int mpol_check_policy(int mode, nodemask_t *nodes) { - int empty = bitmap_empty(nodes, MAX_NUMNODES); + int empty = nodes_empty(*nodes); switch (mode) { case MPOL_DEFAULT: @@ -124,11 +111,11 @@ static int mpol_check_policy(int mode, unsigned long *nodes) return -EINVAL; break; } - return nodes_online(nodes); + return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } /* Copy a node mask from user space. */ -static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, unsigned long maxnode, int mode) { unsigned long k; @@ -136,7 +123,7 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, unsigned long endmask; --maxnode; - bitmap_zero(nodes, MAX_NUMNODES); + nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; @@ -153,7 +140,7 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, return -EINVAL; for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { unsigned long t; - if (get_user(t, nmask + k)) + if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { if (t & endmask) @@ -165,30 +152,29 @@ static int get_nodes(unsigned long *nodes, unsigned long __user *nmask, endmask = ~0UL; } - if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long))) + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; - nodes[nlongs-1] &= endmask; + nodes_addr(*nodes)[nlongs-1] &= endmask; /* Update current mems_allowed */ cpuset_update_current_mems_allowed(); /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes); + /* AK: shouldn't this error out instead? */ + cpuset_restrict_to_mems_allowed(nodes_addr(*nodes)); return mpol_check_policy(mode, nodes); } /* Generate a custom zonelist for the BIND policy. 
*/ -static struct zonelist *bind_zonelist(unsigned long *nodes) +static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; int num, max, nd; - max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES); + max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for (nd = find_first_bit(nodes, MAX_NUMNODES); - nd < MAX_NUMNODES; - nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) { + for_each_node_mask(nd, *nodes) { int k; for (k = MAX_NR_ZONES-1; k >= 0; k--) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -205,11 +191,11 @@ static struct zonelist *bind_zonelist(unsigned long *nodes) } /* Create a new policy */ -static struct mempolicy *mpol_new(int mode, unsigned long *nodes) +static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]); + PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -218,10 +204,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) atomic_set(&policy->refcnt, 1); switch (mode) { case MPOL_INTERLEAVE: - bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES); + policy->v.nodes = *nodes; break; case MPOL_PREFERRED: - policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES); + policy->v.preferred_node = first_node(*nodes); if (policy->v.preferred_node >= MAX_NUMNODES) policy->v.preferred_node = -1; break; @@ -239,7 +225,7 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) /* Ensure all existing pages follow the policy. 
*/ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, unsigned long *nodes) + unsigned long addr, unsigned long end, nodemask_t *nodes) { pte_t *orig_pte; pte_t *pte; @@ -256,7 +242,7 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pfn_valid(pfn)) continue; nid = pfn_to_nid(pfn); - if (!test_bit(nid, nodes)) + if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(orig_pte); @@ -265,7 +251,7 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, } static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, unsigned long *nodes) + unsigned long addr, unsigned long end, nodemask_t *nodes) { pmd_t *pmd; unsigned long next; @@ -282,7 +268,7 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, } static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, unsigned long *nodes) + unsigned long addr, unsigned long end, nodemask_t *nodes) { pud_t *pud; unsigned long next; @@ -299,7 +285,7 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, } static inline int check_pgd_range(struct mm_struct *mm, - unsigned long addr, unsigned long end, unsigned long *nodes) + unsigned long addr, unsigned long end, nodemask_t *nodes) { pgd_t *pgd; unsigned long next; @@ -318,7 +304,7 @@ static inline int check_pgd_range(struct mm_struct *mm, /* Step 1: check the range */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - unsigned long *nodes, unsigned long flags) + nodemask_t *nodes, unsigned long flags) { int err; struct vm_area_struct *first, *vma, *prev; @@ -403,7 +389,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; - DECLARE_BITMAP(nodes, MAX_NUMNODES); + nodemask_t nodes; int 
err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -419,19 +405,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, if (end == start) return 0; - err = get_nodes(nodes, nmask, maxnode, mode); + err = get_nodes(&nodes, nmask, maxnode, mode); if (err) return err; - new = mpol_new(mode, nodes); + new = mpol_new(mode, &nodes); if (IS_ERR(new)) return PTR_ERR(new); PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes[0]); + mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nodes, flags); + vma = check_range(mm, start, end, &nodes, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -446,45 +432,45 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, { int err; struct mempolicy *new; - DECLARE_BITMAP(nodes, MAX_NUMNODES); + nodemask_t nodes; if (mode < 0 || mode > MPOL_MAX) return -EINVAL; - err = get_nodes(nodes, nmask, maxnode, mode); + err = get_nodes(&nodes, nmask, maxnode, mode); if (err) return err; - new = mpol_new(mode, nodes); + new = mpol_new(mode, &nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; if (new && new->policy == MPOL_INTERLEAVE) - current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + current->il_next = first_node(new->v.nodes); return 0; } /* Fill a zone bitmap for a policy */ -static void get_zonemask(struct mempolicy *p, unsigned long *nodes) +static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) { int i; - bitmap_zero(nodes, MAX_NUMNODES); + nodes_clear(*nodes); switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); break; case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES); + *nodes = p->v.nodes; break; case 
MPOL_PREFERRED: /* or use current node instead of online map? */ if (p->v.preferred_node < 0) - bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES); + *nodes = node_online_map; else - __set_bit(p->v.preferred_node, nodes); + node_set(p->v.preferred_node, *nodes); break; default: BUG(); @@ -506,9 +492,10 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - void *nodes, unsigned nbytes) + nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -517,7 +504,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return -EFAULT; copy = nbytes; } - return copy_to_user(mask, nodes, copy) ? -EFAULT : 0; + return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } /* Retrieve NUMA policy */ @@ -578,9 +565,9 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = 0; if (nmask) { - DECLARE_BITMAP(nodes, MAX_NUMNODES); - get_zonemask(pol, nodes); - err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes)); + nodemask_t nodes; + get_zonemask(pol, &nodes); + err = copy_nodes_to_user(nmask, maxnode, &nodes); } out: @@ -649,15 +636,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, long err = 0; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; - DECLARE_BITMAP(bm, MAX_NUMNODES); + nodemask_t bm; nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { - err = compat_get_bitmap(bm, nmask, nr_bits); + err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits); nm = compat_alloc_user_space(alloc_size); - err |= copy_to_user(nm, bm, alloc_size); + err |= copy_to_user(nm, nodes_addr(bm), alloc_size); } if (err) @@ -723,9 +710,9 @@ static unsigned 
interleave_nodes(struct mempolicy *policy) nid = me->il_next; BUG_ON(nid >= MAX_NUMNODES); - next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid); + next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) - next = find_first_bit(policy->v.nodes, MAX_NUMNODES); + next = first_node(policy->v.nodes); me->il_next = next; return nid; } @@ -734,18 +721,17 @@ static unsigned interleave_nodes(struct mempolicy *policy) static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) { - unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES); + unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target = (unsigned)off % nnodes; int c; int nid = -1; c = 0; do { - nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1); + nid = next_node(nid, pol->v.nodes); c++; } while (c <= target); BUG_ON(nid >= MAX_NUMNODES); - BUG_ON(!test_bit(nid, pol->v.nodes)); return nid; } @@ -878,7 +864,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_DEFAULT: return 1; case MPOL_INTERLEAVE: - return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES); + return nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; case MPOL_BIND: { @@ -1117,7 +1103,7 @@ int mpol_set_shared_policy(struct shared_policy *info, PDprintk("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? npol->v.nodes[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); From 662f3a0b94cc92bd708c27b80f8207cd7db93204 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Oct 2005 18:15:49 -0700 Subject: [PATCH 14/98] [PATCH] Remove near all BUGs in mm/mempolicy.c Most of them can never be triggered and were only for development. 
Signed-off-by: "Andi Kleen" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8bc0be1c9efd..43b1199af591 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -185,7 +185,6 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) policy_zone = k; } } - BUG_ON(num >= max); zl->zones[num] = NULL; return zl; } @@ -709,7 +708,6 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - BUG_ON(nid >= MAX_NUMNODES); next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) next = first_node(policy->v.nodes); @@ -731,18 +729,17 @@ static unsigned offset_il_node(struct mempolicy *pol, nid = next_node(nid, pol->v.nodes); c++; } while (c <= target); - BUG_ON(nid >= MAX_NUMNODES); return nid; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) +static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + unsigned nid) { struct zonelist *zl; struct page *page; - BUG_ON(!node_online(nid)); zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); if (page && page_zone(page) == zl->zones[0]) { @@ -785,8 +782,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) unsigned nid; if (vma) { unsigned long off; - BUG_ON(addr >= vma->vm_end); - BUG_ON(addr < vma->vm_start); off = vma->vm_pgoff; off += (addr - vma->vm_start) >> PAGE_SHIFT; nid = offset_il_node(pol, vma, off); From b57b98d147ef98758886a39efb94f3254542c39b Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sat, 29 Oct 2005 18:15:50 -0700 Subject: [PATCH 15/98] [PATCH] mm/msync.c cleanup This is not problem actually, but sync_page_range() is using for exported function to filesystems. 
The msync_xxx is more readable at least to me. Signed-off-by: OGAWA Hirofumi Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/msync.c b/mm/msync.c index d0f5a1bce7cb..9cab3f2d5863 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -22,7 +22,7 @@ * threads/the swapper from ripping pte's out from under us. */ -static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; @@ -50,7 +50,7 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_unmap(pte - 1); } -static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; @@ -61,11 +61,11 @@ static inline void sync_pmd_range(struct vm_area_struct *vma, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - sync_pte_range(vma, pmd, addr, next); + msync_pte_range(vma, pmd, addr, next); } while (pmd++, addr = next, addr != end); } -static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; @@ -76,11 +76,11 @@ static inline void sync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - sync_pmd_range(vma, pud, addr, next); + msync_pmd_range(vma, pud, addr, next); } while (pud++, addr = next, addr != end); } -static void sync_page_range(struct vm_area_struct *vma, +static void msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -101,14 +101,14 @@ static void sync_page_range(struct 
vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - sync_pud_range(vma, pgd, addr, next); + msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); spin_unlock(&mm->page_table_lock); } #ifdef CONFIG_PREEMPT -static inline void filemap_sync(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static inline void filemap_msync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { const size_t chunk = 64 * 1024; /* bytes */ unsigned long next; @@ -117,15 +117,15 @@ static inline void filemap_sync(struct vm_area_struct *vma, next = addr + chunk; if (next > end || next < addr) next = end; - sync_page_range(vma, addr, next); + msync_page_range(vma, addr, next); cond_resched(); } while (addr = next, addr != end); } #else -static inline void filemap_sync(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +static inline void filemap_msync(struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { - sync_page_range(vma, addr, end); + msync_page_range(vma, addr, end); } #endif @@ -150,7 +150,7 @@ static int msync_interval(struct vm_area_struct *vma, return -EBUSY; if (file && (vma->vm_flags & VM_SHARED)) { - filemap_sync(vma, addr, end); + filemap_msync(vma, addr, end); if (flags & MS_SYNC) { struct address_space *mapping = file->f_mapping; From c340010e4bf824d969a89fa192ecc7a526c0cd24 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 29 Oct 2005 18:15:51 -0700 Subject: [PATCH 16/98] [PATCH] shrink_list(): skip anon pages if not may_swap Martin Hicks' page cache reclaim patch added the 'may_swap' flag to the scan_control struct; and modified shrink_list() not to add anon pages to the swap cache if may_swap is not asserted. Ref: http://marc.theaimsgroup.com/?l=linux-mm&m=111461480725322&w=4 However, further down, if the page is mapped, shrink_list() calls try_to_unmap() which will call try_to_unmap_one() via try_to_unmap_anon (). 
try_to_unmap_one() will BUG_ON() an anon page that is NOT in the swap cache. Martin says he never encountered this path in his testing, but agrees that it might happen. This patch modifies shrink_list() to skip anon pages that are not already in the swap cache when !may_swap, rather than just not adding them to the cache. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 843c87d1e61f..41d1064aabfb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -417,7 +417,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { + if (PageAnon(page) && !PageSwapCache(page)) { + if (!sc->may_swap) + goto keep_locked; if (!add_to_swap(page)) goto activate_locked; } From 09ad4bbc3a5c93316d7f4ffc0c310d9cbb28c2f0 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:15:52 -0700 Subject: [PATCH 17/98] [PATCH] slab: add additional debugging to detect slabs from the wrong node This patch adds some stack dumps if the slab logic is processing slab blocks from the wrong node. This is necessary in order to detect situations as encountered by Petr. 
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index d30423f167a2..22bfb0b2ac8b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2419,6 +2419,7 @@ retry: next = slab_bufctl(slabp)[slabp->free]; #if DEBUG slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(numa_node_id() != slabp->nodeid); #endif slabp->free = next; } @@ -2633,8 +2634,10 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); - #if DEBUG + /* Verify that the slab belongs to the intended node */ + WARN_ON(slabp->nodeid != node); + if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { printk(KERN_ERR "slab: double free detected in cache " "'%s', objp %p\n", cachep->name, objp); From e040f218bb49a6965a5b77edce05fe47a62dda39 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:53 -0700 Subject: [PATCH 18/98] [PATCH] mm: copy_pte_range progress fix My latency breaking in copy_pte_range didn't work as intended: instead of checking at regularish intervals, after the first interval it checked every time around the loop, too impatient to be preempted. Fix that. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 1db40e935e55..222c13e46130 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -410,7 +410,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, { pte_t *src_pte, *dst_pte; unsigned long vm_flags = vma->vm_flags; - int progress; + int progress = 0; again: dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); @@ -418,17 +418,19 @@ again: return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - progress = 0; spin_lock(&src_mm->page_table_lock); do { /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ - if (progress >= 32 && (need_resched() || - need_lockbreak(&src_mm->page_table_lock) || - need_lockbreak(&dst_mm->page_table_lock))) - break; + if (progress >= 32) { + progress = 0; + if (need_resched() || + need_lockbreak(&src_mm->page_table_lock) || + need_lockbreak(&dst_mm->page_table_lock)) + break; + } if (pte_none(*src_pte)) { progress++; continue; From 0c942a4539c09adf09097315cc174aefd0eeedf7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:53 -0700 Subject: [PATCH 19/98] [PATCH] mm: msync_pte_range progress Use latency breaking in msync_pte_range like that in copy_pte_range, instead of the ugly CONFIG_PREEMPT filemap_msync alternatives. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 38 ++++++++++++++------------------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/mm/msync.c b/mm/msync.c index 9cab3f2d5863..3b5f1c521d4b 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -26,12 +26,21 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { pte_t *pte; + int progress = 0; +again: pte = pte_offset_map(pmd, addr); do { unsigned long pfn; struct page *page; + if (progress >= 64) { + progress = 0; + if (need_resched() || + need_lockbreak(&vma->vm_mm->page_table_lock)) + break; + } + progress++; if (!pte_present(*pte)) continue; if (!pte_maybe_dirty(*pte)) @@ -46,8 +55,12 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); + progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); + cond_resched_lock(&vma->vm_mm->page_table_lock); + if (addr != end) + goto again; } static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -106,29 +119,6 @@ static void msync_page_range(struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); } -#ifdef CONFIG_PREEMPT -static inline void filemap_msync(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - const size_t chunk = 64 * 1024; /* bytes */ - unsigned long next; - - do { - next = addr + chunk; - if (next > end || next < addr) - next = end; - msync_page_range(vma, addr, next); - cond_resched(); - } while (addr = next, addr != end); -} -#else -static inline void filemap_msync(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - msync_page_range(vma, addr, end); -} -#endif - /* * MS_SYNC syncs the entire file - including mappings. 
* @@ -150,7 +140,7 @@ static int msync_interval(struct vm_area_struct *vma, return -EBUSY; if (file && (vma->vm_flags & VM_SHARED)) { - filemap_msync(vma, addr, end); + msync_page_range(vma, addr, end); if (flags & MS_SYNC) { struct address_space *mapping = file->f_mapping; From 6237bcd94851e9cf0ecd2520d744779df0f5a9a6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:54 -0700 Subject: [PATCH 20/98] [PATCH] mm: zap_pte_range dont dirty anon zap_pte_range already avoids wasting time to mark_page_accessed on anon pages: it can also skip anon set_page_dirty - the page only needs to be marked dirty if shared with another mm, but that will say pte_dirty too. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 222c13e46130..fd5d4c6dc762 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -574,12 +574,14 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, addr) != page->index) set_pte_at(tlb->mm, addr, pte, pgoff_to_pte(page->index)); - if (pte_dirty(ptent)) - set_page_dirty(page); if (PageAnon(page)) dec_mm_counter(tlb->mm, anon_rss); - else if (pte_young(ptent)) - mark_page_accessed(page); + else { + if (pte_dirty(ptent)) + set_page_dirty(page); + if (pte_young(ptent)) + mark_page_accessed(page); + } tlb->freed++; page_remove_rmap(page); tlb_remove_page(tlb, page); From 72866f6f277ec0ddd6df7a3b6ecdcf59a28de115 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:55 -0700 Subject: [PATCH 21/98] [PATCH] mm: anon is already wrprotected do_anonymous_page's pte_wrprotect causes some confusion: in such a case, vm_page_prot must already be forcing COW, so must omit write permission, and so the pte_wrprotect is redundant. Replace it by a comment to that effect, and reword the comment on unuse_pte which also caused confusion. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++--- mm/swapfile.c | 7 +++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index fd5d4c6dc762..13667681cd16 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1768,13 +1768,14 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr) { pte_t entry; - struct page * page = ZERO_PAGE(addr); - /* Read-only mapping of ZERO_PAGE. */ - entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); + /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ + entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); /* ..except if it's a write access */ if (write_access) { + struct page *page; + /* Allocate our own private page. */ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); diff --git a/mm/swapfile.c b/mm/swapfile.c index 1dcaeda039f4..05c851291241 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -398,10 +398,9 @@ void free_swap_and_cache(swp_entry_t entry) } /* - * Always set the resulting pte to be nowrite (the same as COW pages - * after one process has exited). We don't know just how many PTEs will - * share this swap entry, so be cautious and let do_wp_page work out - * what to do if a write is requested later. + * No need to decide whether this PTE shares the swap entry with others, + * just let do_wp_page work it out if a write is requested later - to + * force COW, vm_page_prot omits write permission from any private vma. * * vma->vm_mm->page_table_lock is held. */ From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH 22/98] [PATCH] mm: vm_stat_account unshackled The original vm_stat_account has fallen into disuse, with only one user, and only one user of vm_stat_unaccount. It's easier to keep track if we convert them all to __vm_stat_account, then free it from its __shackles. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 ++- arch/ia64/mm/fault.c | 2 +- include/linux/mm.h | 16 ++-------------- kernel/fork.c | 2 +- mm/mmap.c | 20 ++++++++++---------- mm/mprotect.c | 4 ++-- mm/mremap.c | 4 ++-- 7 files changed, 20 insertions(+), 31 deletions(-) diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d71731ee5b61..f7dfc107cb7b 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon insert_vm_struct(mm, vma); mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, + vma_pages(vma)); up_write(&task->mm->mmap_sem); /* diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 3c32af910d60..f21b55549787 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -41,7 +41,7 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address) vma->vm_mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) vma->vm_mm->locked_vm += grow; - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index e1649578fb0c..376a466743bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); +void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); #else -static inline void __vm_stat_account(struct mm_struct *mm, +static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { } #endif /* CONFIG_PROC_FS */ -static inline void 
vm_stat_account(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); -} - -static inline void vm_stat_unaccount(struct vm_area_struct *vma) -{ - __vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - -vma_pages(vma)); -} - /* update per process rss and vm hiwater data */ extern void update_mem_hiwater(struct task_struct *tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 280bd44ac441..e2ff11f8c1b0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -212,7 +212,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) if (mpnt->vm_flags & VM_DONTCOPY) { long pages = vma_pages(mpnt); mm->total_vm -= pages; - __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, + vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, -pages); continue; } diff --git a/mm/mmap.c b/mm/mmap.c index fa11d91242e8..e1780266ac7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -832,7 +832,7 @@ none: } #ifdef CONFIG_PROC_FS -void __vm_stat_account(struct mm_struct *mm, unsigned long flags, +void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { const unsigned long stack_flags @@ -1110,7 +1110,7 @@ munmap_back: } out: mm->total_vm += len >> PAGE_SHIFT; - __vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); + vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { mm->locked_vm += len >> PAGE_SHIFT; make_pages_present(addr, addr + len); @@ -1475,7 +1475,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); + vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); return 0; } @@ -1610,15 +1610,15 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) * By the time this function is called, the area struct has been * removed from the process mapping list. 
*/ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) +static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - size_t len = area->vm_end - area->vm_start; + long nrpages = vma_pages(vma); - area->vm_mm->total_vm -= len >> PAGE_SHIFT; - if (area->vm_flags & VM_LOCKED) - area->vm_mm->locked_vm -= len >> PAGE_SHIFT; - vm_stat_unaccount(area); - remove_vm_struct(area); + mm->total_vm -= nrpages; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + remove_vm_struct(vma); } /* diff --git a/mm/mprotect.c b/mm/mprotect.c index 57577f63b305..b426f01c5e9c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -168,8 +168,8 @@ success: vma->vm_flags = newflags; vma->vm_page_prot = newprot; change_protection(vma, start, end, newprot); - __vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); - __vm_stat_account(mm, newflags, vma->vm_file, nrpages); + vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); + vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; fail: diff --git a/mm/mremap.c b/mm/mremap.c index f343fc73a8bd..55df8f53e84d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -233,7 +233,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * since do_munmap() will decrement it by old_len == new_len */ mm->total_vm += new_len >> PAGE_SHIFT; - __vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ @@ -384,7 +384,7 @@ unsigned long do_mremap(unsigned long addr, addr + new_len, vma->vm_pgoff, NULL); current->mm->total_vm += pages; - __vm_stat_account(vma->vm_mm, vma->vm_flags, + vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; From 2c0b381467bc2997be9d741a152f3fc75785eedc Mon Sep 
17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:56 -0700 Subject: [PATCH 23/98] [PATCH] mm: remove_vma_list consolidation unmap_vma doesn't amount to much, let's put it inside unmap_vma_list. Except it doesn't unmap anything, unmap_region just did the unmapping: rename it to remove_vma_list. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 36 ++++++++++++------------------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e1780266ac7d..eeefe19a0fac 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1603,35 +1603,23 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) } #endif -/* Normal function to fix up a mapping - * This function is the default for when an area has no specific - * function. This may be used as part of a more specific routine. - * - * By the time this function is called, the area struct has been - * removed from the process mapping list. - */ -static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma) -{ - long nrpages = vma_pages(vma); - - mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; - vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); - remove_vm_struct(vma); -} - /* - * Update the VMA and inode share lists. - * - * Ok - we have the memory areas we should free on the 'free' list, + * Ok - we have the memory areas we should free on the vma list, * so release them, and do the vma updates. + * + * Called with the mm semaphore held. 
*/ -static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { do { struct vm_area_struct *next = vma->vm_next; - unmap_vma(mm, vma); + long nrpages = vma_pages(vma); + + mm->total_vm -= nrpages; + if (vma->vm_flags & VM_LOCKED) + mm->locked_vm -= nrpages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); + remove_vm_struct(vma); vma = next; } while (vma); validate_mm(mm); @@ -1799,7 +1787,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) unmap_region(mm, vma, prev, start, end); /* Fix up all other VM information */ - unmap_vma_list(mm, vma); + remove_vma_list(mm, vma); return 0; } From a8fb5618dab7e45c8990f3155628d772a9ed45f9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:57 -0700 Subject: [PATCH 24/98] [PATCH] mm: unlink_file_vma, remove_vma Divide remove_vm_struct into two parts: first anon_vma_unlink plus unlink_file_vma, to unlink the vma from the list and tree by which rmap or vmtruncate might find it; then remove_vma to close, fput and free. The intention here is to do the anon_vma_unlink and unlink_file_vma earlier, in free_pgtables before freeing any page tables: so we can be sure that any page tables traversed by rmap and vmtruncate are stable (and other, ordinary cases are stabilized by holding mmap_sem). This will be crucial to traversing pgd,pud,pmd without page_table_lock. But testing the split-out patch showed that lifting the page_table_lock is symbiotically necessary to make this change - the lock ordering is wrong to move those unlinks into free_pgtables while it's under ptlock. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/mmap.c | 41 +++++++++++++++++++++++++++-------------- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 376a466743bc..0c64484d8ae0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -834,6 +834,7 @@ extern int split_vma(struct mm_struct *, extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, struct rb_node **, struct rb_node *); +extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff); extern void exit_mmap(struct mm_struct *); diff --git a/mm/mmap.c b/mm/mmap.c index eeefe19a0fac..a3984fad3fc2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -181,26 +181,44 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, } /* - * Remove one vm structure and free it. + * Unlink a file-based vm structure from its prio_tree, to hide + * vma from rmap and vmtruncate before freeing its page tables. */ -static void remove_vm_struct(struct vm_area_struct *vma) +void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; - might_sleep(); if (file) { struct address_space *mapping = file->f_mapping; spin_lock(&mapping->i_mmap_lock); __remove_shared_vm_struct(vma, file, mapping); spin_unlock(&mapping->i_mmap_lock); } +} + +/* + * Close a vm structure and free it, returning the next. + */ +static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +{ + struct vm_area_struct *next = vma->vm_next; + + /* + * Hide vma from rmap and vmtruncate before freeing page tables: + * to be moved into free_pgtables once page_table_lock is lifted + * from it, but until then lock ordering forbids that move. 
+ */ + anon_vma_unlink(vma); + unlink_file_vma(vma); + + might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); - if (file) - fput(file); - anon_vma_unlink(vma); + if (vma->vm_file) + fput(vma->vm_file); mpol_free(vma_policy(vma)); kmem_cache_free(vm_area_cachep, vma); + return next; } asmlinkage unsigned long sys_brk(unsigned long brk) @@ -1612,15 +1630,13 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { do { - struct vm_area_struct *next = vma->vm_next; long nrpages = vma_pages(vma); mm->total_vm -= nrpages; if (vma->vm_flags & VM_LOCKED) mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); - remove_vm_struct(vma); - vma = next; + vma = remove_vma(vma); } while (vma); validate_mm(mm); } @@ -1944,11 +1960,8 @@ void exit_mmap(struct mm_struct *mm) * Walk the list again, actually closing and freeing it * without holding any MM locks. */ - while (vma) { - struct vm_area_struct *next = vma->vm_next; - remove_vm_struct(vma); - vma = next; - } + while (vma) + vma = remove_vma(vma); BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } From 7c1fd6b964860cdcf44b6b98d7dcd8cc16a0a26d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:58 -0700 Subject: [PATCH 25/98] [PATCH] mm: exit_mmap need not reset exit_mmap resets various mm_struct fields, but the mm is well on its way out, and none of those fields matter by this point. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a3984fad3fc2..459b9f068ad7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1948,12 +1948,6 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); - mm->mmap = mm->mmap_cache = NULL; - mm->mm_rb = RB_ROOT; - set_mm_counter(mm, rss, 0); - mm->total_vm = 0; - mm->locked_vm = 0; - spin_unlock(&mm->page_table_lock); /* From 65500d234e74fc4e8f18e1a429bc24e51e75de4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:15:59 -0700 Subject: [PATCH 26/98] [PATCH] mm: page fault handlers tidyup Impose a little more consistency on the page fault handlers do_wp_page, do_swap_page, do_anonymous_page, do_no_page, do_file_page: why not pass their arguments in the same order, called the same names? break_cow is all very well, but what it did was inlined elsewhere: easier to compare if it's brought back into do_wp_page. do_file_page's fallback to do_no_page dates from a time when we were testing pte_file by using it wherever possible: currently it's peculiar to nonlinear vmas, so just check that. BUG_ON if not? Better not, it's probably page table corruption, so just show the pte: hmm, there's a pte_ERROR macro, let's use that for do_wp_page's invalid pfn too. Hah! Someone in the ppc64 world noticed pte_ERROR was unused so removed it: restored (and say "pud" not "pmd" in its pud_ERROR). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-ppc64/pgtable.h | 4 +- mm/filemap.c | 2 +- mm/memory.c | 220 ++++++++++++++++-------------------- mm/shmem.c | 2 +- 4 files changed, 102 insertions(+), 126 deletions(-) diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h index c83679c9d2b0..2eb1778a3a15 100644 --- a/include/asm-ppc64/pgtable.h +++ b/include/asm-ppc64/pgtable.h @@ -478,10 +478,12 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, #define __HAVE_ARCH_PTE_SAME #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) +#define pte_ERROR(e) \ + printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e)) + printk("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e)) #define pgd_ERROR(e) \ printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) diff --git a/mm/filemap.c b/mm/filemap.c index 1c31b2fd2ca5..8aa344e88489 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1520,7 +1520,7 @@ repeat: page_cache_release(page); return err; } - } else { + } else if (vma->vm_flags & VM_NONLINEAR) { /* No page was found just because we can't read it in now (being * here implies nonblock != 0), but the page may exist, so set * the PTE to fault it in later. 
*/ diff --git a/mm/memory.c b/mm/memory.c index 13667681cd16..eaf79031f573 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1212,29 +1212,11 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -/* - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock - */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) -{ - pte_t entry; - - entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)), - vma); - ptep_establish(vma, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); -} - /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * - * Goto-purists beware: the only reason for goto's here is that it results - * in better assembly code.. The "default" path will see no jumps at all. - * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary @@ -1247,25 +1229,22 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page * We hold the mm semaphore and the page_table_lock on entry and exit * with the page_table_lock released. 
*/ -static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) +static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + pte_t orig_pte) { struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(pte); + unsigned long pfn = pte_pfn(orig_pte); pte_t entry; - int ret; + int ret = VM_FAULT_MINOR; if (unlikely(!pfn_valid(pfn))) { /* - * This should really halt the system so it can be debugged or - * at least the kernel stops what it's doing before it corrupts - * data, but for the moment just pretend this is OOM. + * Page table corrupted: show pte and kill process. */ - pte_unmap(page_table); - printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", - address); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; + pte_ERROR(orig_pte); + ret = VM_FAULT_OOM; + goto unlock; } old_page = pfn_to_page(pfn); @@ -1274,52 +1253,57 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, unlock_page(old_page); if (reuse) { flush_cache_page(vma, address, pfn); - entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), - vma); + entry = pte_mkyoung(orig_pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); ptep_set_access_flags(vma, address, page_table, entry, 1); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR|VM_FAULT_WRITE; + ret |= VM_FAULT_WRITE; + goto unlock; } } - pte_unmap(page_table); /* * Ok, we need to copy. Oh, well.. 
*/ if (!PageReserved(old_page)) page_cache_get(old_page); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto no_new_page; + goto oom; if (old_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) - goto no_new_page; + goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) - goto no_new_page; + goto oom; copy_user_highpage(new_page, old_page, address); } + /* * Re-check the pte - we dropped the lock */ - ret = VM_FAULT_MINOR; spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (likely(pte_same(*page_table, pte))) { + if (likely(pte_same(*page_table, orig_pte))) { if (PageAnon(old_page)) dec_mm_counter(mm, anon_rss); if (PageReserved(old_page)) inc_mm_counter(mm, rss); else page_remove_rmap(old_page); + flush_cache_page(vma, address, pfn); - break_cow(vma, new_page, address, page_table); + entry = mk_pte(new_page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + ptep_establish(vma, address, page_table, entry); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1327,13 +1311,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, new_page = old_page; ret |= VM_FAULT_WRITE; } - pte_unmap(page_table); page_cache_release(new_page); page_cache_release(old_page); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); return ret; - -no_new_page: +oom: page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1661,17 +1645,19 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc * We hold the mm semaphore and the page_table_lock on entry and * should release the pagetable lock on exit.. 
*/ -static int do_swap_page(struct mm_struct * mm, - struct vm_area_struct * vma, unsigned long address, - pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) +static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) { struct page *page; - swp_entry_t entry = pte_to_swp_entry(orig_pte); + swp_entry_t entry; pte_t pte; int ret = VM_FAULT_MINOR; pte_unmap(page_table); spin_unlock(&mm->page_table_lock); + + entry = pte_to_swp_entry(orig_pte); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1685,11 +1671,7 @@ static int do_swap_page(struct mm_struct * mm, page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; - else - ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* Had to read the page from swap area: Major fault */ @@ -1745,6 +1727,7 @@ static int do_swap_page(struct mm_struct * mm, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); lazy_mmu_prot_update(pte); +unlock: pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: @@ -1754,7 +1737,7 @@ out_nomap: spin_unlock(&mm->page_table_lock); unlock_page(page); page_cache_release(page); - goto out; + return ret; } /* @@ -1762,17 +1745,15 @@ out_nomap: * spinlock held to protect against concurrent faults in * multithreaded programs. 
*/ -static int -do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, - pte_t *page_table, pmd_t *pmd, int write_access, - unsigned long addr) +static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) { pte_t entry; /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); - /* ..except if it's a write access */ if (write_access) { struct page *page; @@ -1781,39 +1762,36 @@ do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) - goto no_mem; - page = alloc_zeroed_user_highpage(vma, addr); + goto oom; + page = alloc_zeroed_user_highpage(vma, address); if (!page) - goto no_mem; + goto oom; spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); + page_table = pte_offset_map(pmd, address); if (!pte_none(*page_table)) { - pte_unmap(page_table); page_cache_release(page); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } inc_mm_counter(mm, rss); - entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot)), - vma); + entry = mk_pte(page, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); lru_cache_add_active(page); SetPageReferenced(page); - page_add_anon_rmap(page, vma, addr); + page_add_anon_rmap(page, vma, address); } - set_pte_at(mm, addr, page_table, entry); - pte_unmap(page_table); + set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, entry); + update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); -out: return VM_FAULT_MINOR; -no_mem: +oom: return VM_FAULT_OOM; } @@ -1829,20 +1807,17 @@ no_mem: * This is called with the MM semaphore held and the page table * spinlock held. 
Exit with the spinlock released. */ -static int -do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) +static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access) { - struct page * new_page; + struct page *new_page; struct address_space *mapping = NULL; pte_t entry; unsigned int sequence = 0; int ret = VM_FAULT_MINOR; int anon = 0; - if (!vma->vm_ops || !vma->vm_ops->nopage) - return do_anonymous_page(mm, vma, page_table, - pmd, write_access, address); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1852,7 +1827,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, smp_rmb(); /* serializes i_size against truncate_count */ } retry: - cond_resched(); new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); /* * No smp_rmb is needed here as long as there's a full @@ -1892,9 +1866,11 @@ retry: * retry getting the page. */ if (mapping && unlikely(sequence != mapping->truncate_count)) { - sequence = mapping->truncate_count; spin_unlock(&mm->page_table_lock); page_cache_release(new_page); + cond_resched(); + sequence = mapping->truncate_count; + smp_rmb(); goto retry; } page_table = pte_offset_map(pmd, address); @@ -1924,25 +1900,22 @@ retry: page_add_anon_rmap(new_page, vma, address); } else page_add_file_rmap(new_page); - pte_unmap(page_table); } else { /* One of our sibling threads was faster, back out. 
*/ - pte_unmap(page_table); page_cache_release(new_page); - spin_unlock(&mm->page_table_lock); - goto out; + goto unlock; } /* no need to invalidate: a not-present page shouldn't be cached */ update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); +unlock: + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); -out: return ret; oom: page_cache_release(new_page); - ret = VM_FAULT_OOM; - goto out; + return VM_FAULT_OOM; } /* @@ -1950,29 +1923,28 @@ oom: * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. */ -static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, - unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) +static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pmd_t *pmd, + int write_access, pte_t orig_pte) { - unsigned long pgoff; + pgoff_t pgoff; int err; - BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); - /* - * Fall back to the linear mapping if the fs does not support - * ->populate: - */ - if (!vma->vm_ops->populate || - (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(mm, address, pte); - return do_no_page(mm, vma, address, write_access, pte, pmd); - } - - pgoff = pte_to_pgoff(*pte); - - pte_unmap(pte); + pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); + if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { + /* + * Page table corrupted: show pte and kill process. + */ + pte_ERROR(orig_pte); + return VM_FAULT_OOM; + } + /* We can then assume vm->vm_ops && vma->vm_ops->populate */ + + pgoff = pte_to_pgoff(orig_pte); + err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, + vma->vm_page_prot, pgoff, 0); if (err == -ENOMEM) return VM_FAULT_OOM; if (err) @@ -2002,23 +1974,25 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, * release it when done. 
*/ static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct * vma, unsigned long address, - int write_access, pte_t *pte, pmd_t *pmd) + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; entry = *pte; if (!pte_present(entry)) { - /* - * If it truly wasn't present, we know that kswapd - * and the PTE updates will not touch it later. So - * drop the lock. - */ - if (pte_none(entry)) - return do_no_page(mm, vma, address, write_access, pte, pmd); + if (pte_none(entry)) { + if (!vma->vm_ops || !vma->vm_ops->nopage) + return do_anonymous_page(mm, vma, address, + pte, pmd, write_access); + return do_no_page(mm, vma, address, + pte, pmd, write_access); + } if (pte_file(entry)) - return do_file_page(mm, vma, address, write_access, pte, pmd); - return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); + return do_file_page(mm, vma, address, + pte, pmd, write_access, entry); + return do_swap_page(mm, vma, address, + pte, pmd, write_access, entry); } if (write_access) { @@ -2038,7 +2012,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, /* * By the time we get here, we already hold the mm semaphore */ -int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { pgd_t *pgd; @@ -2072,7 +2046,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, if (!pte) goto oom; - return handle_pte_fault(mm, vma, address, write_access, pte, pmd); + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); oom: spin_unlock(&mm->page_table_lock); diff --git a/mm/shmem.c b/mm/shmem.c index 55e04a0734c1..6796311a23ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1201,7 +1201,7 @@ static int shmem_populate(struct vm_area_struct *vma, page_cache_release(page); return err; } - } else { + } else if (vma->vm_flags & VM_NONLINEAR) { /* No 
page was found just because we can't read it in * now (being here implies nonblock != 0), but the page * may exist, so set the PTE to fault it in later. */ From 7be7a546994f1222b2312fd348da14e16b6b7b42 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:00 -0700 Subject: [PATCH 27/98] [PATCH] mm: move_page_tables by extents Speeding up mremap's moving of ptes has never been a priority, but the locking will get more complicated shortly, and is already too baroque. Scrap the current one-by-one moving, do an extent at a time: curtailed by end of src and dst pmds (have to use PMD_SIZE: the way pmd_addr_end gets elided doesn't match this usage), and by latency considerations. One nice property of the old method is lost: it never allocated a page table unless absolutely necessary, so you could free empty page tables by mremapping to and fro. Whereas this way, it allocates a dst table wherever there was a src table. I keep diving in to reinstate the old behaviour, then come out preferring not to clutter how it now is. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 166 ++++++++++++++++++++++------------------------------ 1 file changed, 71 insertions(+), 95 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 55df8f53e84d..f4e562098500 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,40 +22,15 @@ #include #include -static pte_t *get_one_pte_map_nested(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte = NULL; - - pgd = pgd_offset(mm, addr); - if (pgd_none_or_clear_bad(pgd)) - goto end; - - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) - goto end; - - pmd = pmd_offset(pud, addr); - if (pmd_none_or_clear_bad(pmd)) - goto end; - - pte = pte_offset_map_nested(pmd, addr); - if (pte_none(*pte)) { - pte_unmap_nested(pte); - pte = NULL; - } -end: - return pte; -} - -static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + /* + * We don't need page_table_lock: we have mmap_sem exclusively. + */ pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; @@ -68,35 +43,48 @@ static pte_t *get_one_pte_map(struct mm_struct *mm, unsigned long addr) if (pmd_none_or_clear_bad(pmd)) return NULL; - return pte_offset_map(pmd, addr); + return pmd; } -static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd; - pte_t *pte = NULL; + pmd_t *pmd = NULL; + pte_t *pte; + /* + * We do need page_table_lock: because allocators expect that. 
+ */ + spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); if (!pud) - return NULL; + goto out; + pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); - return pte; + if (!pmd) + goto out; + + pte = pte_alloc_map(mm, pmd, addr); + if (!pte) { + pmd = NULL; + goto out; + } + pte_unmap(pte); +out: + spin_unlock(&mm->page_table_lock); + return pmd; } -static int -move_one_page(struct vm_area_struct *vma, unsigned long old_addr, - struct vm_area_struct *new_vma, unsigned long new_addr) +static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, + unsigned long old_addr, unsigned long old_end, + struct vm_area_struct *new_vma, pmd_t *new_pmd, + unsigned long new_addr) { struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; - int error = 0; - pte_t *src, *dst; + pte_t *old_pte, *new_pte, pte; if (vma->vm_file) { /* @@ -111,74 +99,62 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr, new_vma->vm_truncate_count != vma->vm_truncate_count) new_vma->vm_truncate_count = 0; } + spin_lock(&mm->page_table_lock); + old_pte = pte_offset_map(old_pmd, old_addr); + new_pte = pte_offset_map_nested(new_pmd, new_addr); - src = get_one_pte_map_nested(mm, old_addr); - if (src) { - /* - * Look to see whether alloc_one_pte_map needs to perform a - * memory allocation. If it does then we need to drop the - * atomic kmap - */ - dst = get_one_pte_map(mm, new_addr); - if (unlikely(!dst)) { - pte_unmap_nested(src); - if (mapping) - spin_unlock(&mapping->i_mmap_lock); - dst = alloc_one_pte_map(mm, new_addr); - if (mapping && !spin_trylock(&mapping->i_mmap_lock)) { - spin_unlock(&mm->page_table_lock); - spin_lock(&mapping->i_mmap_lock); - spin_lock(&mm->page_table_lock); - } - src = get_one_pte_map_nested(mm, old_addr); - } - /* - * Since alloc_one_pte_map can drop and re-acquire - * page_table_lock, we should re-check the src entry... 
- */ - if (src) { - if (dst) { - pte_t pte; - pte = ptep_clear_flush(vma, old_addr, src); - - /* ZERO_PAGE can be dependant on virtual addr */ - pte = move_pte(pte, new_vma->vm_page_prot, - old_addr, new_addr); - set_pte_at(mm, new_addr, dst, pte); - } else - error = -ENOMEM; - pte_unmap_nested(src); - } - if (dst) - pte_unmap(dst); + for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, + new_pte++, new_addr += PAGE_SIZE) { + if (pte_none(*old_pte)) + continue; + pte = ptep_clear_flush(vma, old_addr, old_pte); + /* ZERO_PAGE can be dependant on virtual addr */ + pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); + set_pte_at(mm, new_addr, new_pte, pte); } + + pte_unmap_nested(new_pte - 1); + pte_unmap(old_pte - 1); spin_unlock(&mm->page_table_lock); if (mapping) spin_unlock(&mapping->i_mmap_lock); - return error; } +#define LATENCY_LIMIT (64 * PAGE_SIZE) + static unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len) { - unsigned long offset; + unsigned long extent, next, old_end; + pmd_t *old_pmd, *new_pmd; - flush_cache_range(vma, old_addr, old_addr + len); + old_end = old_addr + len; + flush_cache_range(vma, old_addr, old_end); - /* - * This is not the clever way to do this, but we're taking the - * easy way out on the assumption that most remappings will be - * only a few pages.. This also makes error recovery easier. 
- */ - for (offset = 0; offset < len; offset += PAGE_SIZE) { - if (move_one_page(vma, old_addr + offset, - new_vma, new_addr + offset) < 0) - break; + for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); + next = (old_addr + PMD_SIZE) & PMD_MASK; + if (next - 1 > old_end) + next = old_end; + extent = next - old_addr; + old_pmd = get_old_pmd(vma->vm_mm, old_addr); + if (!old_pmd) + continue; + new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + if (!new_pmd) + break; + next = (new_addr + PMD_SIZE) & PMD_MASK; + if (extent > next - new_addr) + extent = next - new_addr; + if (extent > LATENCY_LIMIT) + extent = LATENCY_LIMIT; + move_ptes(vma, old_pmd, old_addr, old_addr + extent, + new_vma, new_pmd, new_addr); } - return offset; + + return len + old_addr - old_end; /* how much done */ } static unsigned long move_vma(struct vm_area_struct *vma, From 15a23ffa2fc91cebdac44d4aee994f59d5c28dc0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:01 -0700 Subject: [PATCH 28/98] [PATCH] mm: tlb_gather_mmu get_cpu_var tlb_gather_mmu dates from before kernel preemption was allowed, and uses smp_processor_id or __get_cpu_var to find its per-cpu mmu_gather. That works because it's currently only called after getting page_table_lock, which is not dropped until after the matching tlb_finish_mmu. But don't rely on that, it will soon change: now disable preemption internally by proper get_cpu_var in tlb_gather_mmu, put_cpu_var in tlb_finish_mmu. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-arm/tlb.h | 5 +++-- include/asm-arm26/tlb.h | 7 ++++--- include/asm-generic/tlb.h | 10 +++++----- include/asm-ia64/tlb.h | 6 ++++-- include/asm-sparc64/tlb.h | 4 +++- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h index 9bb325c54645..da41df20928f 100644 --- a/include/asm-arm/tlb.h +++ b/include/asm-arm/tlb.h @@ -39,8 +39,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - int cpu = smp_processor_id(); - struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; tlb->freed = 0; @@ -65,6 +64,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); + + put_cpu_var(mmu_gathers); } static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb) diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h index 1316352a58f3..8486b00a6799 100644 --- a/include/asm-arm26/tlb.h +++ b/include/asm-arm26/tlb.h @@ -17,13 +17,12 @@ struct mmu_gather { unsigned int avoided_flushes; }; -extern struct mmu_gather mmu_gathers[NR_CPUS]; +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - int cpu = smp_processor_id(); - struct mmu_gather *tlb = &mmu_gathers[cpu]; + struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; tlb->freed = 0; @@ -52,6 +51,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); + + put_cpu_var(mmu_gathers); } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 7d0298347ee7..c8232622c8d9 100644 --- 
a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -35,9 +35,7 @@ #endif /* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. This structure - * can be per-CPU or per-MM as the page table lock is held for the duration of - * TLB shootdown. + * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; @@ -57,7 +55,7 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); static inline struct mmu_gather * tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id()); + struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; @@ -85,7 +83,7 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources - * that were required. The page table lock is still held at this point. + * that were required. 
*/ static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) @@ -101,6 +99,8 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); + + put_cpu_var(mmu_gathers); } static inline unsigned int diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h index 3a9a6d1be75c..1b82299d7c1e 100644 --- a/include/asm-ia64/tlb.h +++ b/include/asm-ia64/tlb.h @@ -129,7 +129,7 @@ ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long e static inline struct mmu_gather * tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *tlb = &__get_cpu_var(mmu_gathers); + struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; /* @@ -154,7 +154,7 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) /* * Called at the end of the shootdown operation to free up any resources that were - * collected. The page table lock is still held at this point. + * collected. 
*/ static inline void tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) @@ -174,6 +174,8 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) /* keep the page table cache within bounds */ check_pgt_cache(); + + put_cpu_var(mmu_gathers); } static inline unsigned int diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index 9baf57db01d2..169309bdbf82 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -44,7 +44,7 @@ extern void flush_tlb_pending(void); static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) { - struct mmu_gather *mp = &__get_cpu_var(mmu_gathers); + struct mmu_gather *mp = &get_cpu_var(mmu_gathers); BUG_ON(mp->tlb_nr); @@ -97,6 +97,8 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un /* keep the page table cache within bounds */ check_pgt_cache(); + + put_cpu_var(mmu_gathers); } static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp) From 4d6ddfa9242bc3d27fb0f7248f6fdee0299c731f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:02 -0700 Subject: [PATCH 29/98] [PATCH] mm: tlb_is_full_mm was obscure tlb_is_full_mm? What does that mean? The TLB is full? No, it means that the mm's last user has gone and the whole mm is being torn down. And it's an inline function because sparc64 uses a different (slightly better) "tlb_frozen" name for the flag others call "fullmm". And now the ptep_get_and_clear_full macro used in zap_pte_range refers directly to tlb->fullmm, which would be wrong for sparc64. Rather than correct that, I'd prefer to scrap tlb_is_full_mm altogether, and change sparc64 to just use the same poor name as everyone else - is that okay? 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/mm/tlb.c | 4 ++-- include/asm-arm/tlb.h | 5 ----- include/asm-arm26/tlb.h | 7 ------- include/asm-generic/tlb.h | 6 ------ include/asm-ia64/tlb.h | 6 ------ include/asm-sparc64/tlb.h | 13 ++++--------- mm/memory.c | 4 ++-- 7 files changed, 8 insertions(+), 37 deletions(-) diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index 90ca99d0b89c..6a43f7cd090e 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c @@ -72,7 +72,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, pte_t *ptep, pte_t no_cache_flush: - if (mp->tlb_frozen) + if (mp->fullmm) return; nr = mp->tlb_nr; @@ -97,7 +97,7 @@ void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long unsigned long nr = mp->tlb_nr; long s = start, e = end, vpte_base; - if (mp->tlb_frozen) + if (mp->fullmm) return; /* If start is greater than end, that is a real problem. */ diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h index da41df20928f..a35ab0f2e25e 100644 --- a/include/asm-arm/tlb.h +++ b/include/asm-arm/tlb.h @@ -68,11 +68,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) put_cpu_var(mmu_gathers); } -static inline unsigned int tlb_is_full_mm(struct mmu_gather *tlb) -{ - return tlb->fullmm; -} - #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) /* diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h index 8486b00a6799..c7d54ca0a239 100644 --- a/include/asm-arm26/tlb.h +++ b/include/asm-arm26/tlb.h @@ -55,13 +55,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) put_cpu_var(mmu_gathers); } - -static inline unsigned int -tlb_is_full_mm(struct mmu_gather *tlb) -{ - return tlb->fullmm; -} - #define tlb_remove_tlb_entry(tlb,ptep,address) do { } while (0) //#define tlb_start_vma(tlb,vma) do { } while (0) //FIXME - ARM32 uses this now that things changed in the kernel. 
seems like it may be pointless on arm26, however to get things compiling... diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index c8232622c8d9..5d352a70f004 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -103,12 +103,6 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) put_cpu_var(mmu_gathers); } -static inline unsigned int -tlb_is_full_mm(struct mmu_gather *tlb) -{ - return tlb->fullmm; -} - /* tlb_remove_page * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while * handling the additional races in SMP caused by other CPUs caching valid diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h index 1b82299d7c1e..0bbd79f6a793 100644 --- a/include/asm-ia64/tlb.h +++ b/include/asm-ia64/tlb.h @@ -178,12 +178,6 @@ tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) put_cpu_var(mmu_gathers); } -static inline unsigned int -tlb_is_full_mm(struct mmu_gather *tlb) -{ - return tlb->fullmm; -} - /* * Logically, this routine frees PAGE. On MP machines, the actual freeing of the page * must be delayed until after the TLB has been flushed (see comments at the beginning of diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index 169309bdbf82..5d194eae870c 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -25,7 +25,7 @@ struct mmu_gather { struct mm_struct *mm; unsigned int pages_nr; unsigned int need_flush; - unsigned int tlb_frozen; + unsigned int fullmm; unsigned int tlb_nr; unsigned long freed; unsigned long vaddrs[TLB_BATCH_NR]; @@ -50,7 +50,7 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i mp->mm = mm; mp->pages_nr = num_online_cpus() > 1 ? 
0U : ~0U; - mp->tlb_frozen = full_mm_flush; + mp->fullmm = full_mm_flush; mp->freed = 0; return mp; @@ -88,10 +88,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un tlb_flush_mmu(mp); - if (mp->tlb_frozen) { + if (mp->fullmm) { if (CTX_VALID(mm->context)) do_flush_tlb_mm(mm); - mp->tlb_frozen = 0; + mp->fullmm = 0; } else flush_tlb_pending(); @@ -101,11 +101,6 @@ static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, un put_cpu_var(mmu_gathers); } -static inline unsigned int tlb_is_full_mm(struct mmu_gather *mp) -{ - return mp->tlb_frozen; -} - static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) { mp->need_flush = 1; diff --git a/mm/memory.c b/mm/memory.c index eaf79031f573..585bb4e0b97f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -249,7 +249,7 @@ void free_pgd_range(struct mmu_gather **tlb, free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); - if (!tlb_is_full_mm(*tlb)) + if (!(*tlb)->fullmm) flush_tlb_pgtables((*tlb)->mm, start, end); } @@ -698,7 +698,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; - int fullmm = tlb_is_full_mm(*tlbp); + int fullmm = (*tlbp)->fullmm; for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; From fc2acab31be8e869b2d5f6de12f557f6f054f19c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:03 -0700 Subject: [PATCH 30/98] [PATCH] mm: tlb_finish_mmu forget rss zap_pte_range has been counting the pages it frees in tlb->freed, then tlb_finish_mmu has used that to update the mm's rss. That got stranger when I added anon_rss, yet updated it by a different route; and stranger when rss and anon_rss became mm_counters with special access macros. 
And it would no longer be viable if we're relying on page_table_lock to stabilize the mm_counter, but calling tlb_finish_mmu outside that lock. Remove the mmu_gather's freed field, let tlb_finish_mmu stick to its own business, just decrement the rss mm_counter in zap_pte_range (yes, there was some point to batching the update, and a subsequent patch restores that). And forget the anal paranoia of first reading the counter to avoid going negative - if rss does go negative, just fix that bug. Remove the mmu_gather's flushes and avoided_flushes from arm and arm26: no use was being made of them. But arm26 alone was actually using the freed, in the way some others use need_flush: give it a need_flush. arm26 seems to prefer spaces to tabs here: respect that. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/mm/tlb.c | 3 +-- include/asm-arm/tlb.h | 15 +-------------- include/asm-arm26/tlb.h | 35 +++++++++++++---------------------- include/asm-generic/tlb.h | 9 --------- include/asm-ia64/tlb.h | 9 --------- include/asm-sparc64/tlb.h | 14 ++------------ mm/memory.c | 2 +- 7 files changed, 18 insertions(+), 69 deletions(-) diff --git a/arch/sparc64/mm/tlb.c b/arch/sparc64/mm/tlb.c index 6a43f7cd090e..8b104be4662b 100644 --- a/arch/sparc64/mm/tlb.c +++ b/arch/sparc64/mm/tlb.c @@ -18,8 +18,7 @@ /* Heavily inspired by the ppc64 code. 
*/ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = - { NULL, 0, 0, 0, 0, 0, { 0 }, { NULL }, }; +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers) = { 0, }; void flush_tlb_pending(void) { diff --git a/include/asm-arm/tlb.h b/include/asm-arm/tlb.h index a35ab0f2e25e..f49bfb78c221 100644 --- a/include/asm-arm/tlb.h +++ b/include/asm-arm/tlb.h @@ -27,11 +27,7 @@ */ struct mmu_gather { struct mm_struct *mm; - unsigned int freed; unsigned int fullmm; - - unsigned int flushes; - unsigned int avoided_flushes; }; DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -42,7 +38,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; - tlb->freed = 0; tlb->fullmm = full_mm_flush; return tlb; @@ -51,16 +46,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - struct mm_struct *mm = tlb->mm; - unsigned long freed = tlb->freed; - int rss = get_mm_counter(mm, rss); - - if (rss < freed) - freed = rss; - add_mm_counter(mm, rss, -freed); - if (tlb->fullmm) - flush_tlb_mm(mm); + flush_tlb_mm(tlb->mm); /* keep the page table cache within bounds */ check_pgt_cache(); diff --git a/include/asm-arm26/tlb.h b/include/asm-arm26/tlb.h index c7d54ca0a239..08ddd85b8d35 100644 --- a/include/asm-arm26/tlb.h +++ b/include/asm-arm26/tlb.h @@ -10,11 +10,8 @@ */ struct mmu_gather { struct mm_struct *mm; - unsigned int freed; - unsigned int fullmm; - - unsigned int flushes; - unsigned int avoided_flushes; + unsigned int need_flush; + unsigned int fullmm; }; DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); @@ -25,8 +22,8 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; - tlb->freed = 0; - tlb->fullmm = full_mm_flush; + tlb->need_flush = 0; + tlb->fullmm = full_mm_flush; return tlb; } @@ -34,20 +31,8 @@ tlb_gather_mmu(struct 
mm_struct *mm, unsigned int full_mm_flush) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - struct mm_struct *mm = tlb->mm; - unsigned long freed = tlb->freed; - int rss = get_mm_counter(mm, rss); - - if (rss < freed) - freed = rss; - add_mm_counter(mm, rss, -freed); - - if (freed) { - flush_tlb_mm(mm); - tlb->flushes++; - } else { - tlb->avoided_flushes++; - } + if (tlb->need_flush) + flush_tlb_mm(tlb->mm); /* keep the page table cache within bounds */ check_pgt_cache(); @@ -65,7 +50,13 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) } while (0) #define tlb_end_vma(tlb,vma) do { } while (0) -#define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) +static inline void +tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->need_flush = 1; + free_page_and_swap_cache(page); +} + #define pte_free_tlb(tlb,ptep) pte_free(ptep) #define pmd_free_tlb(tlb,pmdp) pmd_free(pmdp) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 5d352a70f004..cdd4145243cd 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -42,7 +42,6 @@ struct mmu_gather { unsigned int nr; /* set to ~0U means fast mode */ unsigned int need_flush;/* Really unmapped some ptes? */ unsigned int fullmm; /* non-zero means full mm flush */ - unsigned long freed; struct page * pages[FREE_PTE_NR]; }; @@ -63,7 +62,6 @@ tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush) tlb->nr = num_online_cpus() > 1 ? 
0U : ~0U; tlb->fullmm = full_mm_flush; - tlb->freed = 0; return tlb; } @@ -88,13 +86,6 @@ tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) static inline void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { - int freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - int rss = get_mm_counter(mm, rss); - - if (rss < freed) - freed = rss; - add_mm_counter(mm, rss, -freed); tlb_flush_mmu(tlb, start, end); /* keep the page table cache within bounds */ diff --git a/include/asm-ia64/tlb.h b/include/asm-ia64/tlb.h index 0bbd79f6a793..834370b9dea1 100644 --- a/include/asm-ia64/tlb.h +++ b/include/asm-ia64/tlb.h @@ -60,7 +60,6 @@ struct mmu_gather { unsigned int nr; /* == ~0U => fast mode */ unsigned char fullmm; /* non-zero means full mm flush */ unsigned char need_flush; /* really unmapped some PTEs? */ - unsigned long freed; /* number of pages freed */ unsigned long start_addr; unsigned long end_addr; struct page *pages[FREE_PTE_NR]; @@ -147,7 +146,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) */ tlb->nr = (num_online_cpus() == 1) ? ~0U : 0; tlb->fullmm = full_mm_flush; - tlb->freed = 0; tlb->start_addr = ~0UL; return tlb; } @@ -159,13 +157,6 @@ tlb_gather_mmu (struct mm_struct *mm, unsigned int full_mm_flush) static inline void tlb_finish_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) { - unsigned long freed = tlb->freed; - struct mm_struct *mm = tlb->mm; - unsigned long rss = get_mm_counter(mm, rss); - - if (rss < freed) - freed = rss; - add_mm_counter(mm, rss, -freed); /* * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and * tlb->end_addr. 
diff --git a/include/asm-sparc64/tlb.h b/include/asm-sparc64/tlb.h index 5d194eae870c..66138d959df5 100644 --- a/include/asm-sparc64/tlb.h +++ b/include/asm-sparc64/tlb.h @@ -27,7 +27,6 @@ struct mmu_gather { unsigned int need_flush; unsigned int fullmm; unsigned int tlb_nr; - unsigned long freed; unsigned long vaddrs[TLB_BATCH_NR]; struct page *pages[FREE_PTE_NR]; }; @@ -51,7 +50,6 @@ static inline struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned i mp->mm = mm; mp->pages_nr = num_online_cpus() > 1 ? 0U : ~0U; mp->fullmm = full_mm_flush; - mp->freed = 0; return mp; } @@ -78,19 +76,11 @@ extern void smp_flush_tlb_mm(struct mm_struct *mm); static inline void tlb_finish_mmu(struct mmu_gather *mp, unsigned long start, unsigned long end) { - unsigned long freed = mp->freed; - struct mm_struct *mm = mp->mm; - unsigned long rss = get_mm_counter(mm, rss); - - if (rss < freed) - freed = rss; - add_mm_counter(mm, rss, -freed); - tlb_flush_mmu(mp); if (mp->fullmm) { - if (CTX_VALID(mm->context)) - do_flush_tlb_mm(mm); + if (CTX_VALID(mp->mm->context)) + do_flush_tlb_mm(mp->mm); mp->fullmm = 0; } else flush_tlb_pending(); diff --git a/mm/memory.c b/mm/memory.c index 585bb4e0b97f..51eb38574830 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -582,7 +582,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, if (pte_young(ptent)) mark_page_accessed(page); } - tlb->freed++; + dec_mm_counter(tlb->mm, rss); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:04 -0700 Subject: [PATCH 31/98] [PATCH] mm: mm_init set_mm_counters How is anon_rss initialized? In dup_mmap, and by mm_alloc's memset; but that's not so good if an mm_counter_t is a special type. And how is rss initialized? By set_mm_counter, all over the place. 
Come on, we just need to initialize them both at once by set_mm_counter in mm_init (which follows the memcpy when forking). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/kernel/irixelf.c | 1 - arch/sparc64/kernel/binfmt_aout32.c | 1 - arch/x86_64/ia32/ia32_aout.c | 1 - fs/binfmt_aout.c | 1 - fs/binfmt_elf.c | 1 - fs/binfmt_elf_fdpic.c | 7 ------- fs/binfmt_flat.c | 1 - fs/binfmt_som.c | 1 - kernel/fork.c | 4 ++-- 9 files changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c index 99262fe64560..7ce34d4aa220 100644 --- a/arch/mips/kernel/irixelf.c +++ b/arch/mips/kernel/irixelf.c @@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. We will * change some of these later. */ - set_mm_counter(current->mm, rss, 0); setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); current->mm->start_stack = bprm->p; diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c index b2854ef221d0..edf52d06b280 100644 --- a/arch/sparc64/kernel/binfmt_aout32.c +++ b/arch/sparc64/kernel/binfmt_aout32.c @@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->brk = ex.a_bss + (current->mm->start_brk = N_BSSADDR(ex)); - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c index 3e6780fa0186..93c60f4aa47a 100644 --- a/arch/x86_64/ia32/ia32_aout.c +++ b/arch/x86_64/ia32/ia32_aout.c @@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = TASK_UNMAPPED_BASE; current->mm->cached_hole_size = 0; - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= 
~PF_FORKNOEXEC; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index dd9baabaf016..72011826f0cb 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; - set_mm_counter(current->mm, rss, 0); current->mm->mmap = NULL; compute_creds(bprm); current->flags &= ~PF_FORKNOEXEC; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4b15576e584..918ccc267e41 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* Do this so that we can load the interpreter, if need be. We will change some of these later */ - set_mm_counter(current->mm, rss, 0); current->mm->free_area_cache = current->mm->mmap_base; current->mm->cached_hole_size = 0; retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 134c9c0d1f54..dda87c4c82a3 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs &interp_params, &current->mm->start_stack, &current->mm->start_brk); -#endif - /* do this so that we can load the interpreter, if need be - * - we will change some of these later - */ - set_mm_counter(current->mm, rss, 0); - -#ifdef CONFIG_MMU retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); if (retval < 0) { send_sig(SIGKILL, current, 0); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 7974efa107bc..9d6625829b99 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm, current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len; -
set_mm_counter(current->mm, rss, 0); } if (flags & FLAT_FLAG_KTRACE) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 227a2682d2bf..00a91dc25d16 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) create_som_tables(bprm); current->mm->start_stack = bprm->p; - set_mm_counter(current->mm, rss, 0); #if 0 printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk); diff --git a/kernel/fork.c b/kernel/fork.c index e2ff11f8c1b0..25caa02e2eac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -198,8 +198,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) mm->free_area_cache = oldmm->mmap_base; mm->cached_hole_size = ~0UL; mm->map_count = 0; - set_mm_counter(mm, rss, 0); - set_mm_counter(mm, anon_rss, 0); cpus_clear(mm->cpu_vm_mask); mm->mm_rb = RB_ROOT; rb_link = &mm->mm_rb.rb_node; @@ -323,6 +321,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; + set_mm_counter(mm, rss, 0); + set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); mm->ioctx_list = NULL; From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH 32/98] [PATCH] mm: rss = file_rss + anon_rss I was lazy when we added anon_rss, and chose to change as few places as possible. So currently each anonymous page has to be counted twice, in rss and in anon_rss. Which won't be so good if those are atomic counts in some configurations. Change that around: keep file_rss and anon_rss separately, and add them together (with get_mm_rss macro) when the total is needed - reading two atomics is much cheaper than updating two atomics. And update anon_rss upfront, typically in memory.c, not tucked away in page_add_anon_rmap. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- fs/proc/array.c | 2 +- fs/proc/task_mmu.c | 8 +++----- include/linux/sched.h | 4 +++- kernel/acct.c | 2 +- kernel/fork.c | 4 ++-- mm/fremap.c | 4 ++-- mm/hugetlb.c | 6 +++--- mm/memory.c | 31 +++++++++++++++++-------------- mm/nommu.c | 2 +- mm/rmap.c | 8 +++----- mm/swapfile.c | 2 +- 12 files changed, 38 insertions(+), 37 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d2208f7c87db..cefadf5ab83b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -330,7 +330,7 @@ void install_arg_page(struct vm_area_struct *vma, pte_unmap(pte); goto out; } - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); diff --git a/fs/proc/array.c b/fs/proc/array.c index d84eecacbeaf..3e1239e4b303 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) jiffies_to_clock_t(it_real_value), start_time, vsize, - mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */ + mm ? get_mm_rss(mm) : 0, rsslim, mm ? mm->start_code : 0, mm ? 
mm->end_code : 0, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994612bc72d0..bccee7cf9ccd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -29,7 +29,7 @@ char *task_mem(struct mm_struct *mm, char *buffer) "VmPTE:\t%8lu kB\n", (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - get_mm_counter(mm, rss) << (PAGE_SHIFT-10), + get_mm_rss(mm) << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); @@ -44,13 +44,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - int rss = get_mm_counter(mm, rss); - - *shared = rss - get_mm_counter(mm, anon_rss); + *shared = get_mm_counter(mm, file_rss); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = rss; + *resident = *shared + get_mm_counter(mm, anon_rss); return mm->total_vm; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 27519df0f987..afcaac66cbd5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- +#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) + typedef unsigned long mm_counter_t; struct mm_struct { @@ -286,7 +288,7 @@ struct mm_struct { unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; /* Special counters protected by the page_table_lock */ - mm_counter_t _rss; + mm_counter_t _file_rss; mm_counter_t _anon_rss; unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..2e3f4a47e7d0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c 
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) if (delta == 0) return; tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; } } diff --git a/kernel/fork.c b/kernel/fork.c index 25caa02e2eac..2048ed7b5872 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -321,7 +321,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm) INIT_LIST_HEAD(&mm->mmlist); mm->core_waiters = 0; mm->nr_ptes = 0; - set_mm_counter(mm, rss, 0); + set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); rwlock_init(&mm->ioctx_list_lock); @@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) if (retval) goto free_pt; - mm->hiwater_rss = get_mm_counter(mm,rss); + mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; good_mm: diff --git a/mm/fremap.c b/mm/fremap.c index ab23a0673c35..fd7f2a17ff3e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -39,7 +39,7 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_page_dirty(page); page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, rss); + dec_mm_counter(mm, file_rss); } } } else { @@ -95,7 +95,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, zap_pte(mm, vma, addr, pte); - inc_mm_counter(mm,rss); + inc_mm_counter(mm, file_rss); flush_icache_page(vma, page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 61d380678030..094455bcbbf7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -286,7 +286,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); + add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE); 
set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -324,7 +324,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, page = pte_page(pte); put_page(page); - add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE)); + add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } flush_tlb_range(vma, start, end); } @@ -386,7 +386,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) goto out; } } - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); + add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); } out: diff --git a/mm/memory.c b/mm/memory.c index 51eb38574830..59d42e50fa53 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -397,9 +397,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - inc_mm_counter(dst_mm, rss); if (PageAnon(page)) inc_mm_counter(dst_mm, anon_rss); + else + inc_mm_counter(dst_mm, file_rss); set_pte_at(dst_mm, addr, dst_pte, pte); page_dup_rmap(page); } @@ -581,8 +582,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, set_page_dirty(page); if (pte_young(ptent)) mark_page_accessed(page); + dec_mm_counter(tlb->mm, file_rss); } - dec_mm_counter(tlb->mm, rss); page_remove_rmap(page); tlb_remove_page(tlb, page); continue; @@ -1290,13 +1291,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) { - if (PageAnon(old_page)) - dec_mm_counter(mm, anon_rss); if (PageReserved(old_page)) - inc_mm_counter(mm, rss); - else + inc_mm_counter(mm, anon_rss); + else { page_remove_rmap(old_page); - + if (!PageAnon(old_page)) { + inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, file_rss); + } + } flush_cache_page(vma, address, pfn); entry = mk_pte(new_page, vma->vm_page_prot); entry = 
maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1701,7 +1704,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* The page isn't present yet, go ahead with the fault. */ - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -1774,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(page); goto unlock; } - inc_mm_counter(mm, rss); + inc_mm_counter(mm, anon_rss); entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); lru_cache_add_active(page); @@ -1887,19 +1890,19 @@ retry: */ /* Only go through if we didn't race with anybody else... */ if (pte_none(*page_table)) { - if (!PageReserved(new_page)) - inc_mm_counter(mm, rss); - flush_icache_page(vma, new_page); entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte_at(mm, address, page_table, entry); if (anon) { + inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else + } else if (!PageReserved(new_page)) { + inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); + } } else { /* One of our sibling threads was faster, back out. 
*/ page_cache_release(new_page); @@ -2192,7 +2195,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); void update_mem_hiwater(struct task_struct *tsk) { if (tsk->mm) { - unsigned long rss = get_mm_counter(tsk->mm, rss); + unsigned long rss = get_mm_rss(tsk->mm); if (tsk->mm->hiwater_rss < rss) tsk->mm->hiwater_rss = rss; diff --git a/mm/nommu.c b/mm/nommu.c index 0ef241ae3763..599924886eb5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1083,7 +1083,7 @@ void update_mem_hiwater(struct task_struct *tsk) unsigned long rss; if (likely(tsk->mm)) { - rss = get_mm_counter(tsk->mm, rss); + rss = get_mm_rss(tsk->mm); if (tsk->mm->hiwater_rss < rss) tsk->mm->hiwater_rss = rss; if (tsk->mm->hiwater_vm < tsk->mm->total_vm) diff --git a/mm/rmap.c b/mm/rmap.c index 1fc559e09ca8..504757624cce 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -445,8 +445,6 @@ void page_add_anon_rmap(struct page *page, { BUG_ON(PageReserved(page)); - inc_mm_counter(vma->vm_mm, anon_rss); - if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; @@ -561,9 +559,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); dec_mm_counter(mm, anon_rss); - } + } else + dec_mm_counter(mm, file_rss); - dec_mm_counter(mm, rss); page_remove_rmap(page); page_cache_release(page); @@ -667,7 +665,7 @@ static void try_to_unmap_cluster(unsigned long cursor, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, rss); + dec_mm_counter(mm, file_rss); (*mapcount)--; } diff --git a/mm/swapfile.c b/mm/swapfile.c index 05c851291241..296e0bbf7836 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -407,7 +407,7 @@ void free_swap_and_cache(swp_entry_t entry) static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, swp_entry_t entry, struct page *page) { - inc_mm_counter(vma->vm_mm, rss); + inc_mm_counter(vma->vm_mm, anon_rss); get_page(page); set_pte_at(vma->vm_mm, addr, pte, 
pte_mkold(mk_pte(page, vma->vm_page_prot))); From ae859762332f19bfc06f4c4a1b1fefb41e9e1084 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:05 -0700 Subject: [PATCH 33/98] [PATCH] mm: batch updating mm_counters tlb_finish_mmu used to batch zap_pte_range's update of mm rss, which may be worthwhile if the mm is contended, and would reduce atomic operations if the counts were atomic. Let zap_pte_range now batch its updates to file_rss and anon_rss, per page-table in case we drop the lock outside; and copy_pte_range batch them too. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 59d42e50fa53..da642b5528fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -332,6 +332,16 @@ out: return pte_offset_kernel(pmd, address); } +static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +{ + if (file_rss) + add_mm_counter(mm, file_rss, file_rss); + if (anon_rss) + add_mm_counter(mm, anon_rss, anon_rss); +} + +#define NO_RSS 2 /* Increment neither file_rss nor anon_rss */ + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range @@ -341,7 +351,7 @@ out: * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ -static inline void +static inline int copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, unsigned long addr) @@ -349,6 +359,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t pte = *src_pte; struct page *page; unsigned long pfn; + int anon = NO_RSS; /* pte contains position in swap or file, so copy. 
*/ if (unlikely(!pte_present(pte))) { @@ -361,8 +372,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } pfn = pte_pfn(pte); @@ -375,10 +385,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pfn_valid(pfn)) page = pfn_to_page(pfn); - if (!page || PageReserved(page)) { - set_pte_at(dst_mm, addr, dst_pte, pte); - return; - } + if (!page || PageReserved(page)) + goto out_set_pte; /* * If it's a COW mapping, write protect it both @@ -397,12 +405,12 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); get_page(page); - if (PageAnon(page)) - inc_mm_counter(dst_mm, anon_rss); - else - inc_mm_counter(dst_mm, file_rss); - set_pte_at(dst_mm, addr, dst_pte, pte); page_dup_rmap(page); + anon = !!PageAnon(page); + +out_set_pte: + set_pte_at(dst_mm, addr, dst_pte, pte); + return anon; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -412,8 +420,10 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; unsigned long vm_flags = vma->vm_flags; int progress = 0; + int rss[NO_RSS+1], anon; again: + rss[1] = rss[0] = 0; dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); if (!dst_pte) return -ENOMEM; @@ -436,13 +446,16 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); + anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vm_flags, addr); + rss[anon]++; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); spin_unlock(&src_mm->page_table_lock); pte_unmap_nested(src_pte - 1); pte_unmap(dst_pte - 1); + add_mm_rss(dst_mm, rss[0], rss[1]); cond_resched_lock(&dst_mm->page_table_lock); if (addr != end) goto again; @@ -533,6 +546,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, struct zap_details *details) { 
pte_t *pte; + int file_rss = 0; + int anon_rss = 0; pte = pte_offset_map(pmd, addr); do { @@ -576,13 +591,13 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, set_pte_at(tlb->mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - dec_mm_counter(tlb->mm, anon_rss); + anon_rss++; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent)) mark_page_accessed(page); - dec_mm_counter(tlb->mm, file_rss); + file_rss++; } page_remove_rmap(page); tlb_remove_page(tlb, page); @@ -598,6 +613,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, free_swap_and_cache(pte_to_swp_entry(ptent)); pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); + + add_mm_rss(tlb->mm, -file_rss, -anon_rss); pte_unmap(pte - 1); } From fd3e42fcc888a773572282575d2fdbf5cfd6216e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:06 -0700 Subject: [PATCH 34/98] [PATCH] mm: dup_mmap use oldmm more Use the parent's oldmm throughout dup_mmap, instead of perversely going back to current->mm. (Can you hear the sigh of relief from those mpnts? Usually I squash them, but not today.) 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 2048ed7b5872..0e7fe4a8a8df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -182,16 +182,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) } #ifdef CONFIG_MMU -static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) +static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct * mpnt, *tmp, **pprev; + struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; struct mempolicy *pol; down_write(&oldmm->mmap_sem); - flush_cache_mm(current->mm); + flush_cache_mm(oldmm); mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -204,7 +204,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = NULL; pprev = &mm->mmap; - for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -265,7 +265,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, current->mm, tmp); + retval = copy_page_range(mm, oldmm, tmp); spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) @@ -277,7 +277,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) retval = 0; out: - flush_tlb_mm(current->mm); + flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: From 7ee78232501ea9de2b6c8f10d32c9a0fee541357 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:08 -0700 Subject: [PATCH 35/98] [PATCH] mm: dup_mmap down new mmap_sem One anomaly remains from when Andrea rationalized the responsibilities of 
mmap_sem and page_table_lock: in dup_mmap we add vmas to the child holding its page_table_lock, but not the mmap_sem which normally guards the vma list and rbtree. Which could be an issue for unuse_mm: though since it just walks down the list (today with page_table_lock, tomorrow not), it's probably okay. Will need a memory barrier? Oh, keep it simple, Nick and I agreed, no harm in taking child's mmap_sem here. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 0e7fe4a8a8df..2a587b3224e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) down_write(&oldmm->mmap_sem); flush_cache_mm(oldmm); + down_write(&mm->mmap_sem); + mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_cache = NULL; @@ -251,10 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) } /* - * Link in the new vma and copy the page table entries: - * link in first so that swapoff can see swap entries. - * Note that, exceptionally, here the vma is inserted - * without holding mm->mmap_sem. + * Link in the new vma and copy the page table entries. */ spin_lock(&mm->page_table_lock); *pprev = tmp; @@ -275,8 +274,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto out; } retval = 0; - out: + up_write(&mm->mmap_sem); flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; From 147efea8ebb034b48aee806caae1da9a2ee41b38 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:09 -0700 Subject: [PATCH 36/98] [PATCH] mm: sh64 hugetlbpage.c The sh64 hugetlbpage.c seems to be erroneous, left over from a bygone age, clashing with the common hugetlb.c. Replace it by a copy of the sh hugetlbpage.c. Except, delete that mk_pte_huge macro neither uses. 
Signed-off-by: Hugh Dickins Acked-by: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/hugetlbpage.c | 2 - arch/sh64/mm/hugetlbpage.c | 188 +++---------------------------------- 2 files changed, 12 insertions(+), 178 deletions(-) diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 95bb1a6c6060..6b7a7688c98e 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -54,8 +54,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index dcd9c8a8baf8..ed6a505b3ee2 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -54,41 +54,31 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) - -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, pte_t * page_table, int write_access) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry) { - unsigned long i; - pte_t entry; - - add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); - - if (write_access) - entry = pte_mkwrite(pte_mkdirty(mk_pte(page, - vma->vm_page_prot))); - else - entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); - entry = pte_mkyoung(entry); - mk_pte_huge(entry); + int i; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(page_table, entry); - page_table++; - + set_pte_at(mm, addr, ptep, entry); + ptep++; + addr += PAGE_SIZE; pte_val(entry) += PAGE_SIZE; } } -pte_t huge_ptep_get_and_clear(pte_t *ptep) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { pte_t entry; + int i; entry = *ptep; for (i = 0; i < (1 << 
HUGETLB_PAGE_ORDER); i++) { - pte_clear(pte); - pte++; + pte_clear(mm, addr, ptep); + addr += PAGE_SIZE; + ptep++; } return entry; @@ -106,79 +96,6 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return 0; } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) -{ - pte_t *src_pte, *dst_pte, entry; - struct page *ptepage; - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - int i; - - while (addr < end) { - dst_pte = huge_pte_alloc(dst, addr); - if (!dst_pte) - goto nomem; - src_pte = huge_pte_offset(src, addr); - BUG_ON(!src_pte || pte_none(*src_pte)); - entry = *src_pte; - ptepage = pte_page(entry); - get_page(ptepage); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - set_pte(dst_pte, entry); - pte_val(entry) += PAGE_SIZE; - dst_pte++; - } - add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); - addr += HPAGE_SIZE; - } - return 0; - -nomem: - return -ENOMEM; -} - -int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, struct vm_area_struct **vmas, - unsigned long *position, int *length, int i) -{ - unsigned long vaddr = *position; - int remainder = *length; - - WARN_ON(!is_vm_hugetlb_page(vma)); - - while (vaddr < vma->vm_end && remainder) { - if (pages) { - pte_t *pte; - struct page *page; - - pte = huge_pte_offset(mm, vaddr); - - /* hugetlb should be locked, and hence, prefaulted */ - BUG_ON(!pte || pte_none(*pte)); - - page = pte_page(*pte); - - WARN_ON(!PageCompound(page)); - - get_page(page); - pages[i] = page; - } - - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - --remainder; - ++i; - } - - *length = remainder; - *position = vaddr; - - return i; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -195,84 +112,3 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -void unmap_hugepage_range(struct vm_area_struct *vma, - 
unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - unsigned long address; - pte_t *pte; - struct page *page; - int i; - - BUG_ON(start & (HPAGE_SIZE - 1)); - BUG_ON(end & (HPAGE_SIZE - 1)); - - for (address = start; address < end; address += HPAGE_SIZE) { - pte = huge_pte_offset(mm, address); - BUG_ON(!pte); - if (pte_none(*pte)) - continue; - page = pte_page(*pte); - put_page(page); - for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { - pte_clear(mm, address+(i*PAGE_SIZE), pte); - pte++; - } - } - add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); - flush_tlb_range(vma, start, end); -} - -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); - - spin_lock(&mm->page_table_lock); - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; - - if (!pte) { - ret = -ENOMEM; - goto out; - } - if (!pte_none(*pte)) - continue; - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! 
ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); - } -out: - spin_unlock(&mm->page_table_lock); - return ret; -} From f9c98d0287de42221c624482fd4f8d485c98ab22 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:10 -0700 Subject: [PATCH 37/98] [PATCH] mm: m68k kill stram swap Please, please now delete the Atari CONFIG_STRAM_SWAP code. It may be excellent and ingenious code, but its reference to swap_vfsmnt betrays that it hasn't been built since 2.5.1 (four years old come December), it's delving deep into matters which are the preserve of core mm code, its only purpose is to give the more conscientious mm guys an anxiety attack from time to time; yet we keep on breaking it more and more. If you want to use RAM for swap, then if the MTD driver does not already provide just what you need, I'm sure David could be persuaded to add the extra. But you'd also like to be able to allocate extents of that swap for other use: we can give you a core interface for that if you need. But unbuilt for four years suggests to me that there's no need at all. I cannot swear the patch below won't break your build, but believe so. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 - Documentation/m68k/kernel-options.txt | 24 +- arch/m68k/Kconfig | 24 +- arch/m68k/atari/stram.c | 918 +------------------------- 4 files changed, 17 insertions(+), 951 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 90766b75d1b7..5dffcfefc3c7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1460,8 +1460,6 @@ running once the system is up. 
stifb= [HW] Format: bpp:[:[:...]] - stram_swap= [HW,M68k] - swiotlb= [IA-64] Number of I/O TLB slabs switches= [HW,M68k] diff --git a/Documentation/m68k/kernel-options.txt b/Documentation/m68k/kernel-options.txt index e191baad8308..d5d3f064f552 100644 --- a/Documentation/m68k/kernel-options.txt +++ b/Documentation/m68k/kernel-options.txt @@ -626,7 +626,7 @@ ignored (others aren't affected). can be performed in optimal order. Not all SCSI devices support tagged queuing (:-(). -4.6 switches= +4.5 switches= ------------- Syntax: switches= @@ -661,28 +661,6 @@ correctly. earlier initialization ("ov_"-less) takes precedence. But the switching-off on reset still happens in this case. -4.5) stram_swap= ----------------- - -Syntax: stram_swap=[,] - - This option is available only if the kernel has been compiled with -CONFIG_STRAM_SWAP enabled. Normally, the kernel then determines -dynamically whether to actually use ST-RAM as swap space. (Currently, -the fraction of ST-RAM must be less or equal 1/3 of total memory to -enable this swapping.) You can override the kernel's decision by -specifying this option. 1 for means always enable the swap, -even if you have less alternate RAM. 0 stands for never swap to -ST-RAM, even if it's small enough compared to the rest of memory. - - If ST-RAM swapping is enabled, the kernel usually uses all free -ST-RAM as swap "device". If the kernel resides in ST-RAM, the region -allocated by it is obviously never used for swapping :-) You can also -limit this amount by specifying the second parameter, , if -you want to use parts of ST-RAM as normal system memory. is -in kBytes and the number should be a multiple of 4 (otherwise: rounded -down). 
- 5) Options for Amiga Only: ========================== diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index ba960bbc8e6d..1dd5d18b2201 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -388,33 +388,11 @@ config AMIGA_PCMCIA Include support in the kernel for pcmcia on Amiga 1200 and Amiga 600. If you intend to use pcmcia cards say Y; otherwise say N. -config STRAM_SWAP - bool "Support for ST-RAM as swap space" - depends on ATARI && BROKEN - ---help--- - Some Atari 68k machines (including the 520STF and 1020STE) divide - their addressable memory into ST and TT sections. The TT section - (up to 512MB) is the main memory; the ST section (up to 4MB) is - accessible to the built-in graphics board, runs slower, and is - present mainly for backward compatibility with older machines. - - This enables support for using (parts of) ST-RAM as swap space, - instead of as normal system memory. This can first enhance system - performance if you have lots of alternate RAM (compared to the size - of ST-RAM), because executable code always will reside in faster - memory. ST-RAM will remain as ultra-fast swap space. On the other - hand, it allows much improved dynamic allocations of ST-RAM buffers - for device driver modules (e.g. floppy, ACSI, SLM printer, DMA - sound). The probability that such allocations at module load time - fail is drastically reduced. - config STRAM_PROC bool "ST-RAM statistics in /proc" depends on ATARI help - Say Y here to report ST-RAM usage statistics in /proc/stram. See - the help for CONFIG_STRAM_SWAP for discussion of ST-RAM and its - uses. + Say Y here to report ST-RAM usage statistics in /proc/stram. 
config HEARTBEAT bool "Use power LED as a heartbeat" if AMIGA || APOLLO || ATARI || MAC ||Q40 diff --git a/arch/m68k/atari/stram.c b/arch/m68k/atari/stram.c index 5a3c106b40c8..22e0481a5f7b 100644 --- a/arch/m68k/atari/stram.c +++ b/arch/m68k/atari/stram.c @@ -15,11 +15,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -33,8 +31,6 @@ #include #include -#include - #undef DEBUG #ifdef DEBUG @@ -49,8 +45,7 @@ #include #endif -/* Pre-swapping comments: - * +/* * ++roman: * * New version of ST-Ram buffer allocation. Instead of using the @@ -75,76 +70,6 @@ * */ -/* - * New Nov 1997: Use ST-RAM as swap space! - * - * In the past, there were often problems with modules that require ST-RAM - * buffers. Such drivers have to use __get_dma_pages(), which unfortunately - * often isn't very successful in allocating more than 1 page :-( [1] The net - * result was that most of the time you couldn't insmod such modules (ataflop, - * ACSI, SCSI on Falcon, Atari internal framebuffer, not to speak of acsi_slm, - * which needs a 1 MB buffer... :-). - * - * To overcome this limitation, ST-RAM can now be turned into a very - * high-speed swap space. If a request for an ST-RAM buffer comes, the kernel - * now tries to unswap some pages on that swap device to make some free (and - * contiguous) space. This works much better in comparison to - * __get_dma_pages(), since used swap pages can be selectively freed by either - * moving them to somewhere else in swap space, or by reading them back into - * system memory. Ok, there operation of unswapping isn't really cheap (for - * each page, one has to go through the page tables of all processes), but it - * doesn't happen that often (only when allocation ST-RAM, i.e. when loading a - * module that needs ST-RAM). But it at least makes it possible to load such - * modules! 
- * - * It could also be that overall system performance increases a bit due to - * ST-RAM swapping, since slow ST-RAM isn't used anymore for holding data or - * executing code in. It's then just a (very fast, compared to disk) back - * storage for not-so-often needed data. (But this effect must be compared - * with the loss of total memory...) Don't know if the effect is already - * visible on a TT, where the speed difference between ST- and TT-RAM isn't - * that dramatic, but it should on machines where TT-RAM is really much faster - * (e.g. Afterburner). - * - * [1]: __get_free_pages() does a fine job if you only want one page, but if - * you want more (contiguous) pages, it can give you such a block only if - * there's already a free one. The algorithm can't try to free buffers or swap - * out something in order to make more free space, since all that page-freeing - * mechanisms work "target-less", i.e. they just free something, but not in a - * specific place. I.e., __get_free_pages() can't do anything to free - * *adjacent* pages :-( This situation becomes even worse for DMA memory, - * since the freeing algorithms are also blind to DMA capability of pages. - */ - -/* 1998-10-20: ++andreas - unswap_by_move disabled because it does not handle swapped shm pages. -*/ - -/* 2000-05-01: ++andreas - Integrated with bootmem. Remove all traces of unswap_by_move. -*/ - -#ifdef CONFIG_STRAM_SWAP -#define ALIGN_IF_SWAP(x) PAGE_ALIGN(x) -#else -#define ALIGN_IF_SWAP(x) (x) -#endif - -/* get index of swap page at address 'addr' */ -#define SWAP_NR(addr) (((addr) - swap_start) >> PAGE_SHIFT) - -/* get address of swap page #'nr' */ -#define SWAP_ADDR(nr) (swap_start + ((nr) << PAGE_SHIFT)) - -/* get number of pages for 'n' bytes (already page-aligned) */ -#define N_PAGES(n) ((n) >> PAGE_SHIFT) - -/* The following two numbers define the maximum fraction of ST-RAM in total - * memory, below that the kernel would automatically use ST-RAM as swap - * space. 
This decision can be overridden with stram_swap= */ -#define MAX_STRAM_FRACTION_NOM 1 -#define MAX_STRAM_FRACTION_DENOM 3 - /* Start and end (virtual) of ST-RAM */ static void *stram_start, *stram_end; @@ -164,10 +89,9 @@ typedef struct stram_block { } BLOCK; /* values for flags field */ -#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ +#define BLOCK_FREE 0x01 /* free structure in the BLOCKs pool */ #define BLOCK_KMALLOCED 0x02 /* structure allocated by kmalloc() */ -#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ -#define BLOCK_INSWAP 0x10 /* block allocated in swap space */ +#define BLOCK_GFP 0x08 /* block allocated with __get_dma_pages() */ /* list of allocated blocks */ static BLOCK *alloc_list; @@ -179,60 +103,8 @@ static BLOCK *alloc_list; #define N_STATIC_BLOCKS 20 static BLOCK static_blocks[N_STATIC_BLOCKS]; -#ifdef CONFIG_STRAM_SWAP -/* max. number of bytes to use for swapping - * 0 = no ST-RAM swapping - * -1 = do swapping (to whole ST-RAM) if it's less than MAX_STRAM_FRACTION of - * total memory - */ -static int max_swap_size = -1; - -/* start and end of swapping area */ -static void *swap_start, *swap_end; - -/* The ST-RAM's swap info structure */ -static struct swap_info_struct *stram_swap_info; - -/* The ST-RAM's swap type */ -static int stram_swap_type; - -/* Semaphore for get_stram_region. */ -static DECLARE_MUTEX(stram_swap_sem); - -/* major and minor device number of the ST-RAM device; for the major, we use - * the same as Amiga z2ram, which is really similar and impossible on Atari, - * and for the minor a relatively odd number to avoid the user creating and - * using that device. 
*/ -#define STRAM_MAJOR Z2RAM_MAJOR -#define STRAM_MINOR 13 - -/* Some impossible pointer value */ -#define MAGIC_FILE_P (struct file *)0xffffdead - -#ifdef DO_PROC -static unsigned stat_swap_read; -static unsigned stat_swap_write; -static unsigned stat_swap_force; -#endif /* DO_PROC */ - -#endif /* CONFIG_STRAM_SWAP */ - /***************************** Prototypes *****************************/ -#ifdef CONFIG_STRAM_SWAP -static int swap_init(void *start_mem, void *swap_data); -static void *get_stram_region( unsigned long n_pages ); -static void free_stram_region( unsigned long offset, unsigned long n_pages - ); -static int in_some_region(void *addr); -static unsigned long find_free_region( unsigned long n_pages, unsigned long - *total_free, unsigned long - *region_free ); -static void do_stram_request(request_queue_t *); -static int stram_open( struct inode *inode, struct file *filp ); -static int stram_release( struct inode *inode, struct file *filp ); -static void reserve_region(void *start, void *end); -#endif static BLOCK *add_region( void *addr, unsigned long size ); static BLOCK *find_region( void *addr ); static int remove_region( BLOCK *block ); @@ -279,84 +151,11 @@ void __init atari_stram_init(void) */ void __init atari_stram_reserve_pages(void *start_mem) { -#ifdef CONFIG_STRAM_SWAP - /* if max_swap_size is negative (i.e. no stram_swap= option given), - * determine at run time whether to use ST-RAM swapping */ - if (max_swap_size < 0) - /* Use swapping if ST-RAM doesn't make up more than MAX_STRAM_FRACTION - * of total memory. In that case, the max. size is set to 16 MB, - * because ST-RAM can never be bigger than that. - * Also, never use swapping on a Hades, there's no separate ST-RAM in - * that machine. */ - max_swap_size = - (!MACH_IS_HADES && - (N_PAGES(stram_end-stram_start)*MAX_STRAM_FRACTION_DENOM <= - ((unsigned long)high_memory>>PAGE_SHIFT)*MAX_STRAM_FRACTION_NOM)) ? 
16*1024*1024 : 0; - DPRINTK( "atari_stram_reserve_pages: max_swap_size = %d\n", max_swap_size ); -#endif - /* always reserve first page of ST-RAM, the first 2 kB are * supervisor-only! */ if (!kernel_in_stram) reserve_bootmem (0, PAGE_SIZE); -#ifdef CONFIG_STRAM_SWAP - { - void *swap_data; - - start_mem = (void *) PAGE_ALIGN ((unsigned long) start_mem); - /* determine first page to use as swap: if the kernel is - in TT-RAM, this is the first page of (usable) ST-RAM; - otherwise just use the end of kernel data (= start_mem) */ - swap_start = !kernel_in_stram ? stram_start + PAGE_SIZE : start_mem; - /* decrement by one page, rest of kernel assumes that first swap page - * is always reserved and maybe doesn't handle swp_entry == 0 - * correctly */ - swap_start -= PAGE_SIZE; - swap_end = stram_end; - if (swap_end-swap_start > max_swap_size) - swap_end = swap_start + max_swap_size; - DPRINTK( "atari_stram_reserve_pages: swapping enabled; " - "swap=%p-%p\n", swap_start, swap_end); - - /* reserve some amount of memory for maintainance of - * swapping itself: one page for each 2048 (PAGE_SIZE/2) - * swap pages. (2 bytes for each page) */ - swap_data = start_mem; - start_mem += ((SWAP_NR(swap_end) + PAGE_SIZE/2 - 1) - >> (PAGE_SHIFT-1)) << PAGE_SHIFT; - /* correct swap_start if necessary */ - if (swap_start + PAGE_SIZE == swap_data) - swap_start = start_mem - PAGE_SIZE; - - if (!swap_init( start_mem, swap_data )) { - printk( KERN_ERR "ST-RAM swap space initialization failed\n" ); - max_swap_size = 0; - return; - } - /* reserve region for swapping meta-data */ - reserve_region(swap_data, start_mem); - /* reserve swapping area itself */ - reserve_region(swap_start + PAGE_SIZE, swap_end); - - /* - * If the whole ST-RAM is used for swapping, there are no allocatable - * dma pages left. 
But unfortunately, some shared parts of the kernel - * (particularly the SCSI mid-level) call __get_dma_pages() - * unconditionally :-( These calls then fail, and scsi.c even doesn't - * check for NULL return values and just crashes. The quick fix for - * this (instead of doing much clean up work in the SCSI code) is to - * pretend all pages are DMA-able by setting mach_max_dma_address to - * ULONG_MAX. This doesn't change any functionality so far, since - * get_dma_pages() shouldn't be used on Atari anyway anymore (better - * use atari_stram_alloc()), and the Atari SCSI drivers don't need DMA - * memory. But unfortunately there's now no kind of warning (even not - * a NULL return value) if you use get_dma_pages() nevertheless :-( - * You just will get non-DMA-able memory... - */ - mach_max_dma_address = 0xffffffff; - } -#endif } void atari_stram_mem_init_hook (void) @@ -367,7 +166,6 @@ void atari_stram_mem_init_hook (void) /* * This is main public interface: somehow allocate a ST-RAM block - * There are three strategies: * * - If we're before mem_init(), we have to make a static allocation. The * region is taken in the kernel data area (if the kernel is in ST-RAM) or @@ -375,14 +173,9 @@ void atari_stram_mem_init_hook (void) * rsvd_stram_* region. The ST-RAM is somewhere in the middle of kernel * address space in the latter case. * - * - If mem_init() already has been called and ST-RAM swapping is enabled, - * try to get the memory from the (pseudo) swap-space, either free already - * or by moving some other pages out of the swap. - * - * - If mem_init() already has been called, and ST-RAM swapping is not - * enabled, the only possibility is to try with __get_dma_pages(). This has - * the disadvantage that it's very hard to get more than 1 page, and it is - * likely to fail :-( + * - If mem_init() already has been called, try with __get_dma_pages(). 
+ * This has the disadvantage that it's very hard to get more than 1 page, + * and it is likely to fail :-( * */ void *atari_stram_alloc(long size, const char *owner) @@ -393,27 +186,13 @@ void *atari_stram_alloc(long size, const char *owner) DPRINTK("atari_stram_alloc(size=%08lx,owner=%s)\n", size, owner); - size = ALIGN_IF_SWAP(size); - DPRINTK( "atari_stram_alloc: rounded size = %08lx\n", size ); -#ifdef CONFIG_STRAM_SWAP - if (max_swap_size) { - /* If swapping is active: make some free space in the swap - "device". */ - DPRINTK( "atari_stram_alloc: after mem_init, swapping ok, " - "calling get_region\n" ); - addr = get_stram_region( N_PAGES(size) ); - flags = BLOCK_INSWAP; - } - else -#endif if (!mem_init_done) return alloc_bootmem_low(size); else { - /* After mem_init() and no swapping: can only resort to - * __get_dma_pages() */ + /* After mem_init(): can only resort to __get_dma_pages() */ addr = (void *)__get_dma_pages(GFP_KERNEL, get_order(size)); flags = BLOCK_GFP; - DPRINTK( "atari_stram_alloc: after mem_init, swapping off, " + DPRINTK( "atari_stram_alloc: after mem_init, " "get_pages=%p\n", addr ); } @@ -422,12 +201,7 @@ void *atari_stram_alloc(long size, const char *owner) /* out of memory for BLOCK structure :-( */ DPRINTK( "atari_stram_alloc: out of mem for BLOCK -- " "freeing again\n" ); -#ifdef CONFIG_STRAM_SWAP - if (flags == BLOCK_INSWAP) - free_stram_region( SWAP_NR(addr), N_PAGES(size) ); - else -#endif - free_pages((unsigned long)addr, get_order(size)); + free_pages((unsigned long)addr, get_order(size)); return( NULL ); } block->owner = owner; @@ -451,25 +225,12 @@ void atari_stram_free( void *addr ) DPRINTK( "atari_stram_free: found block (%p): size=%08lx, owner=%s, " "flags=%02x\n", block, block->size, block->owner, block->flags ); -#ifdef CONFIG_STRAM_SWAP - if (!max_swap_size) { -#endif - if (block->flags & BLOCK_GFP) { - DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", - get_order(block->size)); - free_pages((unsigned long)addr, 
get_order(block->size)); - } - else - goto fail; -#ifdef CONFIG_STRAM_SWAP - } - else if (block->flags & BLOCK_INSWAP) { - DPRINTK( "atari_stram_free: is swap-alloced\n" ); - free_stram_region( SWAP_NR(block->start), N_PAGES(block->size) ); - } - else + if (!(block->flags & BLOCK_GFP)) goto fail; -#endif + + DPRINTK("atari_stram_free: is kmalloced, order_size=%d\n", + get_order(block->size)); + free_pages((unsigned long)addr, get_order(block->size)); remove_region( block ); return; @@ -478,612 +239,6 @@ void atari_stram_free( void *addr ) "(called from %p)\n", addr, __builtin_return_address(0) ); } - -#ifdef CONFIG_STRAM_SWAP - - -/* ------------------------------------------------------------------------ */ -/* Main Swapping Functions */ -/* ------------------------------------------------------------------------ */ - - -/* - * Initialize ST-RAM swap device - * (lots copied and modified from sys_swapon() in mm/swapfile.c) - */ -static int __init swap_init(void *start_mem, void *swap_data) -{ - static struct dentry fake_dentry; - static struct vfsmount fake_vfsmnt; - struct swap_info_struct *p; - struct inode swap_inode; - unsigned int type; - void *addr; - int i, j, k, prev; - - DPRINTK("swap_init(start_mem=%p, swap_data=%p)\n", - start_mem, swap_data); - - /* need at least one page for swapping to (and this also isn't very - * much... :-) */ - if (swap_end - swap_start < 2*PAGE_SIZE) { - printk( KERN_WARNING "stram_swap_init: swap space too small\n" ); - return( 0 ); - } - - /* find free slot in swap_info */ - for( p = swap_info, type = 0; type < nr_swapfiles; type++, p++ ) - if (!(p->flags & SWP_USED)) - break; - if (type >= MAX_SWAPFILES) { - printk( KERN_WARNING "stram_swap_init: max. 
number of " - "swap devices exhausted\n" ); - return( 0 ); - } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - - stram_swap_info = p; - stram_swap_type = type; - - /* fake some dir cache entries to give us some name in /dev/swaps */ - fake_dentry.d_parent = &fake_dentry; - fake_dentry.d_name.name = "stram (internal)"; - fake_dentry.d_name.len = 16; - fake_vfsmnt.mnt_parent = &fake_vfsmnt; - - p->flags = SWP_USED; - p->swap_file = &fake_dentry; - p->swap_vfsmnt = &fake_vfsmnt; - p->swap_map = swap_data; - p->cluster_nr = 0; - p->next = -1; - p->prio = 0x7ff0; /* a rather high priority, but not the higest - * to give the user a chance to override */ - - /* call stram_open() directly, avoids at least the overhead in - * constructing a dummy file structure... */ - swap_inode.i_rdev = MKDEV( STRAM_MAJOR, STRAM_MINOR ); - stram_open( &swap_inode, MAGIC_FILE_P ); - p->max = SWAP_NR(swap_end); - - /* initialize swap_map: set regions that are already allocated or belong - * to kernel data space to SWAP_MAP_BAD, otherwise to free */ - j = 0; /* # of free pages */ - k = 0; /* # of already allocated pages (from pre-mem_init stram_alloc()) */ - p->lowest_bit = 0; - p->highest_bit = 0; - for( i = 1, addr = SWAP_ADDR(1); i < p->max; - i++, addr += PAGE_SIZE ) { - if (in_some_region( addr )) { - p->swap_map[i] = SWAP_MAP_BAD; - ++k; - } - else if (kernel_in_stram && addr < start_mem ) { - p->swap_map[i] = SWAP_MAP_BAD; - } - else { - p->swap_map[i] = 0; - ++j; - if (!p->lowest_bit) p->lowest_bit = i; - p->highest_bit = i; - } - } - /* first page always reserved (and doesn't really belong to swap space) */ - p->swap_map[0] = SWAP_MAP_BAD; - - /* now swapping to this device ok */ - p->pages = j + k; - swap_list_lock(); - nr_swap_pages += j; - p->flags = SWP_WRITEOK; - - /* insert swap space into swap_list */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { - break; - } - prev = i; - } - p->next = i; - if (prev < 0) 
{ - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } - swap_list_unlock(); - - printk( KERN_INFO "Using %dk (%d pages) of ST-RAM as swap space.\n", - p->pages << 2, p->pages ); - return( 1 ); -} - - -/* - * The swap entry has been read in advance, and we return 1 to indicate - * that the page has been used or is no longer needed. - * - * Always set the resulting pte to be nowrite (the same as COW pages - * after one process has exited). We don't know just how many PTEs will - * share this swap entry, so be cautious and let do_wp_page work out - * what to do if a write is requested later. - */ -static inline void unswap_pte(struct vm_area_struct * vma, unsigned long - address, pte_t *dir, swp_entry_t entry, - struct page *page) -{ - pte_t pte = *dir; - - if (pte_none(pte)) - return; - if (pte_present(pte)) { - /* If this entry is swap-cached, then page must already - hold the right address for any copies in physical - memory */ - if (pte_page(pte) != page) - return; - /* We will be removing the swap cache in a moment, so... 
*/ - set_pte(dir, pte_mkdirty(pte)); - return; - } - if (pte_val(pte) != entry.val) - return; - - DPRINTK("unswap_pte: replacing entry %08lx by new page %p", - entry.val, page); - set_pte(dir, pte_mkdirty(mk_pte(page, vma->vm_page_prot))); - swap_free(entry); - get_page(page); - inc_mm_counter(vma->vm_mm, rss); -} - -static inline void unswap_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long size, - unsigned long offset, swp_entry_t entry, - struct page *page) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - pte = pte_offset_kernel(dir, address); - offset += address & PMD_MASK; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - unswap_pte(vma, offset+address-vma->vm_start, pte, entry, page); - address += PAGE_SIZE; - pte++; - } while (address < end); -} - -static inline void unswap_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long size, - swp_entry_t entry, struct page *page) -{ - pmd_t * pmd; - unsigned long offset, end; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, address); - offset = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - unswap_pmd(vma, pmd, address, end - address, offset, entry, - page); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -static void unswap_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page *page) -{ - unsigned long start = vma->vm_start, end = vma->vm_end; - - do { - unswap_pgd(vma, pgdir, start, end - start, entry, page); - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (start < end); -} - -static void unswap_process(struct mm_struct * mm, swp_entry_t entry, - struct page *page) -{ 
- struct vm_area_struct* vma; - - /* - * Go through process' page directory. - */ - if (!mm) - return; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - pgd_t * pgd = pgd_offset(mm, vma->vm_start); - unswap_vma(vma, pgd, entry, page); - } -} - - -static int unswap_by_read(unsigned short *map, unsigned long max, - unsigned long start, unsigned long n_pages) -{ - struct task_struct *p; - struct page *page; - swp_entry_t entry; - unsigned long i; - - DPRINTK( "unswapping %lu..%lu by reading in\n", - start, start+n_pages-1 ); - - for( i = start; i < start+n_pages; ++i ) { - if (map[i] == SWAP_MAP_BAD) { - printk( KERN_ERR "get_stram_region: page %lu already " - "reserved??\n", i ); - continue; - } - - if (map[i]) { - entry = swp_entry(stram_swap_type, i); - DPRINTK("unswap: map[i=%lu]=%u nr_swap=%ld\n", - i, map[i], nr_swap_pages); - - swap_device_lock(stram_swap_info); - map[i]++; - swap_device_unlock(stram_swap_info); - /* Get a page for the entry, using the existing - swap cache page if there is one. Otherwise, - get a clean page and read the swap into it. */ - page = read_swap_cache_async(entry, NULL, 0); - if (!page) { - swap_free(entry); - return -ENOMEM; - } - read_lock(&tasklist_lock); - for_each_process(p) - unswap_process(p->mm, entry, page); - read_unlock(&tasklist_lock); - shmem_unuse(entry, page); - /* Now get rid of the extra reference to the - temporary page we've been using. 
*/ - if (PageSwapCache(page)) - delete_from_swap_cache(page); - __free_page(page); - #ifdef DO_PROC - stat_swap_force++; - #endif - } - - DPRINTK( "unswap: map[i=%lu]=%u nr_swap=%ld\n", - i, map[i], nr_swap_pages ); - swap_list_lock(); - swap_device_lock(stram_swap_info); - map[i] = SWAP_MAP_BAD; - if (stram_swap_info->lowest_bit == i) - stram_swap_info->lowest_bit++; - if (stram_swap_info->highest_bit == i) - stram_swap_info->highest_bit--; - --nr_swap_pages; - swap_device_unlock(stram_swap_info); - swap_list_unlock(); - } - - return 0; -} - -/* - * reserve a region in ST-RAM swap space for an allocation - */ -static void *get_stram_region( unsigned long n_pages ) -{ - unsigned short *map = stram_swap_info->swap_map; - unsigned long max = stram_swap_info->max; - unsigned long start, total_free, region_free; - int err; - void *ret = NULL; - - DPRINTK( "get_stram_region(n_pages=%lu)\n", n_pages ); - - down(&stram_swap_sem); - - /* disallow writing to the swap device now */ - stram_swap_info->flags = SWP_USED; - - /* find a region of n_pages pages in the swap space including as much free - * pages as possible (and excluding any already-reserved pages). 
*/ - if (!(start = find_free_region( n_pages, &total_free, ®ion_free ))) - goto end; - DPRINTK( "get_stram_region: region starts at %lu, has %lu free pages\n", - start, region_free ); - - err = unswap_by_read(map, max, start, n_pages); - if (err) - goto end; - - ret = SWAP_ADDR(start); - end: - /* allow using swap device again */ - stram_swap_info->flags = SWP_WRITEOK; - up(&stram_swap_sem); - DPRINTK( "get_stram_region: returning %p\n", ret ); - return( ret ); -} - - -/* - * free a reserved region in ST-RAM swap space - */ -static void free_stram_region( unsigned long offset, unsigned long n_pages ) -{ - unsigned short *map = stram_swap_info->swap_map; - - DPRINTK( "free_stram_region(offset=%lu,n_pages=%lu)\n", offset, n_pages ); - - if (offset < 1 || offset + n_pages > stram_swap_info->max) { - printk( KERN_ERR "free_stram_region: Trying to free non-ST-RAM\n" ); - return; - } - - swap_list_lock(); - swap_device_lock(stram_swap_info); - /* un-reserve the freed pages */ - for( ; n_pages > 0; ++offset, --n_pages ) { - if (map[offset] != SWAP_MAP_BAD) - printk( KERN_ERR "free_stram_region: Swap page %lu was not " - "reserved\n", offset ); - map[offset] = 0; - } - - /* update swapping meta-data */ - if (offset < stram_swap_info->lowest_bit) - stram_swap_info->lowest_bit = offset; - if (offset+n_pages-1 > stram_swap_info->highest_bit) - stram_swap_info->highest_bit = offset+n_pages-1; - if (stram_swap_info->prio > swap_info[swap_list.next].prio) - swap_list.next = swap_list.head; - nr_swap_pages += n_pages; - swap_device_unlock(stram_swap_info); - swap_list_unlock(); -} - - -/* ------------------------------------------------------------------------ */ -/* Utility Functions for Swapping */ -/* ------------------------------------------------------------------------ */ - - -/* is addr in some of the allocated regions? 
*/ -static int in_some_region(void *addr) -{ - BLOCK *p; - - for( p = alloc_list; p; p = p->next ) { - if (p->start <= addr && addr < p->start + p->size) - return( 1 ); - } - return( 0 ); -} - - -static unsigned long find_free_region(unsigned long n_pages, - unsigned long *total_free, - unsigned long *region_free) -{ - unsigned short *map = stram_swap_info->swap_map; - unsigned long max = stram_swap_info->max; - unsigned long head, tail, max_start; - long nfree, max_free; - - /* first scan the swap space for a suitable place for the allocation */ - head = 1; - max_start = 0; - max_free = -1; - *total_free = 0; - - start_over: - /* increment tail until final window size reached, and count free pages */ - nfree = 0; - for( tail = head; tail-head < n_pages && tail < max; ++tail ) { - if (map[tail] == SWAP_MAP_BAD) { - head = tail+1; - goto start_over; - } - if (!map[tail]) { - ++nfree; - ++*total_free; - } - } - if (tail-head < n_pages) - goto out; - if (nfree > max_free) { - max_start = head; - max_free = nfree; - if (max_free >= n_pages) - /* don't need more free pages... :-) */ - goto out; - } - - /* now shift the window and look for the area where as much pages as - * possible are free */ - while( tail < max ) { - nfree -= (map[head++] == 0); - if (map[tail] == SWAP_MAP_BAD) { - head = tail+1; - goto start_over; - } - if (!map[tail]) { - ++nfree; - ++*total_free; - } - ++tail; - if (nfree > max_free) { - max_start = head; - max_free = nfree; - if (max_free >= n_pages) - /* don't need more free pages... :-) */ - goto out; - } - } - - out: - if (max_free < 0) { - printk( KERN_NOTICE "get_stram_region: ST-RAM too full or fragmented " - "-- can't allocate %lu pages\n", n_pages ); - return( 0 ); - } - - *region_free = max_free; - return( max_start ); -} - - -/* setup parameters from command line */ -void __init stram_swap_setup(char *str, int *ints) -{ - if (ints[0] >= 1) - max_swap_size = ((ints[1] < 0 ? 
0 : ints[1]) * 1024) & PAGE_MASK; -} - - -/* ------------------------------------------------------------------------ */ -/* ST-RAM device */ -/* ------------------------------------------------------------------------ */ - -static int refcnt; - -static void do_stram_request(request_queue_t *q) -{ - struct request *req; - - while ((req = elv_next_request(q)) != NULL) { - void *start = swap_start + (req->sector << 9); - unsigned long len = req->current_nr_sectors << 9; - if ((start + len) > swap_end) { - printk( KERN_ERR "stram: bad access beyond end of device: " - "block=%ld, count=%d\n", - req->sector, - req->current_nr_sectors ); - end_request(req, 0); - continue; - } - - if (req->cmd == READ) { - memcpy(req->buffer, start, len); -#ifdef DO_PROC - stat_swap_read += N_PAGES(len); -#endif - } - else { - memcpy(start, req->buffer, len); -#ifdef DO_PROC - stat_swap_write += N_PAGES(len); -#endif - } - end_request(req, 1); - } -} - - -static int stram_open( struct inode *inode, struct file *filp ) -{ - if (filp != MAGIC_FILE_P) { - printk( KERN_NOTICE "Only kernel can open ST-RAM device\n" ); - return( -EPERM ); - } - if (refcnt) - return( -EBUSY ); - ++refcnt; - return( 0 ); -} - -static int stram_release( struct inode *inode, struct file *filp ) -{ - if (filp != MAGIC_FILE_P) { - printk( KERN_NOTICE "Only kernel can close ST-RAM device\n" ); - return( -EPERM ); - } - if (refcnt > 0) - --refcnt; - return( 0 ); -} - - -static struct block_device_operations stram_fops = { - .open = stram_open, - .release = stram_release, -}; - -static struct gendisk *stram_disk; -static struct request_queue *stram_queue; -static DEFINE_SPINLOCK(stram_lock); - -int __init stram_device_init(void) -{ - if (!MACH_IS_ATARI) - /* no point in initializing this, I hope */ - return -ENXIO; - - if (!max_swap_size) - /* swapping not enabled */ - return -ENXIO; - stram_disk = alloc_disk(1); - if (!stram_disk) - return -ENOMEM; - - if (register_blkdev(STRAM_MAJOR, "stram")) { - 
put_disk(stram_disk); - return -ENXIO; - } - - stram_queue = blk_init_queue(do_stram_request, &stram_lock); - if (!stram_queue) { - unregister_blkdev(STRAM_MAJOR, "stram"); - put_disk(stram_disk); - return -ENOMEM; - } - - stram_disk->major = STRAM_MAJOR; - stram_disk->first_minor = STRAM_MINOR; - stram_disk->fops = &stram_fops; - stram_disk->queue = stram_queue; - sprintf(stram_disk->disk_name, "stram"); - set_capacity(stram_disk, (swap_end - swap_start)/512); - add_disk(stram_disk); - return 0; -} - - - -/* ------------------------------------------------------------------------ */ -/* Misc Utility Functions */ -/* ------------------------------------------------------------------------ */ - -/* reserve a range of pages */ -static void reserve_region(void *start, void *end) -{ - reserve_bootmem (virt_to_phys(start), end - start); -} - -#endif /* CONFIG_STRAM_SWAP */ - /* ------------------------------------------------------------------------ */ /* Region Management */ @@ -1173,50 +328,9 @@ int get_stram_list( char *buf ) { int len = 0; BLOCK *p; -#ifdef CONFIG_STRAM_SWAP - int i; - unsigned short *map = stram_swap_info->swap_map; - unsigned long max = stram_swap_info->max; - unsigned free = 0, used = 0, rsvd = 0; -#endif -#ifdef CONFIG_STRAM_SWAP - if (max_swap_size) { - for( i = 1; i < max; ++i ) { - if (!map[i]) - ++free; - else if (map[i] == SWAP_MAP_BAD) - ++rsvd; - else - ++used; - } - PRINT_PROC( - "Total ST-RAM: %8u kB\n" - "Total ST-RAM swap: %8lu kB\n" - "Free swap: %8u kB\n" - "Used swap: %8u kB\n" - "Allocated swap: %8u kB\n" - "Swap Reads: %8u\n" - "Swap Writes: %8u\n" - "Swap Forced Reads: %8u\n", - (stram_end - stram_start) >> 10, - (max-1) << (PAGE_SHIFT-10), - free << (PAGE_SHIFT-10), - used << (PAGE_SHIFT-10), - rsvd << (PAGE_SHIFT-10), - stat_swap_read, - stat_swap_write, - stat_swap_force ); - } - else { -#endif - PRINT_PROC( "ST-RAM swapping disabled\n" ); - PRINT_PROC("Total ST-RAM: %8u kB\n", + PRINT_PROC("Total ST-RAM: %8u kB\n", 
(stram_end - stram_start) >> 10); -#ifdef CONFIG_STRAM_SWAP - } -#endif - PRINT_PROC( "Allocated regions:\n" ); for( p = alloc_list; p; p = p->next ) { if (len + 50 >= PAGE_SIZE) @@ -1227,8 +341,6 @@ int get_stram_list( char *buf ) p->owner); if (p->flags & BLOCK_GFP) PRINT_PROC( "page-alloced)\n" ); - else if (p->flags & BLOCK_INSWAP) - PRINT_PROC( "in swap)\n" ); else PRINT_PROC( "??)\n" ); } From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 29 Oct 2005 18:16:12 -0700 Subject: [PATCH 38/98] [PATCH] core remove PageReserved Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it can be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work). A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and counts towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue.
Signed-off-by: Nick Piggin Refcount bug fix for filemap_xip.c Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/kernel/vdso.c | 12 ++-- arch/sparc/mm/generic.c | 3 + arch/sparc64/mm/generic.c | 3 + drivers/scsi/sg.c | 12 ++-- drivers/scsi/st.c | 10 ++- fs/direct-io.c | 4 +- include/linux/mm.h | 5 +- kernel/power/swsusp.c | 25 ++++--- mm/bootmem.c | 1 + mm/filemap_xip.c | 11 +++- mm/fremap.c | 23 ++++--- mm/madvise.c | 2 +- mm/memory.c | 133 +++++++++++++++++++++++--------------- mm/mempolicy.c | 29 +++++---- mm/mmap.c | 11 ++++ mm/mprotect.c | 8 +++ mm/msync.c | 17 +++-- mm/page_alloc.c | 14 ++-- mm/rmap.c | 14 ++-- mm/shmem.c | 4 +- mm/swap.c | 4 +- sound/core/pcm_native.c | 9 +-- 22 files changed, 219 insertions(+), 135 deletions(-) diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c index efa985f05aca..4aacf521e3e4 100644 --- a/arch/ppc64/kernel/vdso.c +++ b/arch/ppc64/kernel/vdso.c @@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma, return NOPAGE_SIGBUS; /* - * Last page is systemcfg, special handling here, no get_page() a - * this is a reserved page + * Last page is systemcfg. */ if ((vma->vm_end - address) <= PAGE_SIZE) - return virt_to_page(systemcfg); + pg = virt_to_page(systemcfg); + else + pg = virt_to_page(vbase + offset); - pg = virt_to_page(vbase + offset); get_page(pg); DBG(" ->page count: %d\n", page_count(pg)); @@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack) * gettimeofday will be totally dead. 
It's fine to use that for setting * breakpoints in the vDSO code pages though */ - vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED; vma->vm_flags |= mm->def_flags; vma->vm_page_prot = protection_map[vma->vm_flags & 0x7]; vma->vm_ops = &vdso_vmops; @@ -603,6 +603,8 @@ void __init vdso_init(void) ClearPageReserved(pg); get_page(pg); } + + get_page(virt_to_page(systemcfg)); } int in_gate_area_no_task(unsigned long addr) diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 20ccb957fb77..659c9a71f867 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c @@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; + /* See comment in mm/memory.c remap_pfn_range */ + vma->vm_flags |= VM_IO | VM_RESERVED; + prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index c954d91f01d0..afc01cec701f 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, int space = GET_IOSPACE(pfn); unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT; + /* See comment in mm/memory.c remap_pfn_range */ + vma->vm_flags |= VM_IO | VM_RESERVED; + prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 861e51375d70..2d30b46806bf 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages, int i; for (i=0; i < nr_pages; i++) { - if (dirtied && !PageReserved(sgl[i].page)) - SetPageDirty(sgl[i].page); - /* unlock_page(sgl[i].page); */ + struct page *page = sgl[i].page; + + /* XXX: just for debug. 
Remove when PageReserved is removed */ + BUG_ON(PageReserved(page)); + if (dirtied) + SetPageDirty(page); + /* unlock_page(page); */ /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(sgl[i].page); + page_cache_release(page); } return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 5eb54d8019b4..da9766283bd7 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p int i; for (i=0; i < nr_pages; i++) { - if (dirtied && !PageReserved(sgl[i].page)) - SetPageDirty(sgl[i].page); + struct page *page = sgl[i].page; + + /* XXX: just for debug. Remove when PageReserved is removed */ + BUG_ON(PageReserved(page)); + if (dirtied) + SetPageDirty(page); /* FIXME: cache flush missing for rw==READ * FIXME: call the correct reference counting function */ - page_cache_release(sgl[i].page); + page_cache_release(page); } return 0; diff --git a/fs/direct-io.c b/fs/direct-io.c index 0d06097bc995..3931e7f1e6bf 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio) up_read(&current->mm->mmap_sem); if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) { + struct page *page = ZERO_PAGE(dio->curr_user_address); /* * A memory fault, but the filesystem has some outstanding * mapped blocks.
We need to use those blocks up to avoid @@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio) */ if (dio->page_errors == 0) dio->page_errors = ret; - dio->pages[0] = ZERO_PAGE(dio->curr_user_address); + page_cache_get(page); + dio->pages[0] = page; dio->head = 0; dio->tail = 1; ret = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c64484d8ae0..da42093250c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ +#define VM_RESERVED 0x00080000 /* Pages managed in a special way */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ @@ -338,7 +338,7 @@ static inline void get_page(struct page *page) static inline void put_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 10bc5ec496d7..016504ccfccf 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone) continue; page = pfn_to_page(pfn); /* - * This condition results from rvmalloc() sans vmalloc_32() - * and architectural memory reservations. 
This should be - * corrected eventually when the cases giving rise to this - * are better understood. + * PageReserved results from rvmalloc() sans vmalloc_32() + * and architectural memory reservations. + * + * rvmalloc should not cause this, because all implementations + * appear to always be using vmalloc_32 on architectures with + * highmem. This is a good thing, because we would like to save + * rvmalloc pages. + * + * It appears to be triggered by pages which do not point to + * valid memory (see arch/i386/mm/init.c:one_highpage_init(), + * which sets PageReserved if the page does not point to valid + * RAM. + * + * XXX: must remove usage of PageReserved! */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; @@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { + if (pfn_is_nosave(pfn)) { pr_debug("[nosave pfn 0x%lx]", pfn); return 0; } diff --git a/mm/bootmem.c b/mm/bootmem.c index a58699b6579e..e8c567177dcf 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) if (j + 16 < BITS_PER_LONG) prefetchw(page + j + 16); __ClearPageReserved(page + j); + set_page_count(page + j, 0); } __free_pages(page, order); i += BITS_PER_LONG; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 8c199f537732..9354ee279b13 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping, unsigned long address; pte_t *pte; pte_t pteval; + struct page *page = ZERO_PAGE(address); spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { @@ -185,15 +186,17 @@ __xip_unmap (struct address_space * 
mapping, * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - pte = page_check_address(ZERO_PAGE(address), mm, - address); + pte = page_check_address(page, mm, address); if (!IS_ERR(pte)) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); + page_remove_rmap(page); + dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); pte_unmap(pte); spin_unlock(&mm->page_table_lock); + page_cache_release(page); } } spin_unlock(&mapping->i_mmap_lock); @@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area, page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0); if (!IS_ERR(page)) { - return page; + goto out; } if (PTR_ERR(page) != -ENODATA) return NULL; @@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area, page = ZERO_PAGE(address); } +out: + page_cache_get(page); return page; } diff --git a/mm/fremap.c b/mm/fremap.c index fd7f2a17ff3e..224cc1598b35 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, return; if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); + struct page *page; flush_cache_page(vma, addr, pfn); pte = ptep_clear_flush(vma, addr, ptep); - if (pfn_valid(pfn)) { - struct page *page = pfn_to_page(pfn); - if (!PageReserved(page)) { - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); - dec_mm_counter(mm, file_rss); - } + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return; } + page = pfn_to_page(pfn); + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); + dec_mm_counter(mm, file_rss); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); @@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd; pte_t pte_val; + BUG_ON(vma->vm_flags & VM_RESERVED); + 
pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); @@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pgd_t *pgd; pte_t pte_val; + BUG_ON(vma->vm_flags & VM_RESERVED); + pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); diff --git a/mm/madvise.c b/mm/madvise.c index 20e075d1c64c..17aaf3e16449 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index da642b5528fa..e83f9440bb66 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -342,6 +342,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) #define NO_RSS 2 /* Increment neither file_rss nor anon_rss */ +/* + * This function is called to print an error when a pte in a + * !VM_RESERVED region is found pointing to an invalid pfn (which + * is an error). + * + * The calling function must still handle the error. + */ +void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +{ + printk(KERN_ERR "Bad pte = %08llx, process = %s, " + "vm_flags = %lx, vaddr = %lx\n", + (long long)pte_val(pte), + (vma->vm_mm == current->mm ? current->comm : "???"), + vma->vm_flags, vaddr); + dump_stack(); +} + /* * copy one vm_area from one task to the other.
Assumes the page tables * already present in the new task to be cleared in the whole range @@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) static inline int copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr) { + unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; unsigned long pfn; @@ -375,19 +393,23 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the - * mapping is assumed to be good, meaningful - * and not mapped via rmap - duplicate the - * mapping as is. + /* If the region is VM_RESERVED, the mapping is not + * mapped via rmap - duplicate the pte as is. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) + if (vm_flags & VM_RESERVED) goto out_set_pte; + pfn = pte_pfn(pte); + /* If the pte points outside of valid memory but + * the region is not VM_RESERVED, we have a problem. 
+ */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + goto out_set_pte; /* try to do something sane */ + } + + page = pfn_to_page(pfn); + /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; - unsigned long vm_flags = vma->vm_flags; int progress = 0; int rss[NO_RSS+1], anon; @@ -446,8 +467,7 @@ again: progress++; continue; } - anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, - vm_flags, addr); + anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr); rss[anon]++; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, +static void zap_pte_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { + struct mm_struct *mm = tlb->mm; pte_t *pte; int file_rss = 0; int anon_rss = 0; @@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if (pte_present(ptent)) { struct page *page = NULL; - unsigned long pfn = pte_pfn(ptent); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; + if (!(vma->vm_flags & VM_RESERVED)) { + unsigned long pfn = pte_pfn(ptent); + if (unlikely(!pfn_valid(pfn))) + print_bad_pte(vma, ptent, addr); + else + page = pfn_to_page(pfn); } if (unlikely(details) && page) { /* @@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, page->index > details->last_index)) continue; } - ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, + ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) @@ -588,7 
+611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, addr) != page->index) - set_pte_at(tlb->mm, addr, pte, + set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) anon_rss++; @@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, continue; if (!pte_file(ptent)) free_swap_and_cache(pte_to_swp_entry(ptent)); - pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); + pte_clear_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); - add_mm_rss(tlb->mm, -file_rss, -anon_rss); + add_mm_rss(mm, -file_rss, -anon_rss); pte_unmap(pte - 1); } -static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, +static inline void zap_pmd_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - zap_pte_range(tlb, pmd, addr, next, details); + zap_pte_range(tlb, vma, pmd, addr, next, details); } while (pmd++, addr = next, addr != end); } -static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, +static inline void zap_pud_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { @@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - zap_pmd_range(tlb, pud, addr, next, details); + zap_pmd_range(tlb, vma, pud, addr, next, details); } while (pud++, addr = next, addr != end); } @@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - 
zap_pud_range(tlb, pgd, addr, next, details); + zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } @@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & VM_IO) + if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) || !(flags & vma->vm_flags)) return i ? : -EFAULT; @@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (pages) { pages[i] = page; flush_dcache_page(page); - if (!PageReserved(page)) - page_cache_get(page); + page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pte) return -ENOMEM; do { - pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); + struct page *page = ZERO_PAGE(addr); + pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); + page_cache_get(page); + page_add_file_rmap(page); + inc_mm_counter(mm, file_rss); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); @@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, return -ENOMEM; do { BUG_ON(!pte_none(*pte)); - if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); + set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); @@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells swapout not to try to touch - * this region. + * VM_RESERVED tells the core MM not to "manage" these pages + * (e.g. refcount, mapcount, try to swap them out). 
*/ vma->vm_flags |= VM_IO | VM_RESERVED; @@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t entry; int ret = VM_FAULT_MINOR; + BUG_ON(vma->vm_flags & VM_RESERVED); + if (unlikely(!pfn_valid(pfn))) { /* * Page table corrupted: show pte and kill process. */ - pte_ERROR(orig_pte); + print_bad_pte(vma, orig_pte, address); ret = VM_FAULT_OOM; goto unlock; } @@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Ok, we need to copy. Oh, well.. */ - if (!PageReserved(old_page)) - page_cache_get(old_page); + page_cache_get(old_page); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, orig_pte))) { - if (PageReserved(old_page)) + page_remove_rmap(old_page); + if (!PageAnon(old_page)) { inc_mm_counter(mm, anon_rss); - else { - page_remove_rmap(old_page); - if (!PageAnon(old_page)) { - inc_mm_counter(mm, anon_rss); - dec_mm_counter(mm, file_rss); - } + dec_mm_counter(mm, file_rss); } flush_cache_page(vma, address, pfn); entry = mk_pte(new_page, vma->vm_page_prot); @@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access) { + struct page *page = ZERO_PAGE(addr); pte_t entry; /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ - entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot); + entry = mk_pte(page, vma->vm_page_prot); if (write_access) { - struct page *page; - /* Allocate our own private page. 
*/ pte_unmap(page_table); spin_unlock(&mm->page_table_lock); @@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, lru_cache_add_active(page); SetPageReferenced(page); page_add_anon_rmap(page, vma, address); + } else { + inc_mm_counter(mm, file_rss); + page_add_file_rmap(page); + page_cache_get(page); } set_pte_at(mm, address, page_table, entry); @@ -1916,7 +1943,7 @@ retry: inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!PageReserved(new_page)) { + } else if (!(vma->vm_flags & VM_RESERVED)) { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } @@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Page table corrupted: show pte and kill process. */ - pte_ERROR(orig_pte); + print_bad_pte(vma, orig_pte, address); return VM_FAULT_OOM; } /* We can then assume vm->vm_ops && vma->vm_ops->populate */ @@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = 0; + gate_vma.vm_flags = VM_RESERVED; return 0; } __initcall(gate_vma_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 43b1199af591..11d824f282f1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) } /* Ensure all existing pages follow the policy. 
*/ -static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, +static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pte_t *orig_pte; pte_t *pte; - spin_lock(&mm->page_table_lock); + spin_lock(&vma->vm_mm->page_table_lock); orig_pte = pte = pte_offset_map(pmd, addr); do { unsigned long pfn; @@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, if (!pte_present(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + print_bad_pte(vma, *pte, addr); continue; + } nid = pfn_to_nid(pfn); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(orig_pte); - spin_unlock(&mm->page_table_lock); + spin_unlock(&vma->vm_mm->page_table_lock); return addr != end; } -static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, nodemask_t *nodes) { pmd_t *pmd; @@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - if (check_pte_range(mm, pmd, addr, next, nodes)) + if (check_pte_range(vma, pmd, addr, next, nodes)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, nodemask_t *nodes) { pud_t *pud; @@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(mm, pud, addr, next, nodes)) + if (check_pmd_range(vma, pud, addr, next, nodes)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct mm_struct *mm, 
+static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, nodemask_t *nodes) { pgd_t *pgd; unsigned long next; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(mm, pgd, addr, next, nodes)) + if (check_pud_range(vma, pgd, addr, next, nodes)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); + if (first->vm_flags & VM_RESERVED) + return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) @@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, endvma = end; if (vma->vm_start > start) start = vma->vm_start; - err = check_pgd_range(vma->vm_mm, - start, endvma, nodes); + err = check_pgd_range(vma, start, endvma, nodes); if (err) { first = ERR_PTR(err); break; diff --git a/mm/mmap.c b/mm/mmap.c index 459b9f068ad7..8a111792b8db 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1088,6 +1088,17 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; + if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) + == (VM_WRITE | VM_RESERVED)) { + printk(KERN_WARNING "program %s is using MAP_PRIVATE, " + "PROT_WRITE mmap of VM_RESERVED memory, which " + "is deprecated. 
Please report this to " + "linux-kernel@vger.kernel.org\n",current->comm); + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); + error = -EACCES; + goto unmap_and_free_vma; + } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) diff --git a/mm/mprotect.c b/mm/mprotect.c index b426f01c5e9c..672a76fddd5e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { + if (oldflags & VM_RESERVED) { + BUG_ON(oldflags & VM_WRITE); + printk(KERN_WARNING "program %s is using MAP_PRIVATE, " + "PROT_WRITE mprotect of VM_RESERVED memory, " + "which is deprecated. Please report this to " + "linux-kernel@vger.kernel.org\n",current->comm); + return -EACCES; + } if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { charged = nrpages; if (security_vm_enough_memory(charged)) diff --git a/mm/msync.c b/mm/msync.c index 3b5f1c521d4b..860395486060 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -25,6 +25,7 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { + struct mm_struct *mm = vma->vm_mm; pte_t *pte; int progress = 0; @@ -37,7 +38,7 @@ again: if (progress >= 64) { progress = 0; if (need_resched() || - need_lockbreak(&vma->vm_mm->page_table_lock)) + need_lockbreak(&mm->page_table_lock)) break; } progress++; @@ -46,11 +47,11 @@ again: if (!pte_maybe_dirty(*pte)) continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, *pte, addr); continue; + } page = pfn_to_page(pfn); - if (PageReserved(page)) - continue; if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) @@ -58,7 +59,7 @@ again: progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap(pte - 1); - cond_resched_lock(&vma->vm_mm->page_table_lock); + 
cond_resched_lock(&mm->page_table_lock); if (addr != end) goto again; } @@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need - * to do anything more on an msync() */ - if (is_vm_hugetlb_page(vma)) + * to do anything more on an msync(). + * Can't do anything with VM_RESERVED regions either. + */ + if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) return; BUG_ON(addr >= end); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 60663232fbb2..0541288ebf4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback); + 1 << PG_writeback | + 1 << PG_reserved ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order) { if (PagePrivate(page) && (page_order(page) == order) && - !PageReserved(page) && page_count(page) == 0) return 1; return 0; @@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); @@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ))) + 1 << PG_writeback | + 1 << PG_reserved ))) bad_page(__FUNCTION__, page); page->flags &= ~(1 << PG_uptodate | 1 << PG_error | @@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec) fastcall void __free_pages(struct page *page, unsigned int order) { - if (!PageReserved(page) && put_page_testzero(page)) { + if (put_page_testzero(page)) { if (order == 0) free_hot_page(page); else @@ -1674,7 +1676,7 @@ void 
__init memmap_init_zone(unsigned long size, int nid, unsigned long zone, continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 0); + set_page_count(page, 1); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/mm/rmap.c b/mm/rmap.c index 504757624cce..f69d5342ce7f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - BUG_ON(PageReserved(page)); - if (atomic_inc_and_test(&page->_mapcount)) { struct anon_vma *anon_vma = vma->anon_vma; @@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page) { BUG_ON(PageAnon(page)); - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + BUG_ON(!pfn_valid(page_to_pfn(page))); if (atomic_inc_and_test(&page->_mapcount)) inc_page_state(nr_mapped); @@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { - BUG_ON(PageReserved(page)); - if (atomic_add_negative(-1, &page->_mapcount)) { BUG_ON(page_mapcount(page) < 0); /* @@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor, continue; pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, *pte, address); continue; + } page = pfn_to_page(pfn); BUG_ON(PageAnon(page)); - if (PageReserved(page)) - continue; if (ptep_clear_flush_young(vma, address, pte)) continue; @@ -808,7 +803,6 @@ int try_to_unmap(struct page *page) { int ret; - BUG_ON(PageReserved(page)); BUG_ON(!PageLocked(page)); if (PageAnon(page)) diff --git a/mm/shmem.c b/mm/shmem.c index 6796311a23ef..37777f4c11f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ */ if (!offset) mark_page_accessed(page); - } 
else + } else { page = ZERO_PAGE(0); + page_cache_get(page); + } /* * Ok, we have the page, and it's up-to-date, so diff --git a/mm/swap.c b/mm/swap.c index 7771d2803f62..21d15f99805c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -48,7 +48,7 @@ void put_page(struct page *page) } return; } - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } EXPORT_SYMBOL(put_page); @@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold) struct page *page = pages[i]; struct zone *pagezone; - if (PageReserved(page) || !put_page_testzero(page)) + if (!put_page_testzero(page)) continue; pagezone = page_zone(page); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 67abebabf83e..e97b2d162cc7 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->status); - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un return NOPAGE_OOM; runtime = substream->runtime; page = virt_to_page(runtime->control); - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; @@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign vaddr = runtime->dma_area + offset; page = virt_to_page(vaddr); } - if (!PageReserved(page)) - get_page(page); + get_page(page); if (type) *type = VM_FAULT_MINOR; return page; From 8c10376271e097fa13cda956e1b2f3cb7e4d4dd9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:13 -0700 Subject: [PATCH 39/98] [PATCH] mm: copy_one_pte inc rss Small adjustment, following Nick's suggestion: it's more straightforward for copy_pte_range 
to let copy_one_pte do the rss incrementation, than use an index it passed back. Saves a #define, and 16 bytes of .text. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e83f9440bb66..7893eb4bb8c0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -340,8 +340,6 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) add_mm_counter(mm, anon_rss, anon_rss); } -#define NO_RSS 2 /* Increment neither file_rss nor anon_rss */ - /* * This function is called to print an error when a pte in a * !VM_RESERVED region is found pointing to an invalid pfn (which @@ -368,16 +366,15 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ -static inline int +static inline void copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int *rss) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; unsigned long pfn; - int anon = NO_RSS; /* pte contains position in swap or file, so copy. 
*/ if (unlikely(!pte_present(pte))) { @@ -428,11 +425,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkold(pte); get_page(page); page_dup_rmap(page); - anon = !!PageAnon(page); + rss[!!PageAnon(page)]++; out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); - return anon; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -441,7 +437,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, { pte_t *src_pte, *dst_pte; int progress = 0; - int rss[NO_RSS+1], anon; + int rss[2]; again: rss[1] = rss[0] = 0; @@ -467,8 +463,7 @@ again: progress++; continue; } - anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr); - rss[anon]++; + copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); spin_unlock(&src_mm->page_table_lock); From 86d912f41dca32eca8827f2f878139735e69dc28 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:14 -0700 Subject: [PATCH 40/98] [PATCH] mm: zap_pte_range dec rss Small adjustment: zap_pte_range decrement its rss counts from 0 then finally add, avoiding negations - we don't have or need a sub_mm_rss. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7893eb4bb8c0..bc6296398f8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -609,13 +609,13 @@ static void zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss++; + anon_rss--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent)) mark_page_accessed(page); - file_rss++; + file_rss--; } page_remove_rmap(page); tlb_remove_page(tlb, page); @@ -632,7 +632,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pte_clear_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); - add_mm_rss(mm, -file_rss, -anon_rss); + add_mm_rss(mm, file_rss, anon_rss); pte_unmap(pte - 1); } From 9e9bef07ce5a342aa6246ebc5c20829d0d5d63d0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:15 -0700 Subject: [PATCH 41/98] [PATCH] mm: do_swap_page race major Small adjustment: do_swap_page should report its !pte_same race as a major fault if it had to read into swap cache, because whatever raced with it will have found page already in cache and reported minor fault. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bc6296398f8b..a25ee1d3e20a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1728,10 +1728,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); - if (unlikely(!pte_same(*page_table, orig_pte))) { - ret = VM_FAULT_MINOR; + if (unlikely(!pte_same(*page_table, orig_pte))) goto out_nomap; - } if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; From d0de32d9b71e11cc51618c2045086e9694093d01 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:16 -0700 Subject: [PATCH 42/98] [PATCH] mm: do_mremap current mm Cleanup: relieve do_mremap from its surfeit of current->mms. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index f4e562098500..318eea5467a0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -245,6 +245,7 @@ unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret = -EINVAL; unsigned long charged = 0; @@ -285,7 +286,7 @@ unsigned long do_mremap(unsigned long addr, if ((addr <= new_addr) && (addr+old_len) > new_addr) goto out; - ret = do_munmap(current->mm, new_addr, new_len); + ret = do_munmap(mm, new_addr, new_len); if (ret) goto out; } @@ -296,7 +297,7 @@ unsigned long do_mremap(unsigned long addr, * do_munmap does all the needed commit accounting */ if (old_len >= new_len) { - ret = do_munmap(current->mm, addr+new_len, old_len - new_len); + ret = do_munmap(mm, addr+new_len, old_len - new_len); if (ret && old_len != 
new_len) goto out; ret = addr; @@ -309,7 +310,7 @@ unsigned long do_mremap(unsigned long addr, * Ok, we need to grow.. or relocate. */ ret = -EFAULT; - vma = find_vma(current->mm, addr); + vma = find_vma(mm, addr); if (!vma || vma->vm_start > addr) goto out; if (is_vm_hugetlb_page(vma)) { @@ -325,14 +326,14 @@ unsigned long do_mremap(unsigned long addr, } if (vma->vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; - locked = current->mm->locked_vm << PAGE_SHIFT; + locked = mm->locked_vm << PAGE_SHIFT; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; locked += new_len - old_len; ret = -EAGAIN; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) goto out; } - if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { ret = -ENOMEM; goto out; } @@ -359,11 +360,10 @@ unsigned long do_mremap(unsigned long addr, vma_adjust(vma, vma->vm_start, addr + new_len, vma->vm_pgoff, NULL); - current->mm->total_vm += pages; - vm_stat_account(vma->vm_mm, vma->vm_flags, - vma->vm_file, pages); + mm->total_vm += pages; + vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { - current->mm->locked_vm += pages; + mm->locked_vm += pages; make_pages_present(addr + old_len, addr + new_len); } From 861f2fb8e796022b4928cab9c74fca6681a1c557 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:17 -0700 Subject: [PATCH 43/98] [PATCH] mm: zap_pte out of line There used to be just one call to zap_pte, but it shouldn't be inline now there are two. Check for the common case pte_none before calling, and move its rss accounting up into install_page or install_file_pte - which helps the next patch. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/fremap.c b/mm/fremap.c index 224cc1598b35..7f08d10ceaff 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -20,34 +20,32 @@ #include #include -static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, +static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page = NULL; - if (pte_none(pte)) - return; if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - struct page *page; - flush_cache_page(vma, addr, pfn); pte = ptep_clear_flush(vma, addr, ptep); if (unlikely(!pfn_valid(pfn))) { print_bad_pte(vma, pte, addr); - return; + goto out; } page = pfn_to_page(pfn); if (pte_dirty(pte)) set_page_dirty(page); page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } +out: + return !!page; } /* @@ -96,9 +94,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, if (page_mapcount(page) > INT_MAX/2) goto err_unlock; - zap_pte(mm, vma, addr, pte); + if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) + inc_mm_counter(mm, file_rss); - inc_mm_counter(mm, file_rss); flush_icache_page(vma, page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); @@ -145,7 +143,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto err_unlock; - zap_pte(mm, vma, addr, pte); + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) + dec_mm_counter(mm, file_rss); set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:18 -0700 Subject: [PATCH 44/98] [PATCH] mm: update_hiwaters just in time 
update_mem_hiwater has attracted various criticisms, in particular from those concerned with mm scalability. Originally it was called whenever rss or total_vm got raised. Then many of those callsites were replaced by a timer tick call from account_system_time. Now Frank van Maarseveen reports that to be found inadequate. How about this? Works for Frank. Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros update_hiwater_rss and update_hiwater_vm. Don't attempt to keep mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually by 1): those are hot paths. Do the opposite, update only when about to lower rss (usually by many), or just before final accounting in do_exit. Handle mm->hiwater_vm in the same way, though it's much less of an issue. Demand that whoever collects these hiwater statistics do the work of taking the maximum with rss or total_vm. And there has been no collector of these hiwater statistics in the tree. The new convention needs an example, so match Frank's usage by adding a VmPeak line above VmSize to /proc/PID/status, and also a VmHWM line above VmRSS (High-Water-Mark or High-Water-Memory). There was a particular anomaly during mremap move, that hiwater_vm might be captured too high. A fleeting such anomaly remains, but it's quickly corrected now, whereas before it would stick. What locking? None: if the app is racy then these statistics will be racy, it's not worth any overhead to make them exact. But whenever it suits, hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under page_table_lock (for now) or with preemption disabled (later on): without going to any trouble, minimize the time between reading current values and updating, to minimize those occasions when a racing thread bumps a count up and back down in between.
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 1 - fs/exec.c | 1 - fs/proc/task_mmu.c | 23 +++++++++++++++++++++-- include/linux/mm.h | 3 --- include/linux/sched.h | 10 ++++++++++ kernel/exit.c | 5 ++++- kernel/sched.c | 2 -- mm/fremap.c | 4 +++- mm/hugetlb.c | 3 +++ mm/memory.c | 17 +---------------- mm/mmap.c | 4 ++++ mm/mremap.c | 12 ++++++++++-- mm/nommu.c | 15 ++------------- mm/rmap.c | 6 ++++++ 14 files changed, 64 insertions(+), 42 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index a719e158e002..8e71cdbecc7c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/exec.c b/fs/exec.c index cefadf5ab83b..9bb55c8cf224 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1207,7 +1207,6 @@ int do_execve(char * filename, /* execve success */ security_bprm_free(bprm); acct_update_integrals(current); - update_mem_hiwater(current); kfree(bprm); return retval; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bccee7cf9ccd..7c89b4549049 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,22 +14,41 @@ char *task_mem(struct mm_struct *mm, char *buffer) { unsigned long data, text, lib; + unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + + /* + * Note: to minimize their overhead, mm maintains hiwater_vm and + * hiwater_rss only when about to *lower* total_vm or rss. Any + * collector of these hiwater stats must therefore get total_vm + * and rss too, which will usually be the higher. Barriers? not + * worth the effort, such snapshots can always be inconsistent. 
+ */ + hiwater_vm = total_vm = mm->total_vm; + if (hiwater_vm < mm->hiwater_vm) + hiwater_vm = mm->hiwater_vm; + hiwater_rss = total_rss = get_mm_rss(mm); + if (hiwater_rss < mm->hiwater_rss) + hiwater_rss = mm->hiwater_rss; data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; buffer += sprintf(buffer, + "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" + "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n", - (mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), + hiwater_vm << (PAGE_SHIFT-10), + (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), - get_mm_rss(mm) << (PAGE_SHIFT-10), + hiwater_rss << (PAGE_SHIFT-10), + total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); diff --git a/include/linux/mm.h b/include/linux/mm.h index da42093250c3..7d4552fe0864 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm, } #endif /* CONFIG_PROC_FS */ -/* update per process rss and vm hiwater data */ -extern void update_mem_hiwater(struct task_struct *tsk); - #ifndef CONFIG_DEBUG_PAGEALLOC static inline void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/include/linux/sched.h b/include/linux/sched.h index afcaac66cbd5..a9c0b7d26303 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #define dec_mm_counter(mm, member) (mm)->_##member-- #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +#define update_hiwater_rss(mm) do { \ + unsigned long _rss = get_mm_rss(mm); \ + if ((mm)->hiwater_rss < _rss) \ + (mm)->hiwater_rss = _rss; \ 
+} while (0) +#define update_hiwater_vm(mm) do { \ + if ((mm)->hiwater_vm < (mm)->total_vm) \ + (mm)->hiwater_vm = (mm)->total_vm; \ +} while (0) + typedef unsigned long mm_counter_t; struct mm_struct { diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..79f52b85d6ed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - update_mem_hiwater(tsk); + if (tsk->mm) { + update_hiwater_rss(tsk->mm); + update_hiwater_vm(tsk->mm); + } group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { del_timer_sync(&tsk->signal->real_timer); diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e27..4f26c544d02c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->idle = cputime64_add(cpustat->idle, tmp); /* Account for system time used */ acct_update_integrals(p); - /* Update rss highwater mark */ - update_mem_hiwater(p); } /* diff --git a/mm/fremap.c b/mm/fremap.c index 7f08d10ceaff..49719a35769a 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte) goto err_unlock; - if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) + if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { + update_hiwater_rss(mm); dec_mm_counter(mm, file_rss); + } set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 094455bcbbf7..ac5f044bf514 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); if (! 
ptep) diff --git a/mm/memory.c b/mm/memory.c index a25ee1d3e20a..692ad810263d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); @@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr) EXPORT_SYMBOL(vmalloc_to_pfn); -/* - * update_mem_hiwater - * - update per process rss and vm high water data - */ -void update_mem_hiwater(struct task_struct *tsk) -{ - if (tsk->mm) { - unsigned long rss = get_mm_rss(tsk->mm); - - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - #if !defined(__HAVE_ARCH_GATE_AREA) #if defined(AT_SYSINFO_EHDR) diff --git a/mm/mmap.c b/mm/mmap.c index 8a111792b8db..c43b28457007 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { + /* Update high watermark before we lower total_vm */ + update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); @@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm, lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); + update_hiwater_rss(mm); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? 
prev->vm_end: FIRST_USER_ADDRESS, @@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); + /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); diff --git a/mm/mremap.c b/mm/mremap.c index 318eea5467a0..ccf456477020 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long new_pgoff; unsigned long moved_len; unsigned long excess = 0; + unsigned long hiwater_vm; int split = 0; /* @@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* - * if we failed to move page tables we still do total_vm increment - * since do_munmap() will decrement it by old_len == new_len + * If we failed to move page tables we still do total_vm increment + * since do_munmap() will decrement it by old_len == new_len. + * + * Since total_vm is about to be raised artificially high for a + * moment, we need to restore high watermark afterwards: if stats + * are taken meanwhile, total_vm and hiwater_vm appear too high. + * If this were a serious issue, we'd add a flag to do_munmap(). 
*/ + hiwater_vm = mm->hiwater_vm; mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); @@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, vm_unacct_memory(excess >> PAGE_SHIFT); excess = 0; } + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ if (excess) { diff --git a/mm/nommu.c b/mm/nommu.c index 599924886eb5..dfb124ffb9be 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) realalloc -= kobjsize(vml); askedalloc -= sizeof(*vml); kfree(vml); + + update_hiwater_vm(mm); mm->total_vm -= len >> PAGE_SHIFT; #ifdef DEBUG @@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) { } -void update_mem_hiwater(struct task_struct *tsk) -{ - unsigned long rss; - - if (likely(tsk->mm)) { - rss = get_mm_rss(tsk->mm); - if (tsk->mm->hiwater_rss < rss) - tsk->mm->hiwater_rss = rss; - if (tsk->mm->hiwater_vm < tsk->mm->total_vm) - tsk->mm->hiwater_vm = tsk->mm->total_vm; - } -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/rmap.c b/mm/rmap.c index f69d5342ce7f..4c52c56c9905 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) if (pte_dirty(pteval)) set_page_dirty(page); + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + if (PageAnon(page)) { swp_entry_t entry = { .val = page->private }; /* @@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor, if (!pmd_present(*pmd)) goto out_unlock; + /* Update high watermark before we lower rss */ + update_hiwater_rss(mm); + for (original_pte = pte = pte_offset_map(pmd, address); address < end; pte++, address += PAGE_SIZE) { From f449952bc8bde7fbc73c6d20dff92b627a21f8b9 Mon Sep 17 00:00:00 2001 From: Hugh 
Dickins Date: Sat, 29 Oct 2005 18:16:19 -0700 Subject: [PATCH 45/98] [PATCH] mm: mm_struct hiwaters moved Slight and timid rearrangement of mm_struct: hiwater_rss and hiwater_vm were tacked on the end, but it seems better to keep them near _file_rss, _anon_rss and total_vm, in the same cacheline on those arches verified. There are likely to be more profitable rearrangements, but less obvious (is it good or bad that saved_auxv[AT_VECTOR_SIZE] isolates cpu_vm_mask and context from many others?), needing serious instrumentation. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index a9c0b7d26303..292cb57ce38f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,16 +291,19 @@ struct mm_struct { * by mmlist_lock */ - unsigned long start_code, end_code, start_data, end_data; - unsigned long start_brk, brk, start_stack; - unsigned long arg_start, arg_end, env_start, env_end; - unsigned long total_vm, locked_vm, shared_vm; - unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes; - /* Special counters protected by the page_table_lock */ mm_counter_t _file_rss; mm_counter_t _anon_rss; + unsigned long hiwater_rss; /* High-watermark of RSS usage */ + unsigned long hiwater_vm; /* High-water virtual memory usage */ + + unsigned long total_vm, locked_vm, shared_vm, exec_vm; + unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ unsigned dumpable:2; @@ -320,11 +323,7 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; - struct kioctx default_kioctx; - - unsigned long hiwater_rss; /* 
High-water RSS usage */ - unsigned long hiwater_vm; /* High-water virtual memory usage */ }; struct sighand_struct { From 46dea3d092d23a58b42499cc8a21de0fad079f4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:20 -0700 Subject: [PATCH 46/98] [PATCH] mm: ia64 use expand_upwards ia64 has expand_backing_store function for growing its Register Backing Store vma upwards. But more complete code for this purpose is found in the CONFIG_STACK_GROWSUP part of mm/mmap.c. Uglify its #ifdefs further to provide expand_upwards for ia64 as well as expand_stack for parisc. The Register Backing Store vma should be marked VM_ACCOUNT. Implement the intention of growing it only a page at a time, instead of passing an address outside of the vma to handle_mm_fault, with unknown consequences. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/fault.c | 34 +++++++--------------------------- arch/ia64/mm/init.c | 2 +- include/linux/mm.h | 3 ++- mm/mmap.c | 17 ++++++++++++++--- 4 files changed, 24 insertions(+), 32 deletions(-) diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index f21b55549787..af7eb087dca7 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c @@ -19,32 +19,6 @@ extern void die (char *, struct pt_regs *, long); -/* - * This routine is analogous to expand_stack() but instead grows the - * register backing store (which grows towards higher addresses). - * Since the register backing store is access sequentially, we - * disallow growing the RBS by more than a page at a time. Note that - * the VM_GROWSUP flag can be set on any VM area but that's fine - * because the total process size is still limited by RLIMIT_STACK and - * RLIMIT_AS. 
- */ -static inline long -expand_backing_store (struct vm_area_struct *vma, unsigned long address) -{ - unsigned long grow; - - grow = PAGE_SIZE >> PAGE_SHIFT; - if (address - vma->vm_start > current->signal->rlim[RLIMIT_STACK].rlim_cur - || (((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->signal->rlim[RLIMIT_AS].rlim_cur)) - return -ENOMEM; - vma->vm_end += PAGE_SIZE; - vma->vm_mm->total_vm += grow; - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm += grow; - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow); - return 0; -} - /* * Return TRUE if ADDRESS points at a page in the kernel's mapped segment * (inside region 5, on ia64) and that page is present. @@ -185,7 +159,13 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re if (REGION_NUMBER(address) != REGION_NUMBER(vma->vm_start) || REGION_OFFSET(address) >= RGN_MAP_LIMIT) goto bad_area; - if (expand_backing_store(vma, address)) + /* + * Since the register backing store is accessed sequentially, + * we disallow growing it by more than a page at a time. 
+ */ + if (address > vma->vm_end + PAGE_SIZE - sizeof(long)) + goto bad_area; + if (expand_upwards(vma, address)) goto bad_area; } goto good_area; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 98246acd4991..0063b2c50908 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -158,7 +158,7 @@ ia64_init_addr_space (void) vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; vma->vm_page_prot = protection_map[VM_DATA_DEFAULT_FLAGS & 0x7]; - vma->vm_flags = VM_DATA_DEFAULT_FLAGS | VM_GROWSUP; + vma->vm_flags = VM_DATA_DEFAULT_FLAGS|VM_GROWSUP|VM_ACCOUNT; down_write(&current->mm->mmap_sem); if (insert_vm_struct(current->mm, vma)) { up_write(&current->mm->mmap_sem); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7d4552fe0864..89398032bc4b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -896,7 +896,8 @@ void handle_ra_miss(struct address_space *mapping, unsigned long max_sane_readahead(unsigned long nr); /* Do stack extension */ -extern int expand_stack(struct vm_area_struct * vma, unsigned long address); +extern int expand_stack(struct vm_area_struct *vma, unsigned long address); +extern int expand_upwards(struct vm_area_struct *vma, unsigned long address); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); diff --git a/mm/mmap.c b/mm/mmap.c index c43b28457007..d931d7e49ac9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1508,11 +1508,15 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un return 0; } -#ifdef CONFIG_STACK_GROWSUP +#if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64) /* - * vma is the first one with address > vma->vm_end. Have to extend vma. + * PA-RISC uses this for its stack; IA64 for its Register Backing Store. + * vma is the last one with address > vma->vm_end. Have to extend vma.
*/ -int expand_stack(struct vm_area_struct * vma, unsigned long address) +#ifdef CONFIG_STACK_GROWSUP +static inline +#endif +int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1550,6 +1554,13 @@ int expand_stack(struct vm_area_struct * vma, unsigned long address) anon_vma_unlock(vma); return error; } +#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ + +#ifdef CONFIG_STACK_GROWSUP +int expand_stack(struct vm_area_struct *vma, unsigned long address) +{ + return expand_upwards(vma, address); +} struct vm_area_struct * find_extend_vma(struct mm_struct *mm, unsigned long addr) From 872fec16d9a0ed3b75b8893aa217e49cca575ee5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:21 -0700 Subject: [PATCH 47/98] [PATCH] mm: init_mm without ptlock First step in pushing down the page_table_lock. init_mm.page_table_lock has been used throughout the architectures (usually for ioremap): not to serialize kernel address space allocation (that's usually vmlist_lock), but because pud_alloc,pmd_alloc,pte_alloc_kernel expect caller holds it. Reverse that: don't lock or unlock init_mm.page_table_lock in any of the architectures; instead rely on pud_alloc,pmd_alloc,pte_alloc_kernel to take and drop it when allocating a new one, to check lest a racing task already did. Similarly no page_table_lock in vmalloc's map_vm_area. Some temporary ugliness in __pud_alloc and __pmd_alloc: since they also handle user mms, which are converted only by a later patch, for now they have to lock differently according to whether or not it's init_mm. If sources get muddled, there's a danger that an arch source taking init_mm.page_table_lock will be mixed with common source also taking it (or neither take it). So break the rules and make another change, which should break the build for such a mismatch: remove the redundant mm arg from pte_alloc_kernel (ppc64 scrapped its distinct ioremap_mm in 2.6.13). 
Exceptions: arm26 used pte_alloc_kernel on user mm, now pte_alloc_map; ia64 used pte_alloc_map on init_mm, now pte_alloc_kernel; parisc had bad args to pmd_alloc and pte_alloc_kernel in unused USE_HPPA_IOREMAP code; ppc64 map_io_page forgot to unlock on failure; ppc mmu_mapin_ram and ppc64 im_free took page_table_lock for no good reason. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/remap.c | 6 +--- arch/arm/mm/consistent.c | 6 +--- arch/arm/mm/ioremap.c | 4 +-- arch/arm26/mm/memc.c | 3 +- arch/cris/mm/ioremap.c | 4 +-- arch/frv/mm/dma-alloc.c | 5 +-- arch/i386/mm/ioremap.c | 4 +-- arch/ia64/mm/init.c | 11 ++----- arch/m32r/mm/ioremap.c | 4 +-- arch/m68k/mm/kmap.c | 2 +- arch/m68k/sun3x/dvma.c | 2 +- arch/mips/mm/ioremap.c | 4 +-- arch/parisc/kernel/pci-dma.c | 2 +- arch/parisc/mm/ioremap.c | 6 ++-- arch/ppc/kernel/dma-mapping.c | 6 +--- arch/ppc/mm/4xx_mmu.c | 4 --- arch/ppc/mm/pgtable.c | 4 +-- arch/ppc64/mm/imalloc.c | 5 --- arch/ppc64/mm/init.c | 4 +-- arch/s390/mm/ioremap.c | 4 +-- arch/sh/mm/ioremap.c | 4 +-- arch/sh64/mm/ioremap.c | 4 +-- arch/x86_64/mm/ioremap.c | 4 +-- include/linux/mm.h | 2 +- mm/memory.c | 60 ++++++++++++++++------------------- mm/vmalloc.c | 4 +-- 26 files changed, 54 insertions(+), 114 deletions(-) diff --git a/arch/alpha/mm/remap.c b/arch/alpha/mm/remap.c index 19817ad3d89b..a78356c3ead5 100644 --- a/arch/alpha/mm/remap.c +++ b/arch/alpha/mm/remap.c @@ -2,7 +2,6 @@ #include #include -/* called with the page_table_lock held */ static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) @@ -31,7 +30,6 @@ remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, } while (address && (address < end)); } -/* called with the page_table_lock held */ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) @@ -46,7 +44,7 
@@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, @@ -70,7 +68,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -84,7 +81,6 @@ __alpha_remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); return error; } diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index 82f4d5e27c54..47b0b767f080 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -397,8 +397,6 @@ static int __init consistent_init(void) pte_t *pte; int ret = 0; - spin_lock(&init_mm.page_table_lock); - do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); @@ -409,7 +407,7 @@ static int __init consistent_init(void) } WARN_ON(!pmd_none(*pmd)); - pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); + pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); if (!pte) { printk(KERN_ERR "%s: no pte tables\n", __func__); ret = -ENOMEM; @@ -419,8 +417,6 @@ static int __init consistent_init(void) consistent_pte = pte; } while (0); - spin_unlock(&init_mm.page_table_lock); - return ret; } diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 6fb1258df1b5..0f128c28fee4 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -75,7 +75,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, pgprot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | L_PTE_WRITE | flags); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, 
address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, pgprot); @@ -97,7 +97,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, phys_addr -= address; dir = pgd_offset(&init_mm, address); BUG_ON(address >= end); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd = pmd_alloc(&init_mm, dir, address); if (!pmd) { @@ -114,7 +113,6 @@ remap_area_pages(unsigned long start, unsigned long phys_addr, dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_cache_vmap(start, end); return err; } diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c index 8e8a2bb2487d..d6b008b8db76 100644 --- a/arch/arm26/mm/memc.c +++ b/arch/arm26/mm/memc.c @@ -92,7 +92,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_kernel(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, new_pmd, 0); if (!new_pte) goto no_pte; @@ -101,6 +101,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) init_pte = pte_offset(init_pmd, 0); set_pte(new_pte, *init_pte); + pte_unmap(new_pte); /* * the page table entries are zeroed diff --git a/arch/cris/mm/ioremap.c b/arch/cris/mm/ioremap.c index ebba11e270fa..a92ac9877582 100644 --- a/arch/cris/mm/ioremap.c +++ b/arch/cris/mm/ioremap.c @@ -52,7 +52,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, prot); @@ -74,7 +74,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pmd_t *pmd; @@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address 
&& (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index cfc4f97490c6..342823aad758 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -55,21 +55,18 @@ static int map_page(unsigned long va, unsigned long pa, pgprot_t prot) pte_t *pte; int err = -ENOMEM; - spin_lock(&init_mm.page_table_lock); - /* Use upper 10 bits of VA to index the first level map */ pge = pgd_offset_k(va); pue = pud_offset(pge, va); pme = pmd_offset(pue, va); /* Use middle 10 bits of VA to index the second-level map */ - pte = pte_alloc_kernel(&init_mm, pme, va); + pte = pte_alloc_kernel(pme, va); if (pte != 0) { err = 0; set_pte(pte, mk_pte_phys(pa & PAGE_MASK, prot)); } - spin_unlock(&init_mm.page_table_lock); return err; } diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index f379b8d67558..5d09de8d1c6b 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -28,7 +28,7 @@ static int ioremap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long pfn; pfn = phys_addr >> PAGE_SHIFT; - pte = pte_alloc_kernel(&init_mm, pmd, addr); + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { @@ -87,14 +87,12 @@ static int ioremap_page_range(unsigned long addr, flush_cache_all(); phys_addr -= addr; pgd = pgd_offset_k(addr); - spin_lock(&init_mm.page_table_lock); do { next = pgd_addr_end(addr, end); err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return err; } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 0063b2c50908..e3215ba64ffd 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -275,26 +275,21 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot) pgd = pgd_offset_k(address); /* note: this is NOT pgd_offset()! 
*/ - spin_lock(&init_mm.page_table_lock); { pud = pud_alloc(&init_mm, pgd, address); if (!pud) goto out; - pmd = pmd_alloc(&init_mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_map(&init_mm, pmd, address); + pte = pte_alloc_kernel(pmd, address); if (!pte) goto out; - if (!pte_none(*pte)) { - pte_unmap(pte); + if (!pte_none(*pte)) goto out; - } set_pte(pte, mk_pte(page, pgprot)); - pte_unmap(pte); } - out: spin_unlock(&init_mm.page_table_lock); + out: /* no need for flush_tlb */ return page; } diff --git a/arch/m32r/mm/ioremap.c b/arch/m32r/mm/ioremap.c index 70c59055c19c..a151849a605e 100644 --- a/arch/m32r/mm/ioremap.c +++ b/arch/m32r/mm/ioremap.c @@ -67,7 +67,7 @@ remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned long size, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -90,7 +90,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -104,7 +103,6 @@ remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/arch/m68k/mm/kmap.c b/arch/m68k/mm/kmap.c index 5dcb3fa35ea9..fe2383e36b06 100644 --- a/arch/m68k/mm/kmap.c +++ b/arch/m68k/mm/kmap.c @@ -201,7 +201,7 @@ void *__ioremap(unsigned long physaddr, unsigned long size, int cacheflag) virtaddr += PTRTREESIZE; size -= PTRTREESIZE; } else { - pte_dir = pte_alloc_kernel(&init_mm, pmd_dir, virtaddr); + pte_dir = pte_alloc_kernel(pmd_dir, virtaddr); if (!pte_dir) { printk("ioremap: no mem for pte_dir\n"); return NULL; diff --git a/arch/m68k/sun3x/dvma.c 
b/arch/m68k/sun3x/dvma.c index 32e55adfeb8e..117481e86305 100644 --- a/arch/m68k/sun3x/dvma.c +++ b/arch/m68k/sun3x/dvma.c @@ -116,7 +116,7 @@ inline int dvma_map_cpu(unsigned long kaddr, pte_t *pte; unsigned long end3; - if((pte = pte_alloc_kernel(&init_mm, pmd, vaddr)) == NULL) { + if((pte = pte_alloc_kernel(pmd, vaddr)) == NULL) { ret = -ENOMEM; goto out; } diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c index 9c44ca70befa..3101d1db5592 100644 --- a/arch/mips/mm/ioremap.c +++ b/arch/mips/mm/ioremap.c @@ -55,7 +55,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -77,7 +77,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pmd_t *pmd; @@ -96,7 +95,6 @@ static int remap_area_pages(unsigned long address, phys_t phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c index ae6213d71670..f94a02ef3d95 100644 --- a/arch/parisc/kernel/pci-dma.c +++ b/arch/parisc/kernel/pci-dma.c @@ -114,7 +114,7 @@ static inline int map_pmd_uncached(pmd_t * pmd, unsigned long vaddr, if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, vaddr); + pte_t * pte = pte_alloc_kernel(pmd, vaddr); if (!pte) return -ENOMEM; if (map_pte_uncached(pte, orig_vaddr, end - vaddr, paddr_ptr)) diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c index f2df502cdae3..5c7a1b3b9326 100644 --- a/arch/parisc/mm/ioremap.c +++ b/arch/parisc/mm/ioremap.c @@ -52,7 +52,7 @@ static 
inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(NULL, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -75,10 +75,9 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(dir, address); + pmd = pmd_alloc(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; @@ -89,7 +88,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 0f710d2baec6..685fd0defe23 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -335,8 +335,6 @@ static int __init dma_alloc_init(void) pte_t *pte; int ret = 0; - spin_lock(&init_mm.page_table_lock); - do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); @@ -347,7 +345,7 @@ static int __init dma_alloc_init(void) } WARN_ON(!pmd_none(*pmd)); - pte = pte_alloc_kernel(&init_mm, pmd, CONSISTENT_BASE); + pte = pte_alloc_kernel(pmd, CONSISTENT_BASE); if (!pte) { printk(KERN_ERR "%s: no pte tables\n", __func__); ret = -ENOMEM; @@ -357,8 +355,6 @@ static int __init dma_alloc_init(void) consistent_pte = pte; } while (0); - spin_unlock(&init_mm.page_table_lock); - return ret; } diff --git a/arch/ppc/mm/4xx_mmu.c b/arch/ppc/mm/4xx_mmu.c index b7bcbc232f39..4d006aa1a0d1 100644 --- a/arch/ppc/mm/4xx_mmu.c +++ b/arch/ppc/mm/4xx_mmu.c @@ -110,13 +110,11 @@ unsigned long __init mmu_mapin_ram(void) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_16M | _PAGE_HWEXEC | 
_PAGE_HWWRITE; - spin_lock(&init_mm.page_table_lock); pmdp = pmd_offset(pgd_offset_k(v), v); pmd_val(*pmdp++) = val; pmd_val(*pmdp++) = val; pmd_val(*pmdp++) = val; pmd_val(*pmdp++) = val; - spin_unlock(&init_mm.page_table_lock); v += LARGE_PAGE_SIZE_16M; p += LARGE_PAGE_SIZE_16M; @@ -127,10 +125,8 @@ unsigned long __init mmu_mapin_ram(void) pmd_t *pmdp; unsigned long val = p | _PMD_SIZE_4M | _PAGE_HWEXEC | _PAGE_HWWRITE; - spin_lock(&init_mm.page_table_lock); pmdp = pmd_offset(pgd_offset_k(v), v); pmd_val(*pmdp) = val; - spin_unlock(&init_mm.page_table_lock); v += LARGE_PAGE_SIZE_4M; p += LARGE_PAGE_SIZE_4M; diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index 43505b1fc5d8..6ea9185fd120 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c @@ -280,18 +280,16 @@ map_page(unsigned long va, phys_addr_t pa, int flags) pte_t *pg; int err = -ENOMEM; - spin_lock(&init_mm.page_table_lock); /* Use upper 10 bits of VA to index the first level map */ pd = pmd_offset(pgd_offset_k(va), va); /* Use middle 10 bits of VA to index the second-level map */ - pg = pte_alloc_kernel(&init_mm, pd, va); + pg = pte_alloc_kernel(pd, va); if (pg != 0) { err = 0; set_pte_at(&init_mm, va, pg, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); if (mem_init_done) flush_HPTE(0, va, pmd_val(*pd)); } - spin_unlock(&init_mm.page_table_lock); return err; } diff --git a/arch/ppc64/mm/imalloc.c b/arch/ppc64/mm/imalloc.c index c65b87b92756..f4ca29cf5364 100644 --- a/arch/ppc64/mm/imalloc.c +++ b/arch/ppc64/mm/imalloc.c @@ -300,12 +300,7 @@ void im_free(void * addr) for (p = &imlist ; (tmp = *p) ; p = &tmp->next) { if (tmp->addr == addr) { *p = tmp->next; - - /* XXX: do we need the lock? 
*/ - spin_lock(&init_mm.page_table_lock); unmap_vm_area(tmp); - spin_unlock(&init_mm.page_table_lock); - kfree(tmp); up(&imlist_sem); return; diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index be64b157afce..a45584b3440c 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -155,7 +155,6 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) unsigned long vsid; if (mem_init_done) { - spin_lock(&init_mm.page_table_lock); pgdp = pgd_offset_k(ea); pudp = pud_alloc(&init_mm, pgdp, ea); if (!pudp) @@ -163,12 +162,11 @@ static int map_io_page(unsigned long ea, unsigned long pa, int flags) pmdp = pmd_alloc(&init_mm, pudp, ea); if (!pmdp) return -ENOMEM; - ptep = pte_alloc_kernel(&init_mm, pmdp, ea); + ptep = pte_alloc_kernel(pmdp, ea); if (!ptep) return -ENOMEM; set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, __pgprot(flags))); - spin_unlock(&init_mm.page_table_lock); } else { unsigned long va, vpn, hash, hpteg; diff --git a/arch/s390/mm/ioremap.c b/arch/s390/mm/ioremap.c index c6c39d868bc8..0f6e9ecbefe2 100644 --- a/arch/s390/mm/ioremap.c +++ b/arch/s390/mm/ioremap.c @@ -58,7 +58,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -80,7 +80,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -94,7 +93,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return 0; } diff --git a/arch/sh/mm/ioremap.c 
b/arch/sh/mm/ioremap.c index 9f490c2742f0..e794e27a72f1 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -57,7 +57,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -79,7 +79,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; pmd = pmd_alloc(&init_mm, dir, address); @@ -93,7 +92,6 @@ int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/arch/sh64/mm/ioremap.c b/arch/sh64/mm/ioremap.c index f4003da556bc..fb1866fa2c9d 100644 --- a/arch/sh64/mm/ioremap.c +++ b/arch/sh64/mm/ioremap.c @@ -79,7 +79,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -101,7 +101,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd = pmd_alloc(&init_mm, dir, address); error = -ENOMEM; @@ -115,7 +114,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return 0; } diff --git a/arch/x86_64/mm/ioremap.c b/arch/x86_64/mm/ioremap.c index 
6972df480d2b..ecf7acb5db9b 100644 --- a/arch/x86_64/mm/ioremap.c +++ b/arch/x86_64/mm/ioremap.c @@ -60,7 +60,7 @@ static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, unsigned lo if (address >= end) BUG(); do { - pte_t * pte = pte_alloc_kernel(&init_mm, pmd, address); + pte_t * pte = pte_alloc_kernel(pmd, address); if (!pte) return -ENOMEM; remap_area_pte(pte, address, end - address, address + phys_addr, flags); @@ -105,7 +105,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, flush_cache_all(); if (address >= end) BUG(); - spin_lock(&init_mm.page_table_lock); do { pud_t *pud; pud = pud_alloc(&init_mm, pgd, address); @@ -119,7 +118,6 @@ static int remap_area_pages(unsigned long address, unsigned long phys_addr, address = (address + PGDIR_SIZE) & PGDIR_MASK; pgd++; } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); flush_tlb_all(); return error; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 89398032bc4b..b9fa82b96d9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -706,7 +706,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern int vmtruncate(struct inode * inode, loff_t offset); extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); diff --git a/mm/memory.c b/mm/memory.c 
index 692ad810263d..95a4553c75f7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -307,28 +307,22 @@ out: return pte_offset_map(pmd, address); } -pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +pte_t fastcall * pte_alloc_kernel(pmd_t *pmd, unsigned long address) { if (!pmd_present(*pmd)) { pte_t *new; - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one_kernel(mm, address); - spin_lock(&mm->page_table_lock); + new = pte_alloc_one_kernel(&init_mm, address); if (!new) return NULL; - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. - */ - if (pmd_present(*pmd)) { + spin_lock(&init_mm.page_table_lock); + if (pmd_present(*pmd)) pte_free_kernel(new); - goto out; - } - pmd_populate_kernel(mm, pmd, new); + else + pmd_populate_kernel(&init_mm, pmd, new); + spin_unlock(&init_mm.page_table_lock); } -out: return pte_offset_kernel(pmd, address); } @@ -2097,30 +2091,30 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, #ifndef __PAGETABLE_PUD_FOLDED /* * Allocate page upper directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. + * We've already handled the fast-path in-line. */ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { pud_t *new; - spin_unlock(&mm->page_table_lock); + if (mm != &init_mm) /* Temporary bridging hack */ + spin_unlock(&mm->page_table_lock); new = pud_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + if (mm != &init_mm) /* Temporary bridging hack */ + spin_lock(&mm->page_table_lock); return NULL; + } - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ + spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) { pud_free(new); goto out; } pgd_populate(mm, pgd, new); out: + if (mm == &init_mm) /* Temporary bridging hack */ + spin_unlock(&mm->page_table_lock); return pud_offset(pgd, address); } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -2128,24 +2122,22 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr #ifndef __PAGETABLE_PMD_FOLDED /* * Allocate page middle directory. - * - * We've already handled the fast-path in-line, and we own the - * page table lock. + * We've already handled the fast-path in-line. */ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { pmd_t *new; - spin_unlock(&mm->page_table_lock); + if (mm != &init_mm) /* Temporary bridging hack */ + spin_unlock(&mm->page_table_lock); new = pmd_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + if (mm != &init_mm) /* Temporary bridging hack */ + spin_lock(&mm->page_table_lock); return NULL; + } - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ + spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK if (pud_present(*pud)) { pmd_free(new); @@ -2161,6 +2153,8 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr #endif /* __ARCH_HAS_4LEVEL_HACK */ out: + if (mm == &init_mm) /* Temporary bridging hack */ + spin_unlock(&mm->page_table_lock); return pmd_offset(pud, address); } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5e9120598799..54a90e83cb31 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,7 +89,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, { pte_t *pte; - pte = pte_alloc_kernel(&init_mm, pmd, addr); + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { @@ -147,14 +147,12 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) BUG_ON(addr >= end); pgd = pgd_offset_k(addr); - spin_lock(&init_mm.page_table_lock); do { next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&init_mm.page_table_lock); flush_cache_vmap((unsigned long) area->addr, end); return err; } From 1bb3630e89cb8a7b3d3807629c20c5bad88290ff Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:22 -0700 Subject: [PATCH 48/98] [PATCH] mm: ptd_alloc inline and out It seems odd to me that, whereas pud_alloc and pmd_alloc test inline, only calling out-of-line __pud_alloc __pmd_alloc if allocation needed, pte_alloc_map and pte_alloc_kernel are entirely out-of-line. Though it does add a little to kernel size, change them to macros testing inline, calling __pte_alloc or __pte_alloc_kernel to allocate out-of-line. Mark none of them as fastcalls, leave that to CONFIG_REGPARM or not. It also seems more natural for the out-of-line functions to leave the offset calculation and map to the inline, which has to do it anyway for the common case. At least mremap move wants __pte_alloc without _map. 
Macros rather than inline functions, certainly to avoid the header file issues which arise from CONFIG_HIGHPTE needing kmap_types.h, but also in case any architectures I haven't built would have other such problems. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/4level-fixup.h | 11 +--- include/linux/mm.h | 38 ++++++------ mm/memory.c | 93 ++++++++++++------------------ mm/mremap.c | 7 +-- 4 files changed, 61 insertions(+), 88 deletions(-) diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index c20ec257ecc0..68c6fea994d9 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -10,14 +10,9 @@ #define pud_t pgd_t -#define pmd_alloc(mm, pud, address) \ -({ pmd_t *ret; \ - if (pgd_none(*pud)) \ - ret = __pmd_alloc(mm, pud, address); \ - else \ - ret = pmd_offset(pud, address); \ - ret; \ -}) +#define pmd_alloc(mm, pud, address) \ + ((unlikely(pgd_none(*(pud))) && __pmd_alloc(mm, pud, address))? 
\ + NULL: pmd_offset(pud, address)) #define pud_alloc(mm, pgd, address) (pgd) #define pud_offset(pgd, start) (pgd) diff --git a/include/linux/mm.h b/include/linux/mm.h index b9fa82b96d9e..22c2d6922c0e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -704,10 +704,6 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, } extern int vmtruncate(struct inode * inode, loff_t offset); -extern pud_t *FASTCALL(__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); -extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_kernel(pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -760,32 +756,36 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); -/* - * On a two-level or three-level page table, this ends up being trivial. Thus - * the inlining and the symmetry break with pte_alloc_map() that does all - * of this out-of-line. - */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); + /* * The following ifdef needed to get the 4level-fixup.h header to work. * Remove it when 4level-fixup.h has been removed. 
*/ -#ifdef CONFIG_MMU -#ifndef __ARCH_HAS_4LEVEL_HACK +#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK) static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - if (pgd_none(*pgd)) - return __pud_alloc(mm, pgd, address); - return pud_offset(pgd, address); + return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))? + NULL: pud_offset(pgd, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - if (pud_none(*pud)) - return __pmd_alloc(mm, pud, address); - return pmd_offset(pud, address); + return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? + NULL: pmd_offset(pud, address); } -#endif -#endif /* CONFIG_MMU */ +#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ + +#define pte_alloc_map(mm, pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map(pmd, address)) + +#define pte_alloc_kernel(pmd, address) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, diff --git a/mm/memory.c b/mm/memory.c index 95a4553c75f7..4bdd1186b43b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -280,50 +280,39 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, } } -pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - if (!pmd_present(*pmd)) { - struct page *new; + struct page *new; - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - spin_lock(&mm->page_table_lock); - if (!new) - return NULL; - /* - * Because we dropped the lock, we should re-check the - * entry, as somebody else could have populated it.. 
- */ - if (pmd_present(*pmd)) { - pte_free(new); - goto out; - } + spin_unlock(&mm->page_table_lock); + new = pte_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!new) + return -ENOMEM; + + if (pmd_present(*pmd)) /* Another has populated it */ + pte_free(new); + else { mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } -out: - return pte_offset_map(pmd, address); + return 0; } -pte_t fastcall * pte_alloc_kernel(pmd_t *pmd, unsigned long address) +int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { - if (!pmd_present(*pmd)) { - pte_t *new; + pte_t *new = pte_alloc_one_kernel(&init_mm, address); + if (!new) + return -ENOMEM; - new = pte_alloc_one_kernel(&init_mm, address); - if (!new) - return NULL; - - spin_lock(&init_mm.page_table_lock); - if (pmd_present(*pmd)) - pte_free_kernel(new); - else - pmd_populate_kernel(&init_mm, pmd, new); - spin_unlock(&init_mm.page_table_lock); - } - return pte_offset_kernel(pmd, address); + spin_lock(&init_mm.page_table_lock); + if (pmd_present(*pmd)) /* Another has populated it */ + pte_free_kernel(new); + else + pmd_populate_kernel(&init_mm, pmd, new); + spin_unlock(&init_mm.page_table_lock); + return 0; } static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) @@ -2093,7 +2082,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Allocate page upper directory. * We've already handled the fast-path in-line. 
*/ -pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { pud_t *new; @@ -2103,19 +2092,17 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr if (!new) { if (mm != &init_mm) /* Temporary bridging hack */ spin_lock(&mm->page_table_lock); - return NULL; + return -ENOMEM; } spin_lock(&mm->page_table_lock); - if (pgd_present(*pgd)) { + if (pgd_present(*pgd)) /* Another has populated it */ pud_free(new); - goto out; - } - pgd_populate(mm, pgd, new); - out: + else + pgd_populate(mm, pgd, new); if (mm == &init_mm) /* Temporary bridging hack */ spin_unlock(&mm->page_table_lock); - return pud_offset(pgd, address); + return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -2124,7 +2111,7 @@ pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr * Allocate page middle directory. * We've already handled the fast-path in-line. */ -pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { pmd_t *new; @@ -2134,28 +2121,24 @@ pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr if (!new) { if (mm != &init_mm) /* Temporary bridging hack */ spin_lock(&mm->page_table_lock); - return NULL; + return -ENOMEM; } spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK - if (pud_present(*pud)) { + if (pud_present(*pud)) /* Another has populated it */ pmd_free(new); - goto out; - } - pud_populate(mm, pud, new); + else + pud_populate(mm, pud, new); #else - if (pgd_present(*pud)) { + if (pgd_present(*pud)) /* Another has populated it */ pmd_free(new); - goto out; - } - pgd_populate(mm, pud, new); + else + pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - - out: if (mm == &init_mm) /* Temporary bridging hack */ spin_unlock(&mm->page_table_lock); - return pmd_offset(pud, address); + 
return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/mremap.c b/mm/mremap.c index ccf456477020..616facc3d28a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -51,7 +51,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) pgd_t *pgd; pud_t *pud; pmd_t *pmd = NULL; - pte_t *pte; /* * We do need page_table_lock: because allocators expect that. @@ -66,12 +65,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) goto out; - pte = pte_alloc_map(mm, pmd, addr); - if (!pte) { + if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) pmd = NULL; - goto out; - } - pte_unmap(pte); out: spin_unlock(&mm->page_table_lock); return pmd; From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:23 -0700 Subject: [PATCH 49/98] [PATCH] mm: ptd_alloc take ptlock Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated. Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself. These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 +++--- include/linux/mm.h | 18 ++++++++ kernel/fork.c | 2 - mm/fremap.c | 48 ++++++++------------- mm/hugetlb.c | 12 ++++-- mm/memory.c | 104 ++++++++++++++------------------------------- mm/mremap.c | 27 ++++-------- 7 files changed, 90 insertions(+), 135 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 9bb55c8cf224..ba73797eb4cb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma, pud_t * pud; pmd_t * pmd; pte_t * pte; + spinlock_t *ptl; if (unlikely(anon_vma_prepare(vma))) - goto out_sig; + goto out; flush_dcache_page(page); pgd = pgd_offset(mm, address); - - spin_lock(&mm->page_table_lock); pud = pud_alloc(mm, pgd, address); if (!pud) goto out; pmd = pmd_alloc(mm, pud, address); if (!pmd) goto out; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map_lock(mm, pmd, address, &ptl); if (!pte) goto out; if (!pte_none(*pte)) { - pte_unmap(pte); + pte_unmap_unlock(pte, ptl); goto out; } inc_mm_counter(mm, anon_rss); @@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma, set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); /* no need for flush_tlb */ return; out: - spin_unlock(&mm->page_table_lock); -out_sig: __free_page(page); force_sig(SIGKILL, current); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 22c2d6922c0e..d4c3512e7db4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#define pte_offset_map_lock(mm, pmd, address, ptlp) \ +({ \ + spinlock_t *__ptl = &(mm)->page_table_lock; \ + pte_t *__pte = pte_offset_map(pmd, address); \ + *(ptlp) 
= __ptl; \ + spin_lock(__ptl); \ + __pte; \ +}) + +#define pte_unmap_unlock(pte, ptl) do { \ + spin_unlock(ptl); \ + pte_unmap(pte); \ +} while (0) + #define pte_alloc_map(mm, pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map_lock(mm, pmd, address, ptlp) \ + ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) + #define pte_alloc_kernel(pmd, address) \ ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) diff --git a/kernel/fork.c b/kernel/fork.c index 2a587b3224e3..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) /* * Link in the new vma and copy the page table entries. */ - spin_lock(&mm->page_table_lock); *pprev = tmp; pprev = &tmp->vm_next; @@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->map_count++; retval = copy_page_range(mm, oldmm, tmp); - spin_unlock(&mm->page_table_lock); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/fremap.c b/mm/fremap.c index 49719a35769a..d862be3bc3e3 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; /* * This page may have been truncated. 
Tell the @@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, inode = vma->vm_file->f_mapping->host; size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (!page->mapping || page->index >= size) - goto err_unlock; + goto unlock; err = -ENOMEM; if (page_mapcount(page) > INT_MAX/2) - goto err_unlock; + goto unlock; if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) inc_mm_counter(mm, file_rss); @@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, mk_pte(page, prot)); page_add_file_rmap(page); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - err = 0; -err_unlock: - spin_unlock(&mm->page_table_lock); +unlock: + pte_unmap_unlock(pte, ptl); +out: return err; } EXPORT_SYMBOL(install_page); - /* * Install a file pte to a given virtual memory address, release any * previously existing mapping. @@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud; pgd_t *pgd; pte_t pte_val; + spinlock_t *ptl; BUG_ON(vma->vm_flags & VM_RESERVED); pgd = pgd_offset(mm, addr); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, addr); if (!pud) - goto err_unlock; - + goto out; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto err_unlock; - - pte = pte_alloc_map(mm, pmd, addr); + goto out; + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) - goto err_unlock; + goto out; if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { update_hiwater_rss(mm); @@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); pte_val = *pte; - pte_unmap(pte); update_mmu_cache(vma, addr, pte_val); - spin_unlock(&mm->page_table_lock); - return 0; - -err_unlock: - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); + err = 0; +out: return err; } - /*** * sys_remap_file_pages - remap arbitrary pages of a shared backing 
store * file within an existing vma. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ac5f044bf514..ea0826ff2663 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, unsigned long addr; for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + src_pte = huge_pte_offset(src, addr); + if (!src_pte) + continue; dst_pte = huge_pte_alloc(dst, addr); if (!dst_pte) goto nomem; + spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); - src_pte = huge_pte_offset(src, addr); - if (src_pte && !pte_none(*src_pte)) { + if (!pte_none(*src_pte)) { entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); } return 0; @@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) hugetlb_prefault_arch_hook(mm); - spin_lock(&mm->page_table_lock); for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { unsigned long idx; pte_t *pte = huge_pte_alloc(mm, addr); @@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) goto out; } } + spin_lock(&mm->page_table_lock); add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); + spin_unlock(&mm->page_table_lock); } out: - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 4bdd1186b43b..a40e4b1cee4f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { - struct page *new; - - spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); - 
spin_lock(&mm->page_table_lock); + struct page *new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; + spin_lock(&mm->page_table_lock); if (pmd_present(*pmd)) /* Another has populated it */ pte_free(new); else { @@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); } + spin_unlock(&mm->page_table_lock); return 0; } @@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. - * - * dst->page_table_lock is held on entry and exit, - * but may be dropped within p[mg]d_alloc() and pte_alloc_map(). */ static inline void @@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; + spinlock_t *src_ptl, *dst_ptl; int progress = 0; int rss[2]; again: rss[1] = rss[0] = 0; - dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); + src_ptl = &src_mm->page_table_lock; + spin_lock(src_ptl); - spin_lock(&src_mm->page_table_lock); do { /* * We are holding two locks at this point - either of them @@ -438,8 +435,8 @@ again: if (progress >= 32) { progress = 0; if (need_resched() || - need_lockbreak(&src_mm->page_table_lock) || - need_lockbreak(&dst_mm->page_table_lock)) + need_lockbreak(src_ptl) || + need_lockbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { @@ -449,12 +446,12 @@ again: copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); - spin_unlock(&src_mm->page_table_lock); + spin_unlock(src_ptl); pte_unmap_nested(src_pte - 1); - 
pte_unmap(dst_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); - cond_resched_lock(&dst_mm->page_table_lock); + pte_unmap_unlock(dst_pte - 1, dst_ptl); + cond_resched(); if (addr != end) goto again; return 0; @@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, zero_pte); } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = zeromap_pud_range(mm, pgd, addr, next, prot); if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } @@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long pfn, pgprot_t prot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; do { @@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); return 0; } @@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); err = remap_pud_range(mm, pgd, addr, next, @@ -1218,7 +1214,6 @@ int 
remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, * with external mmu caches can use to update those (ie the Sparc or * PowerPC hashed page tables that act as extended TLBs). * - * Note the "page_table_lock". It is to protect against kswapd removing - * pages from under us. Note that kswapd only ever _removes_ pages, never - * adds them. As such, once we have noticed that the page is not present, - * we can drop the lock early. - * - * The adding of pages is protected by the MM semaphore (which we hold), - * so we don't need to worry about a page being suddenly been added into - * our VM. - * - * We enter with the pagetable spinlock held, we are supposed to - * release it when done. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, { pte_t entry; + spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { @@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, write_access); - /* - * We need the page table lock to synchronize with kswapd - * and the SMP-safe atomic PTE updates. 
- */ pgd = pgd_offset(mm, address); - spin_lock(&mm->page_table_lock); - pud = pud_alloc(mm, pgd, address); if (!pud) - goto oom; - + return VM_FAULT_OOM; pmd = pmd_alloc(mm, pud, address); if (!pmd) - goto oom; - + return VM_FAULT_OOM; pte = pte_alloc_map(mm, pmd, address); if (!pte) - goto oom; - - return handle_pte_fault(mm, vma, address, pte, pmd, write_access); + return VM_FAULT_OOM; - oom: - spin_unlock(&mm->page_table_lock); - return VM_FAULT_OOM; + return handle_pte_fault(mm, vma, address, pte, pmd, write_access); } #ifndef __PAGETABLE_PUD_FOLDED @@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { - pud_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pud_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pud_t *new = pud_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ pud_free(new); else pgd_populate(mm, pgd, new); - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PUD_FOLDED */ @@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { - pmd_t *new; - - if (mm != &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); - if (!new) { - if (mm != &init_mm) /* Temporary bridging hack */ - spin_lock(&mm->page_table_lock); + pmd_t *new = pmd_alloc_one(mm, address); + if (!new) return -ENOMEM; - } spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_4LEVEL_HACK @@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned 
long address) else pgd_populate(mm, pud, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - if (mm == &init_mm) /* Temporary bridging hack */ - spin_unlock(&mm->page_table_lock); + spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/mremap.c b/mm/mremap.c index 616facc3d28a..8de77b632a20 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) pud_t *pud; pmd_t *pmd; - /* - * We don't need page_table_lock: we have mmap_sem exclusively. - */ pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; @@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; pud_t *pud; - pmd_t *pmd = NULL; + pmd_t *pmd; - /* - * We do need page_table_lock: because allocators expect that. - */ - spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) - goto out; + return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) - goto out; + return NULL; if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) - pmd = NULL; -out: - spin_unlock(&mm->page_table_lock); + return NULL; + return pmd; } @@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; + spinlock_t *old_ptl; if (vma->vm_file) { /* @@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } - spin_lock(&mm->page_table_lock); - old_pte = pte_offset_map(old_pmd, old_addr); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); + new_pte = pte_offset_map_nested(new_pmd, new_addr); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, } 
pte_unmap_nested(new_pte - 1); - pte_unmap(old_pte - 1); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); } From b462705ac679f6195d1b23a752cda592d9107495 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:24 -0700 Subject: [PATCH 50/98] [PATCH] mm: arches skip ptlock Convert those few architectures which are calling pud_alloc, pmd_alloc, pte_alloc_map on a user mm, not to take the page_table_lock first, nor drop it after. Each of these can continue to use pte_alloc_map, no need to change over to pte_alloc_map_lock, they're neither racy nor swappable. In the sparc64 io_remap_pfn_range, flush_tlb_range then falls outside of the page_table_lock: that's okay, on sparc64 it's like flush_tlb_mm, and that has always been called from outside of page_table_lock in dup_mmap. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mm-armv.c | 14 -------------- arch/arm26/mm/memc.c | 15 --------------- arch/sparc/mm/generic.c | 4 +--- arch/sparc64/mm/generic.c | 6 ++---- arch/um/kernel/skas/mmu.c | 3 --- 5 files changed, 3 insertions(+), 39 deletions(-) diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 61bc2fa0511e..60f3e039bac2 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -179,11 +179,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t)); if (!vectors_high()) { - /* - * This lock is here just to satisfy pmd_alloc and pte_lock - */ - spin_lock(&mm->page_table_lock); - /* * On ARM, first page must always be allocated since it * contains the machine vectors. 
@@ -201,23 +196,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); - - spin_unlock(&mm->page_table_lock); } return new_pgd; no_pte: - spin_unlock(&mm->page_table_lock); pmd_free(new_pmd); - free_pages((unsigned long)new_pgd, 2); - return NULL; - no_pmd: - spin_unlock(&mm->page_table_lock); free_pages((unsigned long)new_pgd, 2); - return NULL; - no_pgd: return NULL; } diff --git a/arch/arm26/mm/memc.c b/arch/arm26/mm/memc.c index d6b008b8db76..34def6397c3c 100644 --- a/arch/arm26/mm/memc.c +++ b/arch/arm26/mm/memc.c @@ -78,12 +78,6 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) if (!new_pgd) goto no_pgd; - /* - * This lock is here just to satisfy pmd_alloc and pte_lock - * FIXME: I bet we could avoid taking it pretty much altogether - */ - spin_lock(&mm->page_table_lock); - /* * On ARM, first page must always be allocated since it contains * the machine vectors. @@ -113,23 +107,14 @@ pgd_t *get_pgd_slow(struct mm_struct *mm) memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR, (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t)); - spin_unlock(&mm->page_table_lock); - /* update MEMC tables */ cpu_memc_update_all(new_pgd); return new_pgd; no_pte: - spin_unlock(&mm->page_table_lock); pmd_free(new_pmd); - free_pgd_slow(new_pgd); - return NULL; - no_pmd: - spin_unlock(&mm->page_table_lock); free_pgd_slow(new_pgd); - return NULL; - no_pgd: return NULL; } diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 659c9a71f867..9604893ffdbd 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c @@ -81,9 +81,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc(mm, dir, from); error = -ENOMEM; if (!pmd) break; @@ -93,7 +92,6 @@ int 
io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } - spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, beg, end); return error; diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index afc01cec701f..112c316e7cd2 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -135,9 +135,8 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); - spin_lock(&mm->page_table_lock); while (from < end) { - pud_t *pud = pud_alloc(current->mm, dir, from); + pud_t *pud = pud_alloc(mm, dir, from); error = -ENOMEM; if (!pud) break; @@ -147,8 +146,7 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } - flush_tlb_range(vma, beg, end); - spin_unlock(&mm->page_table_lock); + flush_tlb_range(vma, beg, end); return error; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 240143b616a2..02cf36e0331a 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -28,7 +28,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, pmd_t *pmd; pte_t *pte; - spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, proc); pud = pud_alloc(mm, pgd, proc); if (!pud) @@ -63,7 +62,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, *pte = mk_pte(virt_to_page(kernel), __pgprot(_PAGE_PRESENT)); *pte = pte_mkexec(*pte); *pte = pte_wrprotect(*pte); - spin_unlock(&mm->page_table_lock); return(0); out_pmd: @@ -71,7 +69,6 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, out_pte: pmd_free(pmd); out: - spin_unlock(&mm->page_table_lock); return(-ENOMEM); } From 8f4e2101fd7df9031a754eedb82e2060b51f8c45 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:26 -0700 Subject: [PATCH 51/98] [PATCH] mm: page fault handler locking On the page fault path, the 
patch before last pushed acquiring the page_table_lock down to the head of handle_pte_fault (though it's also taken and dropped earlier when a new page table has to be allocated). Now delete that line, read "entry = *pte" without it, and go off to this or that page fault handler on the basis of this unlocked peek. Usually the handler can proceed without the lock, relying on the subsequent locked pte_same or pte_none test to back out when necessary; though do_wp_page needs the lock immediately, and do_file_page doesn't check (if there's a race, install_page just zaps the entry and reinstalls it). But on those architectures (notably i386 with PAE) whose pte is too big to be read atomically, if SMP or preemption is enabled, do_swap_page and do_file_page might cause irretrievable damage if passed a Frankenstein entry stitched together from unrelated parts. In those configs, "pte_unmap_same" has to take page_table_lock, validate orig_pte still the same, and drop page_table_lock before unmapping, before proceeding. Use pte_offset_map_lock and pte_unmap_unlock throughout the handlers; but lock avoidance leaves more lone maps and unmaps than elsewhere. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 150 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 90 insertions(+), 60 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a40e4b1cee4f..24ba688876d6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1218,6 +1218,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(remap_pfn_range); +/* + * handle_pte_fault chooses page fault handler according to an entry + * which was read non-atomically. Before making any commitment, on + * those architectures or configurations (e.g. 
i386 with PAE) which + * might give a mix of unmatched parts, do_swap_page and do_file_page + * must check under lock before unmapping the pte and proceeding + * (but do_wp_page is only called after already making such a check; + * and do_anonymous_page and do_no_page can safely check later on). + */ +static inline int pte_unmap_same(struct mm_struct *mm, + pte_t *page_table, pte_t orig_pte) +{ + int same = 1; +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + if (sizeof(pte_t) > sizeof(unsigned long)) { + spin_lock(&mm->page_table_lock); + same = pte_same(*page_table, orig_pte); + spin_unlock(&mm->page_table_lock); + } +#endif + pte_unmap(page_table); + return same; +} + /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want @@ -1245,12 +1269,13 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * - * We hold the mm semaphore and the page_table_lock on entry and exit - * with the page_table_lock released. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), with pte both mapped and locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, - pte_t orig_pte) + spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(orig_pte); @@ -1288,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Ok, we need to copy. Oh, well.. 
*/ page_cache_get(old_page); - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -1307,8 +1331,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, /* * Re-check the pte - we dropped the lock */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { page_remove_rmap(old_page); if (!PageAnon(old_page)) { @@ -1321,7 +1344,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); @@ -1332,8 +1354,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(new_page); page_cache_release(old_page); unlock: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); return ret; oom: page_cache_release(old_page); @@ -1660,20 +1681,22 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc } /* - * We hold the mm semaphore and the page_table_lock on entry and - * should release the pagetable lock on exit.. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access, pte_t orig_pte) { + spinlock_t *ptl; struct page *page; swp_entry_t entry; pte_t pte; int ret = VM_FAULT_MINOR; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + if (!pte_unmap_same(mm, page_table, orig_pte)) + goto out; entry = pte_to_swp_entry(orig_pte); page = lookup_swap_cache(entry); @@ -1682,11 +1705,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page = read_swap_cache_async(entry, vma, address); if (!page) { /* - * Back out if somebody else faulted in this pte while - * we released the page table lock. + * Back out if somebody else faulted in this pte + * while we released the pte lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; goto unlock; @@ -1702,11 +1724,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, lock_page(page); /* - * Back out if somebody else faulted in this pte while we - * released the page table lock. + * Back out if somebody else already faulted in this pte. 
*/ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*page_table, orig_pte))) goto out_nomap; @@ -1735,7 +1755,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access) { if (do_wp_page(mm, vma, address, - page_table, pmd, pte) == VM_FAULT_OOM) + page_table, pmd, ptl, pte) == VM_FAULT_OOM) ret = VM_FAULT_OOM; goto out; } @@ -1744,37 +1764,32 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, update_mmu_cache(vma, address, pte); lazy_mmu_prot_update(pte); unlock: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); out: return ret; out_nomap: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); unlock_page(page); page_cache_release(page); return ret; } /* - * We are called with the MM semaphore and page_table_lock - * spinlock held to protect against concurrent faults in - * multithreaded programs. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. */ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access) { - struct page *page = ZERO_PAGE(addr); + struct page *page; + spinlock_t *ptl; pte_t entry; - /* Mapping of ZERO_PAGE - vm_page_prot is readonly */ - entry = mk_pte(page, vma->vm_page_prot); - if (write_access) { /* Allocate our own private page. 
*/ pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); if (unlikely(anon_vma_prepare(vma))) goto oom; @@ -1782,23 +1797,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!page) goto oom; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); - - if (!pte_none(*page_table)) { - page_cache_release(page); - goto unlock; - } - inc_mm_counter(mm, anon_rss); entry = mk_pte(page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto release; + inc_mm_counter(mm, anon_rss); lru_cache_add_active(page); SetPageReferenced(page); page_add_anon_rmap(page, vma, address); } else { + /* Map the ZERO_PAGE - vm_page_prot is readonly */ + page = ZERO_PAGE(address); + page_cache_get(page); + entry = mk_pte(page, vma->vm_page_prot); + + ptl = &mm->page_table_lock; + spin_lock(ptl); + if (!pte_none(*page_table)) + goto release; inc_mm_counter(mm, file_rss); page_add_file_rmap(page); - page_cache_get(page); } set_pte_at(mm, address, page_table, entry); @@ -1807,9 +1827,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); unlock: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); return VM_FAULT_MINOR; +release: + page_cache_release(page); + goto unlock; oom: return VM_FAULT_OOM; } @@ -1823,13 +1845,15 @@ oom: * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * - * This is called with the MM semaphore held and the page table - * spinlock held. Exit with the spinlock released. + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, int write_access) { + spinlock_t *ptl; struct page *new_page; struct address_space *mapping = NULL; pte_t entry; @@ -1838,7 +1862,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -1878,21 +1901,20 @@ retry: anon = 1; } - spin_lock(&mm->page_table_lock); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* * For a file-backed vma, someone could have truncated or otherwise * invalidated this page. If unmap_mapping_range got called, * retry getting the page. */ if (mapping && unlikely(sequence != mapping->truncate_count)) { - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); page_cache_release(new_page); cond_resched(); sequence = mapping->truncate_count; smp_rmb(); goto retry; } - page_table = pte_offset_map(pmd, address); /* * This silly early PAGE_DIRTY setting removes a race @@ -1929,8 +1951,7 @@ retry: update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); unlock: - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(page_table, ptl); return ret; oom: page_cache_release(new_page); @@ -1941,6 +1962,10 @@ oom: * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable * nonlinear vmas. + * + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults), and pte mapped but not yet locked. + * We return with mmap_sem still held, but pte unmapped and unlocked. 
*/ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -1949,8 +1974,8 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff; int err; - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); + if (!pte_unmap_same(mm, page_table, orig_pte)) + return VM_FAULT_MINOR; if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { /* @@ -1989,8 +2014,8 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; + spinlock_t *ptl; - spin_lock(&mm->page_table_lock); entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { @@ -2007,17 +2032,22 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access, entry); } + ptl = &mm->page_table_lock; + spin_lock(ptl); + if (unlikely(!pte_same(*pte, entry))) + goto unlock; if (write_access) { if (!pte_write(entry)) - return do_wp_page(mm, vma, address, pte, pmd, entry); + return do_wp_page(mm, vma, address, + pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); ptep_set_access_flags(vma, address, pte, entry, write_access); update_mmu_cache(vma, address, entry); lazy_mmu_prot_update(entry); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); +unlock: + pte_unmap_unlock(pte, ptl); return VM_FAULT_MINOR; } From 705e87c0c3c38424f7f30556c85bc20e808d2f59 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:27 -0700 Subject: [PATCH 52/98] [PATCH] mm: pte_offset_map_lock loops Convert those common loops using page_table_lock on the outside and pte_offset_map within to use just pte_offset_map_lock within instead. These all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them. But whereas pte_alloc loops tested with the "atomic" pmd_present, these loops are testing with pmd_none, which on i386 PAE tests both lower and upper halves. 
That's now unsafe, so add a cast into pmd_none to test only the vital lower half: we lose a little sensitivity to a corrupt middle directory, but not enough to worry about. It appears that i386 and UML were the only architectures vulnerable in this way, and pgd and pud no problem. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 17 ++++++----------- include/asm-i386/pgtable.h | 3 ++- include/asm-um/pgtable.h | 2 +- mm/mempolicy.c | 7 +++---- mm/mprotect.c | 7 +++---- mm/msync.c | 21 ++++++--------------- mm/swapfile.c | 20 +++++++++----------- 7 files changed, 30 insertions(+), 47 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7c89b4549049..7e5e7ec2e36d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -203,13 +203,14 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, struct mem_size_stats *mss) { pte_t *pte, ptent; + spinlock_t *ptl; unsigned long pfn; struct page *page; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { ptent = *pte; - if (pte_none(ptent) || !pte_present(ptent)) + if (!pte_present(ptent)) continue; mss->resident += PAGE_SIZE; @@ -230,8 +231,8 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, mss->private_clean += PAGE_SIZE; } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); - cond_resched_lock(&vma->vm_mm->page_table_lock); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); } static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -285,17 +286,11 @@ static inline void smaps_pgd_range(struct vm_area_struct *vma, static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; - struct mm_struct *mm = vma->vm_mm; struct mem_size_stats mss; memset(&mss, 0, sizeof mss); - - if (mm) { - spin_lock(&mm->page_table_lock); + if (vma->vm_mm) smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); - 
spin_unlock(&mm->page_table_lock); - } - return show_map_internal(m, v, &mss); } diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index d101ac414f07..0e3ec809352d 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -203,7 +203,8 @@ extern unsigned long pg0[]; #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) -#define pmd_none(x) (!pmd_val(x)) +/* To avoid harmful races, pmd_none(x) should check only the lower when PAE */ +#define pmd_none(x) (!(unsigned long)pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) diff --git a/include/asm-um/pgtable.h b/include/asm-um/pgtable.h index 616d02b57ea9..ac64eb955868 100644 --- a/include/asm-um/pgtable.h +++ b/include/asm-um/pgtable.h @@ -138,7 +138,7 @@ extern unsigned long pg0[1024]; #define pte_clear(mm,addr,xp) pte_set_val(*(xp), (phys_t) 0, __pgprot(_PAGE_NEWPAGE)) -#define pmd_none(x) (!(pmd_val(x) & ~_PAGE_NEWPAGE)) +#define pmd_none(x) (!((unsigned long)pmd_val(x) & ~_PAGE_NEWPAGE)) #define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { pmd_val(*(xp)) = _PAGE_NEWPAGE; } while (0) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11d824f282f1..902d4c9eccdc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -228,9 +228,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, { pte_t *orig_pte; pte_t *pte; + spinlock_t *ptl; - spin_lock(&vma->vm_mm->page_table_lock); - orig_pte = pte = pte_offset_map(pmd, addr); + orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { unsigned long pfn; unsigned int nid; @@ -246,8 +246,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 
if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(orig_pte); - spin_unlock(&vma->vm_mm->page_table_lock); + pte_unmap_unlock(orig_pte, ptl); return addr != end; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 672a76fddd5e..17a2b52b753b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,8 +29,9 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { pte_t *pte; + spinlock_t *ptl; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { if (pte_present(*pte)) { pte_t ptent; @@ -44,7 +45,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, lazy_mmu_prot_update(ptent); } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); } static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, @@ -88,7 +89,6 @@ static void change_protection(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -96,7 +96,6 @@ static void change_protection(struct vm_area_struct *vma, change_pud_range(mm, pgd, addr, next, newprot); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); - spin_unlock(&mm->page_table_lock); } static int diff --git a/mm/msync.c b/mm/msync.c index 860395486060..0e040e9c39d8 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -17,28 +17,22 @@ #include #include -/* - * Called with mm->page_table_lock held to protect against other - * threads/the swapper from ripping pte's out from under us. 
- */ - static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; pte_t *pte; + spinlock_t *ptl; int progress = 0; again: - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { unsigned long pfn; struct page *page; if (progress >= 64) { progress = 0; - if (need_resched() || - need_lockbreak(&mm->page_table_lock)) + if (need_resched() || need_lockbreak(ptl)) break; } progress++; @@ -58,8 +52,8 @@ again: set_page_dirty(page); progress += 3; } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); - cond_resched_lock(&mm->page_table_lock); + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); if (addr != end) goto again; } @@ -97,7 +91,6 @@ static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static void msync_page_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { - struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; unsigned long next; @@ -110,16 +103,14 @@ static void msync_page_range(struct vm_area_struct *vma, return; BUG_ON(addr >= end); - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(vma->vm_mm, addr); flush_cache_range(vma, addr, end); - spin_lock(&mm->page_table_lock); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; msync_pud_range(vma, pgd, addr, next); } while (pgd++, addr = next, addr != end); - spin_unlock(&mm->page_table_lock); } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 296e0bbf7836..510f0039b000 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -401,8 +401,6 @@ void free_swap_and_cache(swp_entry_t entry) * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. - * - * vma->vm_mm->page_table_lock is held. 
*/ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, swp_entry_t entry, struct page *page) @@ -424,23 +422,25 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { - pte_t *pte; pte_t swp_pte = swp_entry_to_pte(entry); + pte_t *pte; + spinlock_t *ptl; + int found = 0; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { /* * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ if (unlikely(pte_same(*pte, swp_pte))) { - unuse_pte(vma, pte, addr, entry, page); - pte_unmap(pte); - return 1; + unuse_pte(vma, pte++, addr, entry, page); + found = 1; + break; } } while (pte++, addr += PAGE_SIZE, addr != end); - pte_unmap(pte - 1); - return 0; + pte_unmap_unlock(pte - 1, ptl); + return found; } static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, @@ -522,12 +522,10 @@ static int unuse_mm(struct mm_struct *mm, down_read(&mm->mmap_sem); lock_page(page); } - spin_lock(&mm->page_table_lock); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && unuse_vma(vma, entry, page)) break; } - spin_unlock(&mm->page_table_lock); up_read(&mm->mmap_sem); /* * Currently unuse_mm cannot fail, but leave error handling From 663b97f7efd001b0c56bd5fce059c5272725b86f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:28 -0700 Subject: [PATCH 53/98] [PATCH] mm: flush_tlb_range outside ptlock There was one small but very significant change in the previous patch: mprotect's flush_tlb_range fell outside the page_table_lock: as it is in 2.4, but that doesn't prove it safe in 2.6. On some architectures flush_tlb_range comes to the same as flush_tlb_mm, which has always been called from outside page_table_lock in dup_mmap, and is so proved safe. 
Others required a deeper audit: I could find no reliance on page_table_lock in any; but in ia64 and parisc found some code which looks a bit as if it might want preemption disabled. That won't do any actual harm, so pending a decision from the maintainers, disable preemption there. Remove comments on page_table_lock from flush_tlb_mm, flush_tlb_range and flush_tlb_page entries in cachetlb.txt: they were rather misleading (what generic code does is different from what usually happens), the rules are now changing, and it's not yet clear where we'll end up (will the generic tlb_flush_mmu happen always under lock? never under lock? or sometimes under and sometimes not?). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cachetlb.txt | 9 --------- arch/ia64/mm/tlb.c | 2 ++ include/asm-parisc/tlbflush.h | 3 ++- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt index e132fb1163b0..7eb715e07eda 100644 --- a/Documentation/cachetlb.txt +++ b/Documentation/cachetlb.txt @@ -49,9 +49,6 @@ changes occur: page table operations such as what happens during fork, and exec. - Platform developers note that generic code will always - invoke this interface without mm->page_table_lock held. - 3) void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -72,9 +69,6 @@ changes occur: call flush_tlb_page (see below) for each entry which may be modified. - Platform developers note that generic code will always - invoke this interface with mm->page_table_lock held. - 4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) This time we need to remove the PAGE_SIZE sized translation @@ -93,9 +87,6 @@ changes occur: This is used primarily during fault processing. - Platform developers note that generic code will always - invoke this interface with mm->page_table_lock held. 
- 5) void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index c93e0f2b5fea..c79a9b96d02b 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -158,10 +158,12 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long # ifdef CONFIG_SMP platform_global_tlb_purge(mm, start, end, nbits); # else + preempt_disable(); do { ia64_ptcl(start, (nbits<<2)); start += (1UL << nbits); } while (start < end); + preempt_enable(); # endif ia64_srlz_i(); /* srlz.i implies srlz.d */ diff --git a/include/asm-parisc/tlbflush.h b/include/asm-parisc/tlbflush.h index 84af4ab1fe51..e97aa8d1eff5 100644 --- a/include/asm-parisc/tlbflush.h +++ b/include/asm-parisc/tlbflush.h @@ -88,7 +88,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, if (npages >= 512) /* 2MB of space: arbitrary, should be tuned */ flush_tlb_all(); else { - + preempt_disable(); mtsp(vma->vm_mm->context,1); purge_tlb_start(); if (split_tlb) { @@ -102,6 +102,7 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, pdtlb(start); start += PAGE_SIZE; } + preempt_enable(); } purge_tlb_end(); } From 8f4f8c164cb4af1432cc25eda82928ea4519ba72 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:29 -0700 Subject: [PATCH 54/98] [PATCH] mm: unlink vma before pagetables In most places the descent from pgd to pud to pmd to pte holds mmap_sem (exclusively or not), which ensures that free_pgtables cannot be freeing page tables from any level at the same time. But truncation and reverse mapping descend without mmap_sem. No problem: just make sure that a vma is unlinked from its prio_tree (or nonlinear list) and from its anon_vma list, after zapping the vma, but before freeing its page tables. Then neither vmtruncate nor rmap can reach that vma whose page tables are now volatile (nor do they need to reach it, since all its page entries have been zapped by this stage). 
The i_mmap_lock and anon_vma->lock already serialize this correctly; but the locking hierarchy is such that we cannot take them while holding page_table_lock. Well, we're trying to push that down anyway. So in this patch, move anon_vma_unlink and unlink_file_vma into free_pgtables, at the same time as moving page_table_lock around calls to unmap_vmas. tlb_gather_mmu and tlb_finish_mmu then fall outside the page_table_lock, but we made them preempt_disable and preempt_enable earlier; and a long source audit of all the architectures has shown no problem with removing page_table_lock from them. free_pgtables doesn't need page_table_lock for itself, nor for what it calls; tlb->mm->nr_ptes is usually protected by page_table_lock, but partly by non-exclusive mmap_sem - here it's decremented with exclusive mmap_sem, or mm_users 0. update_hiwater_rss and vm_unacct_memory don't need page_table_lock either. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 12 ++++++++++-- mm/mmap.c | 23 ++++++----------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 24ba688876d6..4ea89a2e3a83 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -260,6 +260,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; + /* + * Hide vma from rmap and vmtruncate before freeing pgtables + */ + anon_vma_unlink(vma); + unlink_file_vma(vma); + if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); @@ -272,6 +278,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, HPAGE_SIZE)) { vma = next; next = vma->vm_next; + anon_vma_unlink(vma); + unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? 
next->vm_start: ceiling); @@ -798,12 +806,12 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, } lru_add_drain(); - spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + spin_lock(&mm->page_table_lock); end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); - tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); + tlb_finish_mmu(tlb, address, end); return end; } diff --git a/mm/mmap.c b/mm/mmap.c index d931d7e49ac9..fa35323a3c5b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -203,14 +203,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) { struct vm_area_struct *next = vma->vm_next; - /* - * Hide vma from rmap and vmtruncate before freeing page tables: - * to be moved into free_pgtables once page_table_lock is lifted - * from it, but until then lock ordering forbids that move. - */ - anon_vma_unlink(vma); - unlink_file_vma(vma); - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -1679,15 +1671,15 @@ static void unmap_region(struct mm_struct *mm, unsigned long nr_accounted = 0; lru_add_drain(); - spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + spin_lock(&mm->page_table_lock); unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); + spin_unlock(&mm->page_table_lock); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); - spin_unlock(&mm->page_table_lock); } /* @@ -1962,23 +1954,20 @@ void exit_mmap(struct mm_struct *mm) unsigned long end; lru_add_drain(); - - spin_lock(&mm->page_table_lock); - flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ + spin_lock(&mm->page_table_lock); end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); + spin_unlock(&mm->page_table_lock); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); - spin_unlock(&mm->page_table_lock); - /* - * Walk the list again, actually closing and freeing it - * without holding any MM locks. + * Walk the list again, actually closing and freeing it, + * with preemption enabled, without holding any MM locks. */ while (vma) vma = remove_vma(vma); From 508034a32b819a2d40aa7ac0dbc8cd2e044c2de6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:30 -0700 Subject: [PATCH 55/98] [PATCH] mm: unmap_vmas with inner ptlock Remove the page_table_lock from around the calls to unmap_vmas, and replace the pte_offset_map in zap_pte_range by pte_offset_map_lock: all callers are now safe to descend without page_table_lock. Don't attempt fancy locking for hugepages, just take page_table_lock in unmap_hugepage_range. Which makes zap_hugepage_range, and the hugetlb test in zap_page_range, redundant: unmap_vmas calls unmap_hugepage_range anyway. Nor does unmap_vmas have much use for its mm arg now. The tlb_start_vma and tlb_end_vma in unmap_page_range are now called without page_table_lock: if they're implemented at all, they typically come down to flush_cache_range (usually done outside page_table_lock) and flush_tlb_range (which we already audited for the mprotect case). 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 10 +++------- include/linux/hugetlb.h | 2 -- include/linux/mm.h | 2 +- mm/hugetlb.c | 12 +++--------- mm/memory.c | 41 ++++++++++++----------------------------- mm/mmap.c | 8 ++------ 6 files changed, 21 insertions(+), 54 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3a9b6d179cbd..a826a8add5e3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -92,7 +92,7 @@ out: } /* - * Called under down_write(mmap_sem), page_table_lock is not held + * Called under down_write(mmap_sem). */ #ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA @@ -308,7 +308,6 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) vma_prio_tree_foreach(vma, &iter, root, h_pgoff, ULONG_MAX) { unsigned long h_vm_pgoff; - unsigned long v_length; unsigned long v_offset; h_vm_pgoff = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); @@ -319,11 +318,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) if (h_vm_pgoff >= h_pgoff) v_offset = 0; - v_length = vma->vm_end - vma->vm_start; - - zap_hugepage_range(vma, - vma->vm_start + v_offset, - v_length - v_offset); + unmap_hugepage_range(vma, + vma->vm_start + v_offset, vma->vm_end); } } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d664330d900e..0cea162b08c0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,7 +16,6 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); -void zap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); void unmap_hugepage_range(struct 
vm_area_struct *, unsigned long, unsigned long); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); int hugetlb_report_meminfo(char *); @@ -87,7 +86,6 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) -#define zap_hugepage_range(vma, start, len) BUG() #define unmap_hugepage_range(vma, start, end) BUG() #define is_hugepage_mem_enough(size) 0 #define hugetlb_report_meminfo(buf) 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index d4c3512e7db4..972e2ce8e07c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -682,7 +682,7 @@ struct zap_details { unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *); -unsigned long unmap_vmas(struct mmu_gather **tlb, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ea0826ff2663..f29b7dc02c39 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -314,6 +314,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + spin_lock(&mm->page_table_lock); + /* Update high watermark before we lower rss */ update_hiwater_rss(mm); @@ -333,17 +335,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, put_page(page); add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE)); } - flush_tlb_range(vma, start, end); -} -void zap_hugepage_range(struct vm_area_struct *vma, - unsigned long start, unsigned long length) -{ - struct mm_struct *mm = vma->vm_mm; - - spin_lock(&mm->page_table_lock); - unmap_hugepage_range(vma, start, start + length); 
spin_unlock(&mm->page_table_lock); + flush_tlb_range(vma, start, end); } int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index 4ea89a2e3a83..622a4ef5409f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -551,10 +551,11 @@ static void zap_pte_range(struct mmu_gather *tlb, { struct mm_struct *mm = tlb->mm; pte_t *pte; + spinlock_t *ptl; int file_rss = 0; int anon_rss = 0; - pte = pte_offset_map(pmd, addr); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { pte_t ptent = *pte; if (pte_none(ptent)) @@ -621,7 +622,7 @@ static void zap_pte_range(struct mmu_gather *tlb, } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss(mm, file_rss, anon_rss); - pte_unmap(pte - 1); + pte_unmap_unlock(pte - 1, ptl); } static inline void zap_pmd_range(struct mmu_gather *tlb, @@ -690,7 +691,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather - * @mm: the controlling mm_struct * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -699,10 +699,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * * Returns the end address of the unmapping (restart addr if interrupted). * - * Unmap all pages in the vma list. Called under page_table_lock. + * Unmap all pages in the vma list. * - * We aim to not hold page_table_lock for too long (for scheduling latency - * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to + * We aim to not hold locks for too long (for scheduling latency reasons). + * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to * return the ending mmu_gather to the caller. * * Only addresses between `start' and `end' will be unmapped. 
@@ -714,7 +714,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) @@ -764,19 +764,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || - need_lockbreak(&mm->page_table_lock) || (i_mmap_lock && need_lockbreak(i_mmap_lock))) { if (i_mmap_lock) { - /* must reset count of rss freed */ - *tlbp = tlb_gather_mmu(mm, fullmm); + *tlbp = NULL; goto out; } - spin_unlock(&mm->page_table_lock); cond_resched(); - spin_lock(&mm->page_table_lock); } - *tlbp = tlb_gather_mmu(mm, fullmm); + *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_bytes = ZAP_BLOCK_SIZE; } @@ -800,18 +796,12 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long end = address + size; unsigned long nr_accounted = 0; - if (is_vm_hugetlb_page(vma)) { - zap_hugepage_range(vma, address, size); - return end; - } - lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - spin_lock(&mm->page_table_lock); - end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); - spin_unlock(&mm->page_table_lock); - tlb_finish_mmu(tlb, address, end); + end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + if (tlb) + tlb_finish_mmu(tlb, address, end); return end; } @@ -1434,13 +1424,6 @@ again: restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); - - /* - * We cannot rely on the break test in unmap_vmas: - * on the one hand, we don't want to restart our loop - * just because that broke out for the page_table_lock; - 
* on the other hand, it does no test when vma is small. - */ need_break = need_resched() || need_lockbreak(details->i_mmap_lock); diff --git a/mm/mmap.c b/mm/mmap.c index fa35323a3c5b..5ecc2cf3e1d7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1673,9 +1673,7 @@ static void unmap_region(struct mm_struct *mm, lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); - spin_lock(&mm->page_table_lock); - unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); - spin_unlock(&mm->page_table_lock); + unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); @@ -1958,9 +1956,7 @@ void exit_mmap(struct mm_struct *mm) tlb = tlb_gather_mmu(mm, 1); /* Don't update_hiwater_rss(mm) here, do_exit already did */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ - spin_lock(&mm->page_table_lock); - end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); - spin_unlock(&mm->page_table_lock); + end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); From 67b02f119df50ffad5a4e9e53ea4c896535862cd Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:31 -0700 Subject: [PATCH 56/98] [PATCH] mm: xip_unmap ZERO_PAGE fix Small fix to the PageReserved patch: the mips ZERO_PAGE(address) depends on address, so __xip_unmap is wrong to initialize page with that before address is initialized; and in fact must re-evaluate it each iteration. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap_xip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9354ee279b13..4e74ad60339a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -174,7 +174,7 @@ __xip_unmap (struct address_space * mapping, unsigned long address; pte_t *pte; pte_t pteval; - struct page *page = ZERO_PAGE(address); + struct page *page; spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { @@ -182,6 +182,7 @@ __xip_unmap (struct address_space * mapping, address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); + page = ZERO_PAGE(address); /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... From c0718806cf955d5eb51ea77bffb5b21d9bba4972 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:31 -0700 Subject: [PATCH 57/98] [PATCH] mm: rmap with inner ptlock rmap's page_check_address descend without page_table_lock. First just pte_offset_map in case there's no pte present worth locking for, then take page_table_lock for the full check, and pass ptl back to caller in the same style as pte_offset_map_lock. __xip_unmap, page_referenced_one and try_to_unmap_one use pte_unmap_unlock. try_to_unmap_cluster also. page_check_address reformatted to avoid progressive indentation. No use is made of its one error code, return NULL when it fails. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 4 +- mm/filemap_xip.c | 12 ++--- mm/rmap.c | 109 +++++++++++++++++++++---------------------- 3 files changed, 60 insertions(+), 65 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e80fb7ee6efd..35b30e6c8cf8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -95,8 +95,8 @@ int try_to_unmap(struct page *); /* * Called from mm/filemap_xip.c to unmap empty zero page */ -pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long); - +pte_t *page_check_address(struct page *, struct mm_struct *, + unsigned long, spinlock_t **); /* * Used by swapoff to help locate where page is expected in vma. diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 4e74ad60339a..9cf687e4a29a 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping, unsigned long address; pte_t *pte; pte_t pteval; + spinlock_t *ptl; struct page *page; spin_lock(&mapping->i_mmap_lock); @@ -183,20 +184,15 @@ __xip_unmap (struct address_space * mapping, ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); page = ZERO_PAGE(address); - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - pte = page_check_address(page, mm, address); - if (!IS_ERR(pte)) { + pte = page_check_address(page, mm, address, &ptl); + if (pte) { /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush(vma, address, pte); page_remove_rmap(page); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); page_cache_release(page); } } diff --git a/mm/rmap.c b/mm/rmap.c index 4c52c56c9905..a84bdfe582c0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -247,34 +247,41 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * On success returns with mapped pte and locked mm->page_table_lock. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address) + unsigned long address, spinlock_t **ptlp) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; + spinlock_t *ptl; - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, etc... - */ - spin_lock(&mm->page_table_lock); pgd = pgd_offset(mm, address); - if (likely(pgd_present(*pgd))) { - pud = pud_offset(pgd, address); - if (likely(pud_present(*pud))) { - pmd = pmd_offset(pud, address); - if (likely(pmd_present(*pmd))) { - pte = pte_offset_map(pmd, address); - if (likely(pte_present(*pte) && - page_to_pfn(page) == pte_pfn(*pte))) - return pte; - pte_unmap(pte); - } - } + if (!pgd_present(*pgd)) + return NULL; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return NULL; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return NULL; + + pte = pte_offset_map(pmd, address); + /* Make a quick check before getting the lock */ + if (!pte_present(*pte)) { + pte_unmap(pte); + return NULL; } - spin_unlock(&mm->page_table_lock); - return ERR_PTR(-ENOENT); + + ptl = &mm->page_table_lock; + spin_lock(ptl); + if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { + *ptlp = ptl; + return pte; + } + pte_unmap_unlock(pte, ptl); + return NULL; } /* @@ -287,28 +294,28 @@ static int page_referenced_one(struct page *page, struct mm_struct *mm = vma->vm_mm; unsigned long 
address; pte_t *pte; + spinlock_t *ptl; int referenced = 0; address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address); - if (!IS_ERR(pte)) { - if (ptep_clear_flush_young(vma, address, pte)) - referenced++; + pte = page_check_address(page, mm, address, &ptl); + if (!pte) + goto out; - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && !ignore_token && - has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; + if (ptep_clear_flush_young(vma, address, pte)) + referenced++; - (*mapcount)--; - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); - } + /* Pretend the page is referenced if the task has the + swap token and is in the middle of a page fault. */ + if (mm != current->mm && !ignore_token && has_swap_token(mm) && + rwsem_is_locked(&mm->mmap_sem)) + referenced++; + + (*mapcount)--; + pte_unmap_unlock(pte, ptl); out: return referenced; } @@ -507,14 +514,15 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) unsigned long address; pte_t *pte; pte_t pteval; + spinlock_t *ptl; int ret = SWAP_AGAIN; address = vma_address(page, vma); if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address); - if (IS_ERR(pte)) + pte = page_check_address(page, mm, address, &ptl); + if (!pte) goto out; /* @@ -564,8 +572,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) page_cache_release(page); out_unmap: - pte_unmap(pte); - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); out: return ret; } @@ -599,19 +606,14 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, *original_pte; + pte_t *pte; pte_t pteval; + spinlock_t *ptl; struct page *page; unsigned long address; unsigned long end; unsigned long pfn; - /* - * We need the page_table_lock to protect us from page faults, - * munmap, fork, 
etc... - */ - spin_lock(&mm->page_table_lock); - address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; if (address < vma->vm_start) @@ -621,22 +623,22 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out_unlock; + return; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - goto out_unlock; + return; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - goto out_unlock; + return; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - for (original_pte = pte = pte_offset_map(pmd, address); - address < end; pte++, address += PAGE_SIZE) { - + for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; @@ -669,10 +671,7 @@ static void try_to_unmap_cluster(unsigned long cursor, dec_mm_counter(mm, file_rss); (*mapcount)--; } - - pte_unmap(original_pte); -out_unlock: - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte - 1, ptl); } static int try_to_unmap_anon(struct page *page) From c34d1b4d165c67b966bca4aba026443d7ff161eb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:32 -0700 Subject: [PATCH 58/98] [PATCH] mm: kill check_user_page_readable check_user_page_readable is a problematic variant of follow_page. It's used only by oprofile's i386 and arm backtrace code, at interrupt time, to establish whether a userspace stackframe is currently readable. This is problematic, because we want to push the page_table_lock down inside follow_page, and later split it; whereas oprofile is doing a spin_trylock on it (in the i386 case, forgotten in the arm case), and needs that to pin perhaps two pages spanned by the stackframe (which might be covered by different locks when we split). 
I think oprofile is going about this in the wrong way: it doesn't need to know the area is readable (neither i386 nor arm uses read protection of user pages), it doesn't need to pin the memory, it should simply __copy_from_user_inatomic, and see if that succeeds or not. Sorry, but I've not got around to devising the sparse __user annotations for this. Then we can eliminate check_user_page_readable, and return to a single follow_page without the __follow_page variants. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/oprofile/backtrace.c | 46 +++++++--------------------------- arch/i386/oprofile/backtrace.c | 38 ++++++++++------------------ include/linux/mm.h | 1 - mm/memory.c | 29 +++------------------ 4 files changed, 26 insertions(+), 88 deletions(-) diff --git a/arch/arm/oprofile/backtrace.c b/arch/arm/oprofile/backtrace.c index df35c452a8bf..7c22c12618cc 100644 --- a/arch/arm/oprofile/backtrace.c +++ b/arch/arm/oprofile/backtrace.c @@ -49,42 +49,22 @@ static struct frame_tail* kernel_backtrace(struct frame_tail *tail) static struct frame_tail* user_backtrace(struct frame_tail *tail) { - struct frame_tail buftail; + struct frame_tail buftail[2]; - /* hardware pte might not be valid due to dirty/accessed bit emulation - * so we use copy_from_user and benefit from exception fixups */ - if (copy_from_user(&buftail, tail, sizeof(struct frame_tail))) + /* Also check accessibility of one struct frame_tail beyond */ + if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) + return NULL; + if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) return NULL; - oprofile_add_trace(buftail.lr); + oprofile_add_trace(buftail[0].lr); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (tail >= buftail.fp) + if (tail >= buftail[0].fp) return NULL; - return buftail.fp-1; -} - -/* Compare two addresses and see if they're on the same page */ -#define CMP_ADDR_EQUAL(x,y,offset) 
((((unsigned long) x) >> PAGE_SHIFT) \ - == ((((unsigned long) y) + offset) >> PAGE_SHIFT)) - -/* check that the page(s) containing the frame tail are present */ -static int pages_present(struct frame_tail *tail) -{ - struct mm_struct * mm = current->mm; - - if (!check_user_page_readable(mm, (unsigned long)tail)) - return 0; - - if (CMP_ADDR_EQUAL(tail, tail, 8)) - return 1; - - if (!check_user_page_readable(mm, ((unsigned long)tail) + 8)) - return 0; - - return 1; + return buftail[0].fp-1; } /* @@ -118,7 +98,6 @@ static int valid_kernel_stack(struct frame_tail *tail, struct pt_regs *regs) void arm_backtrace(struct pt_regs * const regs, unsigned int depth) { struct frame_tail *tail; - unsigned long last_address = 0; tail = ((struct frame_tail *) regs->ARM_fp) - 1; @@ -132,13 +111,6 @@ void arm_backtrace(struct pt_regs * const regs, unsigned int depth) return; } - while (depth-- && tail && !((unsigned long) tail & 3)) { - if ((!CMP_ADDR_EQUAL(last_address, tail, 0) - || !CMP_ADDR_EQUAL(last_address, tail, 8)) - && !pages_present(tail)) - return; - last_address = (unsigned long) tail; + while (depth-- && tail && !((unsigned long) tail & 3)) tail = user_backtrace(tail); - } } - diff --git a/arch/i386/oprofile/backtrace.c b/arch/i386/oprofile/backtrace.c index 65dfd2edb671..21654be3f73f 100644 --- a/arch/i386/oprofile/backtrace.c +++ b/arch/i386/oprofile/backtrace.c @@ -12,6 +12,7 @@ #include #include #include +#include struct frame_head { struct frame_head * ebp; @@ -21,26 +22,22 @@ struct frame_head { static struct frame_head * dump_backtrace(struct frame_head * head) { - oprofile_add_trace(head->ret); + struct frame_head bufhead[2]; + + /* Also check accessibility of one struct frame_head beyond */ + if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) + return NULL; + if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + return NULL; + + oprofile_add_trace(bufhead[0].ret); /* frame pointers should strictly progress back up the stack * (towards higher 
addresses) */ - if (head >= head->ebp) + if (head >= bufhead[0].ebp) return NULL; - return head->ebp; -} - -/* check that the page(s) containing the frame head are present */ -static int pages_present(struct frame_head * head) -{ - struct mm_struct * mm = current->mm; - - /* FIXME: only necessary once per page */ - if (!check_user_page_readable(mm, (unsigned long)head)) - return 0; - - return check_user_page_readable(mm, (unsigned long)(head + 1)); + return bufhead[0].ebp; } /* @@ -97,15 +94,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) return; } -#ifdef CONFIG_SMP - if (!spin_trylock(¤t->mm->page_table_lock)) - return; -#endif - - while (depth-- && head && pages_present(head)) + while (depth-- && head) head = dump_backtrace(head); - -#ifdef CONFIG_SMP - spin_unlock(¤t->mm->page_table_lock); -#endif } diff --git a/include/linux/mm.h b/include/linux/mm.h index 972e2ce8e07c..aa8de20e2e80 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -944,7 +944,6 @@ extern struct page * vmalloc_to_page(void *addr); extern unsigned long vmalloc_to_pfn(void *addr); extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write); -extern int check_user_page_readable(struct mm_struct *mm, unsigned long address); int remap_pfn_range(struct vm_area_struct *, unsigned long, unsigned long, unsigned long, pgprot_t); diff --git a/mm/memory.c b/mm/memory.c index 622a4ef5409f..51f7c0a220d4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -809,8 +809,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. 
*/ -static struct page *__follow_page(struct mm_struct *mm, unsigned long address, - int read, int write, int accessed) +struct page *follow_page(struct mm_struct *mm, unsigned long address, int write) { pgd_t *pgd; pud_t *pud; @@ -846,16 +845,12 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address, if (pte_present(pte)) { if (write && !pte_write(pte)) goto out; - if (read && !pte_read(pte)) - goto out; pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (accessed) { - if (write && !pte_dirty(pte) &&!PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - } + if (write && !pte_dirty(pte) &&!PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); return page; } } @@ -864,22 +859,6 @@ out: return NULL; } -inline struct page * -follow_page(struct mm_struct *mm, unsigned long address, int write) -{ - return __follow_page(mm, address, 0, write, 1); -} - -/* - * check_user_page_readable() can be called frm niterrupt context by oprofile, - * so we need to avoid taking any non-irq-safe locks - */ -int check_user_page_readable(struct mm_struct *mm, unsigned long address) -{ - return __follow_page(mm, address, 1, 0, 0) != NULL; -} -EXPORT_SYMBOL(check_user_page_readable); - static inline int untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, unsigned long address) From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:33 -0700 Subject: [PATCH 59/98] [PATCH] mm: follow_page with inner ptlock Final step in pushing down common core's page_table_lock. follow_page no longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself; and so no page_table_lock is taken in get_user_pages itself. But get_user_pages (and get_futex_key) do then need follow_page to pin the page for them: take Daniel's suggestion of bitflags to follow_page. 
Need one for WRITE, another for TOUCH (it was the accessed flag before: vanished along with check_user_page_readable, but surely get_numa_maps is wrong to mark every page it finds as accessed), another for GET. And another, ANON to dispose of untouched_anonymous_page: it seems silly for that to descend a second time, let follow_page observe if there was no page table and return ZERO_PAGE if so. Fix minor bug in that: check VM_LOCKED - make_pages_present ought to make readonly anonymous present. Give get_numa_maps a cond_resched while we're there. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 +- include/linux/mm.h | 18 +++--- kernel/futex.c | 6 +- mm/memory.c | 154 +++++++++++++++++++++------------------------ mm/nommu.c | 3 +- 5 files changed, 88 insertions(+), 96 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7e5e7ec2e36d..d2fa42006d8f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) for_each_node(i) md->node[i] =0; - spin_lock(&mm->page_table_lock); for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { page = follow_page(mm, vaddr, 0); if (page) { @@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) md->anon++; md->node[page_to_nid(page)]++; } + cond_resched(); } - spin_unlock(&mm->page_table_lock); return md; } diff --git a/include/linux/mm.h b/include/linux/mm.h index aa8de20e2e80..e8d1424153bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } -extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); +struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); +struct page *vmalloc_to_page(void *addr); +unsigned long 
vmalloc_to_pfn(void *addr); +int remap_pfn_range(struct vm_area_struct *, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t); -extern struct page * vmalloc_to_page(void *addr); -extern unsigned long vmalloc_to_pfn(void *addr); -extern struct page * follow_page(struct mm_struct *mm, unsigned long address, - int write); -int remap_pfn_range(struct vm_area_struct *, unsigned long, - unsigned long, unsigned long, pgprot_t); +struct page *follow_page(struct mm_struct *, unsigned long address, + unsigned int foll_flags); +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */ #ifdef CONFIG_PROC_FS void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..3b4d5ad44cc6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) /* * Do a quick atomic lookup first - this is the fastpath. */ - spin_lock(¤t->mm->page_table_lock); - page = follow_page(mm, uaddr, 0); + page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); if (likely(page != NULL)) { key->shared.pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - spin_unlock(¤t->mm->page_table_lock); + put_page(page); return 0; } - spin_unlock(¤t->mm->page_table_lock); /* * Do it the general way. diff --git a/mm/memory.c b/mm/memory.c index 51f7c0a220d4..8461e2dd91d7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. - * mm->page_table_lock must be held. 
*/ -struct page *follow_page(struct mm_struct *mm, unsigned long address, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int flags) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; + spinlock_t *ptl; unsigned long pfn; struct page *page; - page = follow_huge_addr(mm, address, write); - if (! IS_ERR(page)) - return page; + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto out; + goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - goto out; + goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); + goto no_page_table; - ptep = pte_offset_map(pmd, address); + if (pmd_huge(*pmd)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + goto out; + } + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); if (!ptep) goto out; pte = *ptep; - pte_unmap(ptep); - if (pte_present(pte)) { - if (write && !pte_write(pte)) - goto out; - pfn = pte_pfn(pte); - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (write && !pte_dirty(pte) &&!PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); - return page; - } + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto unlock; + + page = pfn_to_page(pfn); + if (flags & FOLL_GET) + get_page(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); } - +unlock: + pte_unmap_unlock(ptep, ptl); out: - return NULL; -} + return page; -static inline int -untouched_anonymous_page(struct mm_struct* 
mm, struct vm_area_struct *vma, - unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - - /* Check if the vma is for an anonymous mapping. */ - if (vma->vm_ops && vma->vm_ops->nopage) - return 0; - - /* Check if page directory entry exists. */ - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return 1; - - pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return 1; - - /* Check if page middle directory entry exists. */ - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return 1; - - /* There is a pte slot for 'address' in 'mm'. */ - return 0; +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate page tables. + */ + if (flags & FOLL_ANON) { + page = ZERO_PAGE(address); + if (flags & FOLL_GET) + get_page(page); + BUG_ON(flags & FOLL_WRITE); + } + return page; } int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, @@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int flags; + unsigned int vm_flags; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ - flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { - struct vm_area_struct * vma; + struct vm_area_struct *vma; + unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) - || !(flags & vma->vm_flags)) + || !(vm_flags & vma->vm_flags)) return i ? 
: -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, &start, &len, i); continue; } - spin_lock(&mm->page_table_lock); + + foll_flags = FOLL_TOUCH; + if (pages) + foll_flags |= FOLL_GET; + if (!write && !(vma->vm_flags & VM_LOCKED) && + (!vma->vm_ops || !vma->vm_ops->nopage)) + foll_flags |= FOLL_ANON; + do { - int write_access = write; struct page *page; - cond_resched_lock(&mm->page_table_lock); - while (!(page = follow_page(mm, start, write_access))) { + if (write) + foll_flags |= FOLL_WRITE; + + cond_resched(); + while (!(page = follow_page(mm, start, foll_flags))) { int ret; - - /* - * Shortcut for anonymous pages. We don't want - * to force the creation of pages tables for - * insanely big anonymously mapped areas that - * nobody touched so far. This is important - * for doing a core dump for these mappings. - */ - if (!write && untouched_anonymous_page(mm,vma,start)) { - page = ZERO_PAGE(start); - break; - } - spin_unlock(&mm->page_table_lock); - ret = __handle_mm_fault(mm, vma, start, write_access); - + ret = __handle_mm_fault(mm, vma, start, + foll_flags & FOLL_WRITE); /* * The VM_FAULT_WRITE bit tells us that do_wp_page has * broken COW when necessary, even if maybe_mkwrite @@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * subsequent page lookups as if they were reads. 
*/ if (ret & VM_FAULT_WRITE) - write_access = 0; + foll_flags &= ~FOLL_WRITE; switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: @@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } - spin_lock(&mm->page_table_lock); } if (pages) { pages[i] = page; flush_dcache_page(page); - page_cache_get(page); } if (vmas) vmas[i] = vma; @@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); - spin_unlock(&mm->page_table_lock); } while (len); return i; } diff --git a/mm/nommu.c b/mm/nommu.c index dfb124ffb9be..d1e076a487cb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write) +struct page *follow_page(struct mm_struct *mm, unsigned long address, + unsigned int foll_flags) { return NULL; } From 60ec5585496871345c1a8113d7b60ed9d9474866 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:34 -0700 Subject: [PATCH 60/98] [PATCH] mm: i386 sh sh64 ready for split ptlock Use pte_offset_map_lock, instead of pte_offset_map (or inappropriate pte_offset_kernel) and mm-wide page_table_lock, in sundry arch places. The i386 vm86 mark_screen_rdonly: yes, there was and is an assumption that the screen fits inside the one page table, as indeed it does. The sh __do_page_fault: which handles both kernel faults (without lock) and user mm faults (locked - though it set_pte without locking before). The sh64 flush_cache_range and helpers: which wrongly thought callers held page_table_lock before (only its tlb_start_vma did, and no longer does so); moved the flush loop down, and adjusted the large versus small range decision to consider a range which spans page tables as large. 
Signed-off-by: Hugh Dickins Acked-by: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/vm86.c | 17 +++++------ arch/sh/mm/fault.c | 40 +++++++++++++++----------- arch/sh64/mm/cache.c | 62 ++++++++++++++++++----------------------- 3 files changed, 57 insertions(+), 62 deletions(-) diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c index 16b485009622..fc1993564f98 100644 --- a/arch/i386/kernel/vm86.c +++ b/arch/i386/kernel/vm86.c @@ -134,17 +134,16 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) return ret; } -static void mark_screen_rdonly(struct task_struct * tsk) +static void mark_screen_rdonly(struct mm_struct *mm) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - pte_t *pte, *mapped; + pte_t *pte; + spinlock_t *ptl; int i; - preempt_disable(); - spin_lock(&tsk->mm->page_table_lock); - pgd = pgd_offset(tsk->mm, 0xA0000); + pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; pud = pud_offset(pgd, 0xA0000); @@ -153,16 +152,14 @@ static void mark_screen_rdonly(struct task_struct * tsk) pmd = pmd_offset(pud, 0xA0000); if (pmd_none_or_clear_bad(pmd)) goto out; - pte = mapped = pte_offset_map(pmd, 0xA0000); + pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); for (i = 0; i < 32; i++) { if (pte_present(*pte)) set_pte(pte, pte_wrprotect(*pte)); pte++; } - pte_unmap(mapped); + pte_unmap_unlock(pte, ptl); out: - spin_unlock(&tsk->mm->page_table_lock); - preempt_enable(); flush_tlb(); } @@ -306,7 +303,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) - mark_screen_rdonly(tsk); + mark_screen_rdonly(tsk->mm); __asm__ __volatile__( "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" "movl %0,%%esp\n\t" diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 7abba2161da6..775f86cd3fe8 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c @@ -194,10 
+194,13 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, unsigned long address) { unsigned long addrmax = P4SEG; - pgd_t *dir; + pgd_t *pgd; pmd_t *pmd; pte_t *pte; pte_t entry; + struct mm_struct *mm; + spinlock_t *ptl; + int ret = 1; #ifdef CONFIG_SH_KGDB if (kgdb_nofault && kgdb_bus_err_hook) @@ -208,28 +211,28 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, addrmax = P4SEG_STORE_QUE + 0x04000000; #endif - if (address >= P3SEG && address < addrmax) - dir = pgd_offset_k(address); - else if (address >= TASK_SIZE) + if (address >= P3SEG && address < addrmax) { + pgd = pgd_offset_k(address); + mm = NULL; + } else if (address >= TASK_SIZE) return 1; - else if (!current->mm) + else if (!(mm = current->mm)) return 1; else - dir = pgd_offset(current->mm, address); + pgd = pgd_offset(mm, address); - pmd = pmd_offset(dir, address); - if (pmd_none(*pmd)) + pmd = pmd_offset(pgd, address); + if (pmd_none_or_clear_bad(pmd)) return 1; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return 1; - } - pte = pte_offset_kernel(pmd, address); + if (mm) + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + else + pte = pte_offset_kernel(pmd, address); + entry = *pte; if (pte_none(entry) || pte_not_present(entry) || (writeaccess && !pte_write(entry))) - return 1; + goto unlock; if (writeaccess) entry = pte_mkdirty(entry); @@ -251,8 +254,11 @@ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, set_pte(pte, entry); update_mmu_cache(NULL, address, entry); - - return 0; + ret = 0; +unlock: + if (mm) + pte_unmap_unlock(pte, ptl); + return ret; } void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) diff --git a/arch/sh64/mm/cache.c b/arch/sh64/mm/cache.c index 3b87e25ea773..c0c1b21350d8 100644 --- a/arch/sh64/mm/cache.c +++ b/arch/sh64/mm/cache.c @@ -584,32 +584,36 @@ static void sh64_dcache_purge_phy_page(unsigned long paddr) } } -static void 
sh64_dcache_purge_user_page(struct mm_struct *mm, unsigned long eaddr) +static void sh64_dcache_purge_user_pages(struct mm_struct *mm, + unsigned long addr, unsigned long end) { pgd_t *pgd; pmd_t *pmd; pte_t *pte; pte_t entry; + spinlock_t *ptl; unsigned long paddr; - /* NOTE : all the callers of this have mm->page_table_lock held, so the - following page table traversal is safe even on SMP/pre-emptible. */ + if (!mm) + return; /* No way to find physical address of page */ - if (!mm) return; /* No way to find physical address of page */ - pgd = pgd_offset(mm, eaddr); - if (pgd_bad(*pgd)) return; + pgd = pgd_offset(mm, addr); + if (pgd_bad(*pgd)) + return; - pmd = pmd_offset(pgd, eaddr); - if (pmd_none(*pmd) || pmd_bad(*pmd)) return; - - pte = pte_offset_kernel(pmd, eaddr); - entry = *pte; - if (pte_none(entry) || !pte_present(entry)) return; - - paddr = pte_val(entry) & PAGE_MASK; - - sh64_dcache_purge_coloured_phy_page(paddr, eaddr); + pmd = pmd_offset(pgd, addr); + if (pmd_none(*pmd) || pmd_bad(*pmd)) + return; + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + do { + entry = *pte; + if (pte_none(entry) || !pte_present(entry)) + continue; + paddr = pte_val(entry) & PAGE_MASK; + sh64_dcache_purge_coloured_phy_page(paddr, addr); + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(pte - 1, ptl); } /****************************************************************************/ @@ -668,7 +672,7 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, int n_pages; n_pages = ((end - start) >> PAGE_SHIFT); - if (n_pages >= 64) { + if (n_pages >= 64 || ((start ^ (end - 1)) & PMD_MASK)) { #if 1 sh64_dcache_purge_all(); #else @@ -707,20 +711,10 @@ static void sh64_dcache_purge_user_range(struct mm_struct *mm, } #endif } else { - /* 'Small' range */ - unsigned long aligned_start; - unsigned long eaddr; - unsigned long last_page_start; - - aligned_start = start & PAGE_MASK; - /* 'end' is 1 byte beyond the end of the range */ - last_page_start = (end 
- 1) & PAGE_MASK; - - eaddr = aligned_start; - while (eaddr <= last_page_start) { - sh64_dcache_purge_user_page(mm, eaddr); - eaddr += PAGE_SIZE; - } + /* Small range, covered by a single page table page */ + start &= PAGE_MASK; /* should already be so */ + end = PAGE_ALIGN(end); /* should already be so */ + sh64_dcache_purge_user_pages(mm, start, end); } return; } @@ -880,9 +874,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, addresses from the user address space specified by mm, after writing back any dirty data. - Note(1), 'end' is 1 byte beyond the end of the range to flush. - - Note(2), this is called with mm->page_table_lock held.*/ + Note, 'end' is 1 byte beyond the end of the range to flush. */ sh64_dcache_purge_user_range(mm, start, end); sh64_icache_inv_user_page_range(mm, start, end); @@ -898,7 +890,7 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long eaddr, unsigned the I-cache must be searched too in case the page in question is both writable and being executed from (e.g. stack trampolines.) - Note(1), this is called with mm->page_table_lock held. + Note, this is called with pte lock held. */ sh64_dcache_purge_phy_page(pfn << PAGE_SHIFT); From 69b0475456ff7ef520e16f69d7a15c0d68b74e64 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:36 -0700 Subject: [PATCH 61/98] [PATCH] mm: arm ready for split ptlock Prepare arm for the split page_table_lock: three issues. Signal handling's preserve and restore of iwmmxt context currently involves reading and writing that context to and from user space, while holding page_table_lock to secure the user page(s) against kswapd. If we split the lock, then the structure might span two pages, secured by different locks. It is simpler to read into and write from a kernel stack buffer, copying that out and in without locking (the structure is 160 bytes in size, and here we're near the top of the kernel stack). Or would the overhead be noticeable? 
arm_syscall's cmpxchg emulation use pte_offset_map_lock, instead of pte_offset_map and mm-wide page_table_lock; and strictly, it should now also take mmap_sem before descending to pmd, to guard against another thread munmapping, and the page table pulled out beneath this thread. Updated two comments in fault-armv.c. adjust_pte is interesting, since its modification of a pte in one part of the mm depends on the lock held when calling update_mmu_cache for a pte in some other part of that mm. This can't be done with a split page_table_lock (and we've already taken the lowest lock in the hierarchy here): so we'll have to disable split on arm, unless CONFIG_CPU_CACHE_VIPT to ensures adjust_pte never used. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/signal.c | 96 ++++++++-------------------------------- arch/arm/kernel/traps.c | 14 +++--- arch/arm/mm/fault-armv.c | 7 ++- 3 files changed, 33 insertions(+), 84 deletions(-) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index a94d75fef598..a917e3dd3666 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -139,93 +139,33 @@ struct iwmmxt_sigframe { unsigned long storage[0x98/4]; }; -static int page_present(struct mm_struct *mm, void __user *uptr, int wr) -{ - unsigned long addr = (unsigned long)uptr; - pgd_t *pgd = pgd_offset(mm, addr); - if (pgd_present(*pgd)) { - pmd_t *pmd = pmd_offset(pgd, addr); - if (pmd_present(*pmd)) { - pte_t *pte = pte_offset_map(pmd, addr); - return (pte_present(*pte) && (!wr || pte_write(*pte))); - } - } - return 0; -} - -static int copy_locked(void __user *uptr, void *kptr, size_t size, int write, - void (*copyfn)(void *, void __user *)) -{ - unsigned char v, __user *userptr = uptr; - int err = 0; - - do { - struct mm_struct *mm; - - if (write) { - __put_user_error(0, userptr, err); - __put_user_error(0, userptr + size - 1, err); - } else { - __get_user_error(v, userptr, err); - 
__get_user_error(v, userptr + size - 1, err); - } - - if (err) - break; - - mm = current->mm; - spin_lock(&mm->page_table_lock); - if (page_present(mm, userptr, write) && - page_present(mm, userptr + size - 1, write)) { - copyfn(kptr, uptr); - } else - err = 1; - spin_unlock(&mm->page_table_lock); - } while (err); - - return err; -} - static int preserve_iwmmxt_context(struct iwmmxt_sigframe *frame) { - int err = 0; + char kbuf[sizeof(*frame) + 8]; + struct iwmmxt_sigframe *kframe; /* the iWMMXt context must be 64 bit aligned */ - WARN_ON((unsigned long)frame & 7); - - __put_user_error(IWMMXT_MAGIC0, &frame->magic0, err); - __put_user_error(IWMMXT_MAGIC1, &frame->magic1, err); - - /* - * iwmmxt_task_copy() doesn't check user permissions. - * Let's do a dummy write on the upper boundary to ensure - * access to user mem is OK all way up. - */ - err |= copy_locked(&frame->storage, current_thread_info(), - sizeof(frame->storage), 1, iwmmxt_task_copy); - return err; + kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); + kframe->magic0 = IWMMXT_MAGIC0; + kframe->magic1 = IWMMXT_MAGIC1; + iwmmxt_task_copy(current_thread_info(), &kframe->storage); + return __copy_to_user(frame, kframe, sizeof(*frame)); } static int restore_iwmmxt_context(struct iwmmxt_sigframe *frame) { - unsigned long magic0, magic1; - int err = 0; + char kbuf[sizeof(*frame) + 8]; + struct iwmmxt_sigframe *kframe; - /* the iWMMXt context is 64 bit aligned */ - WARN_ON((unsigned long)frame & 7); - - /* - * Validate iWMMXt context signature. - * Also, iwmmxt_task_restore() doesn't check user permissions. - * Let's do a dummy write on the upper boundary to ensure - * access to user mem is OK all way up. 
- */ - __get_user_error(magic0, &frame->magic0, err); - __get_user_error(magic1, &frame->magic1, err); - if (!err && magic0 == IWMMXT_MAGIC0 && magic1 == IWMMXT_MAGIC1) - err = copy_locked(&frame->storage, current_thread_info(), - sizeof(frame->storage), 0, iwmmxt_task_restore); - return err; + /* the iWMMXt context must be 64 bit aligned */ + kframe = (struct iwmmxt_sigframe *)((unsigned long)(kbuf + 8) & ~7); + if (__copy_from_user(kframe, frame, sizeof(*frame))) + return -1; + if (kframe->magic0 != IWMMXT_MAGIC0 || + kframe->magic1 != IWMMXT_MAGIC1) + return -1; + iwmmxt_task_restore(current_thread_info(), &kframe->storage); + return 0; } #endif diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index baa09601a64e..66e5a0516f23 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -483,29 +483,33 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs) unsigned long addr = regs->ARM_r2; struct mm_struct *mm = current->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte; + spinlock_t *ptl; regs->ARM_cpsr &= ~PSR_C_BIT; - spin_lock(&mm->page_table_lock); + down_read(&mm->mmap_sem); pgd = pgd_offset(mm, addr); if (!pgd_present(*pgd)) goto bad_access; pmd = pmd_offset(pgd, addr); if (!pmd_present(*pmd)) goto bad_access; - pte = pte_offset_map(pmd, addr); - if (!pte_present(*pte) || !pte_write(*pte)) + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_present(*pte) || !pte_write(*pte)) { + pte_unmap_unlock(pte, ptl); goto bad_access; + } val = *(unsigned long *)addr; val -= regs->ARM_r0; if (val == 0) { *(unsigned long *)addr = regs->ARM_r1; regs->ARM_cpsr |= PSR_C_BIT; } - spin_unlock(&mm->page_table_lock); + pte_unmap_unlock(pte, ptl); + up_read(&mm->mmap_sem); return val; bad_access: - spin_unlock(&mm->page_table_lock); + up_read(&mm->mmap_sem); /* simulate a write access fault */ do_DataAbort(addr, 15 + (1 << 11), regs); return -1; diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index be4ab3d73c91..7fc1b35a6746 100644 
--- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -26,6 +26,11 @@ static unsigned long shared_pte_mask = L_PTE_CACHEABLE; /* * We take the easy way out of this problem - we make the * PTE uncacheable. However, we leave the write buffer on. + * + * Note that the pte lock held when calling update_mmu_cache must also + * guard the pte (somewhere else in the same mm) that we modify here. + * Therefore those configurations which might call adjust_pte (those + * without CONFIG_CPU_CACHE_VIPT) cannot support split page_table_lock. */ static int adjust_pte(struct vm_area_struct *vma, unsigned long address) { @@ -127,7 +132,7 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page); * 2. If we have multiple shared mappings of the same space in * an object, we need to deal with the cache aliasing issues. * - * Note that the page_table_lock will be held. + * Note that the pte lock will be held. */ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { From 92dc6fcc845d99e87d8168e0786796525832d130 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:36 -0700 Subject: [PATCH 62/98] [PATCH] mm: parisc pte atomicity There's a worrying function translation_exists in parisc cacheflush.h, unaffected by split ptlock since flush_dcache_page is using it on some other mm, without any relevant lock. Oh well, make it slightly more robust by factoring the pfn check within it. And it looked liable to confuse a camouflaged swap or file entry with a good pte: fix that too. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/parisc/kernel/cache.c | 24 +++++++++------------- include/asm-parisc/cacheflush.h | 35 ++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e15f09eaed12..a065349aee37 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -270,7 +270,6 @@ void flush_dcache_page(struct page *page) unsigned long offset; unsigned long addr; pgoff_t pgoff; - pte_t *pte; unsigned long pfn = page_to_pfn(page); @@ -301,21 +300,16 @@ void flush_dcache_page(struct page *page) * taking a page fault if the pte doesn't exist. * This is just for speed. If the page translation * isn't there, there's no point exciting the - * nadtlb handler into a nullification frenzy */ - - - if(!(pte = translation_exists(mpnt, addr))) - continue; - - /* make sure we really have this page: the private + * nadtlb handler into a nullification frenzy. + * + * Make sure we really have this page: the private * mappings may cover this area but have COW'd this - * particular page */ - if(pte_pfn(*pte) != pfn) - continue; - - __flush_cache_page(mpnt, addr); - - break; + * particular page. + */ + if (translation_exists(mpnt, addr, pfn)) { + __flush_cache_page(mpnt, addr); + break; + } } flush_dcache_mmap_unlock(mapping); } diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h index aa592d8c0e39..1bc3c83ee74b 100644 --- a/include/asm-parisc/cacheflush.h +++ b/include/asm-parisc/cacheflush.h @@ -100,30 +100,34 @@ static inline void flush_cache_range(struct vm_area_struct *vma, /* Simple function to work out if we have an existing address translation * for a user space vma. 
*/ -static inline pte_t *__translation_exists(struct mm_struct *mm, - unsigned long addr) +static inline int translation_exists(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) { - pgd_t *pgd = pgd_offset(mm, addr); + pgd_t *pgd = pgd_offset(vma->vm_mm, addr); pmd_t *pmd; - pte_t *pte; + pte_t pte; if(pgd_none(*pgd)) - return NULL; + return 0; pmd = pmd_offset(pgd, addr); if(pmd_none(*pmd) || pmd_bad(*pmd)) - return NULL; + return 0; - pte = pte_offset_map(pmd, addr); + /* We cannot take the pte lock here: flush_cache_page is usually + * called with pte lock already held. Whereas flush_dcache_page + * takes flush_dcache_mmap_lock, which is lower in the hierarchy: + * the vma itself is secure, but the pte might come or go racily. + */ + pte = *pte_offset_map(pmd, addr); + /* But pte_unmap() does nothing on this architecture */ - /* The PA flush mappings show up as pte_none, but they're - * valid none the less */ - if(pte_none(*pte) && ((pte_val(*pte) & _PAGE_FLUSH) == 0)) - return NULL; - return pte; + /* Filter out coincidental file entries and swap entries */ + if (!(pte_val(pte) & (_PAGE_FLUSH|_PAGE_PRESENT))) + return 0; + + return pte_pfn(pte) == pfn; } -#define translation_exists(vma, addr) __translation_exists((vma)->vm_mm, addr) - /* Private function to flush a page from the cache of a non-current * process. 
cr25 contains the Page Directory of the current user @@ -175,9 +179,8 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long { BUG_ON(!vma->vm_mm->context); - if(likely(translation_exists(vma, vmaddr))) + if (likely(translation_exists(vma, vmaddr, pfn))) __flush_cache_page(vma, vmaddr); } #endif - From a7e4705b24e611574e5c23105005ffdff694fd58 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:37 -0700 Subject: [PATCH 63/98] [PATCH] mm: cris v32 mmu_context_lock The cris v32 switch_mm guards get_mmu_context with next->page_table_lock: good it's not really SMP yet, since get_mmu_context messes with global variables affecting other mms. Replace by global mmu_context_lock. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/cris/arch-v32/mm/tlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/cris/arch-v32/mm/tlb.c b/arch/cris/arch-v32/mm/tlb.c index 8233406798d3..b08a28bb58ab 100644 --- a/arch/cris/arch-v32/mm/tlb.c +++ b/arch/cris/arch-v32/mm/tlb.c @@ -175,6 +175,8 @@ init_new_context(struct task_struct *tsk, struct mm_struct *mm) return 0; } +static DEFINE_SPINLOCK(mmu_context_lock); + /* Called in schedule() just before actually doing the switch_to. */ void switch_mm(struct mm_struct *prev, struct mm_struct *next, @@ -183,10 +185,10 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, int cpu = smp_processor_id(); /* Make sure there is a MMU context. */ - spin_lock(&next->page_table_lock); + spin_lock(&mmu_context_lock); get_mmu_context(next); cpu_set(cpu, next->cpu_vm_mask); - spin_unlock(&next->page_table_lock); + spin_unlock(&mmu_context_lock); /* * Remember the pgd for the fault handlers. 
Keep a seperate copy of it From 8f5cd76c185a4c8aeb5fe1e560e3612bfc050c35 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:38 -0700 Subject: [PATCH 64/98] [PATCH] mm: uml pte atomicity There's usually a good reason when a pte is examined without the lock; but it makes me nervous when the pointer is dereferenced more than once. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/kernel/process_kern.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/um/kernel/process_kern.c b/arch/um/kernel/process_kern.c index 0d73ceeece72..34b54a3e2132 100644 --- a/arch/um/kernel/process_kern.c +++ b/arch/um/kernel/process_kern.c @@ -222,6 +222,7 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, pud_t *pud; pmd_t *pmd; pte_t *pte; + pte_t ptent; if(task->mm == NULL) return(ERR_PTR(-EINVAL)); @@ -238,12 +239,13 @@ void *um_virt_to_phys(struct task_struct *task, unsigned long addr, return(ERR_PTR(-EINVAL)); pte = pte_offset_kernel(pmd, addr); - if(!pte_present(*pte)) + ptent = *pte; + if(!pte_present(ptent)) return(ERR_PTR(-EINVAL)); if(pte_out != NULL) - *pte_out = *pte; - return((void *) (pte_val(*pte) & PAGE_MASK) + (addr & ~PAGE_MASK)); + *pte_out = ptent; + return((void *) (pte_val(ptent) & PAGE_MASK) + (addr & ~PAGE_MASK)); } char *current_cmd(void) From b38c6845b695141259019e2b7c0fe6c32a6e720d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:39 -0700 Subject: [PATCH 65/98] [PATCH] mm: uml kill unused In worrying over the various pte operations in different architectures, I came across some unused functions in UML: remove mprotect_kernel_vm, protect_vm_page and addr_pte. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/um/include/tlb.h | 1 - arch/um/kernel/tt/tlb.c | 36 ------------------------------------ 2 files changed, 37 deletions(-) diff --git a/arch/um/include/tlb.h b/arch/um/include/tlb.h index 45d7da6c3b2c..8efc1e0f1b84 100644 --- a/arch/um/include/tlb.h +++ b/arch/um/include/tlb.h @@ -34,7 +34,6 @@ struct host_vm_op { } u; }; -extern void mprotect_kernel_vm(int w); extern void force_flush_all(void); extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr, int force, diff --git a/arch/um/kernel/tt/tlb.c b/arch/um/kernel/tt/tlb.c index f1d85dbb45b9..ae6217c86135 100644 --- a/arch/um/kernel/tt/tlb.c +++ b/arch/um/kernel/tt/tlb.c @@ -74,42 +74,6 @@ void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end) atomic_inc(&vmchange_seq); } -static void protect_vm_page(unsigned long addr, int w, int must_succeed) -{ - int err; - - err = protect_memory(addr, PAGE_SIZE, 1, w, 1, must_succeed); - if(err == 0) return; - else if((err == -EFAULT) || (err == -ENOMEM)){ - flush_tlb_kernel_range(addr, addr + PAGE_SIZE); - protect_vm_page(addr, w, 1); - } - else panic("protect_vm_page : protect failed, errno = %d\n", err); -} - -void mprotect_kernel_vm(int w) -{ - struct mm_struct *mm; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - unsigned long addr; - - mm = &init_mm; - for(addr = start_vm; addr < end_vm;){ - pgd = pgd_offset(mm, addr); - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); - if(pmd_present(*pmd)){ - pte = pte_offset_kernel(pmd, addr); - if(pte_present(*pte)) protect_vm_page(addr, w, 0); - addr += PAGE_SIZE; - } - else addr += PMD_SIZE; - } -} - void flush_tlb_kernel_vm_tt(void) { flush_tlb_kernel_range(start_vm, end_vm); From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:40 -0700 Subject: [PATCH 66/98] [PATCH] mm: split page table lock 
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. 
Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/mm-armv.c | 1 + arch/frv/mm/pgalloc.c | 4 ++-- arch/i386/mm/pgtable.c | 8 +++---- arch/um/kernel/skas/mmu.c | 1 + fs/afs/file.c | 4 ++-- fs/buffer.c | 2 +- fs/jfs/jfs_metapage.c | 12 +++++----- fs/xfs/linux-2.6/xfs_buf.c | 7 +++--- include/linux/buffer_head.h | 6 ++--- include/linux/mm.h | 46 ++++++++++++++++++++++++++++++------- kernel/kexec.c | 4 ++-- mm/Kconfig | 13 +++++++++++ mm/filemap.c | 2 +- mm/memory.c | 24 +++++++++++-------- mm/mremap.c | 11 ++++++++- mm/page_alloc.c | 16 ++++++------- mm/page_io.c | 6 +++-- mm/rmap.c | 4 ++-- mm/shmem.c | 22 ++++++++---------- mm/swap.c | 2 +- mm/swap_state.c | 8 +++---- mm/swapfile.c | 12 +++++----- mm/vmscan.c | 2 +- 23 files changed, 138 insertions(+), 79 deletions(-) diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c index 60f3e039bac2..1221fdde1769 100644 --- a/arch/arm/mm/mm-armv.c +++ b/arch/arm/mm/mm-armv.c @@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd) pte = pmd_page(*pmd); pmd_clear(pmd); dec_page_state(nr_page_table_pages); + pte_lock_deinit(pte); pte_free(pte); pmd_free(pmd); free: diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 4eaec0f3525b..2c67dfe5a6b3 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd) if (pgd_list) pgd_list->private = (unsigned long) &page->index; pgd_list = page; - page->private = (unsigned long) &pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *) page->index; - pprev = (struct page **) page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) next->private = (unsigned long) pprev; diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index dcdce2c6c532..39c099f15b5f 100644 --- a/arch/i386/mm/pgtable.c +++ 
b/arch/i386/mm/pgtable.c @@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd) struct page *page = virt_to_page(pgd); page->index = (unsigned long)pgd_list; if (pgd_list) - pgd_list->private = (unsigned long)&page->index; + set_page_private(pgd_list, (unsigned long)&page->index); pgd_list = page; - page->private = (unsigned long)&pgd_list; + set_page_private(page, (unsigned long)&pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct page *next, **pprev, *page = virt_to_page(pgd); next = (struct page *)page->index; - pprev = (struct page **)page->private; + pprev = (struct page **)page_private(page); *pprev = next; if (next) - next->private = (unsigned long)pprev; + set_page_private(next, (unsigned long)pprev); } void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused) diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 02cf36e0331a..9e5e39cea821 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm) if(!proc_mm || !ptrace_faultinfo){ free_page(mmu->id.stack); + pte_lock_deinit(virt_to_page(mmu->last_page_table)); pte_free_kernel((pte_t *) mmu->last_page_table); dec_page_state(nr_page_table_pages); #ifdef CONFIG_3_LEVEL_PGTABLES diff --git a/fs/afs/file.c b/fs/afs/file.c index 0d576987ec67..4975c9c193dd 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags) cachefs_uncache_page(vnode->cache, page); #endif - pageio = (struct cachefs_page *) page->private; - page->private = 0; + pageio = (struct cachefs_page *) page_private(page); + set_page_private(page, 0); ClearPagePrivate(page); if (pageio) diff --git a/fs/buffer.c b/fs/buffer.c index b1667986442f..2066e4cb700c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -96,7 +96,7 @@ static void __clear_page_buffers(struct page *page) { ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); 
page_cache_release(page); } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 26091a5f88d4..8a53981f9f27 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -86,7 +86,7 @@ struct meta_anchor { atomic_t io_count; struct metapage *mp[MPS_PER_PAGE]; }; -#define mp_anchor(page) ((struct meta_anchor *)page->private) +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) static inline struct metapage *page_to_mp(struct page *page, uint offset) { @@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) if (!a) return -ENOMEM; memset(a, 0, sizeof(struct meta_anchor)); - page->private = (unsigned long)a; + set_page_private(page, (unsigned long)a); SetPagePrivate(page); kmap(page); } @@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) a->mp[index] = NULL; if (--a->mp_count == 0) { kfree(a); - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } @@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *)) #else static inline struct metapage *page_to_mp(struct page *page, uint offset) { - return PagePrivate(page) ? (struct metapage *)page->private : NULL; + return PagePrivate(page) ? 
(struct metapage *)page_private(page) : NULL; } static inline int insert_metapage(struct page *page, struct metapage *mp) { if (mp) { - page->private = (unsigned long)mp; + set_page_private(page, (unsigned long)mp); SetPagePrivate(page); kmap(page); } @@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp) static inline void remove_metapage(struct page *page, struct metapage *mp) { - page->private = 0; + set_page_private(page, 0); ClearPagePrivate(page); kunmap(page); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ba4767c04adf..4cd46abe8434 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -181,8 +181,9 @@ set_page_region( size_t offset, size_t length) { - page->private |= page_region_mask(offset, length); - if (page->private == ~0UL) + set_page_private(page, + page_private(page) | page_region_mask(offset, length)); + if (page_private(page) == ~0UL) SetPageUptodate(page); } @@ -194,7 +195,7 @@ test_page_region( { unsigned long mask = page_region_mask(offset, length); - return (mask && (page->private & mask) == mask); + return (mask && (page_private(page) & mask) == mask); } /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 88af42f5e04a..c937d6e65502 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp) /* If we *know* page->private refers to buffer_heads */ #define page_buffers(page) \ ({ \ - BUG_ON(!PagePrivate(page)); \ - ((struct buffer_head *)(page)->private); \ + BUG_ON(!PagePrivate(page)); \ + ((struct buffer_head *)page_private(page)); \ }) #define page_has_buffers(page) PagePrivate(page) @@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page, { page_cache_get(page); SetPagePrivate(page); - page->private = (unsigned long)head; + set_page_private(page, (unsigned long)head); } static inline void get_bh(struct buffer_head *bh) diff --git 
a/include/linux/mm.h b/include/linux/mm.h index e8d1424153bb..8a514eca40d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -226,13 +226,18 @@ struct page { * to show when page is mapped * & limit reverse map searches. */ - unsigned long private; /* Mapping-private opaque data: + union { + unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads * if PagePrivate set; used for * swp_entry_t if PageSwapCache * When page is free, this indicates * order in the buddy system. */ +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS + spinlock_t ptl; +#endif + } u; struct address_space *mapping; /* If low bit clear, points to * inode address_space, or NULL. * If page mapped as anonymous @@ -260,6 +265,9 @@ struct page { #endif /* WANT_PAGE_VIRTUAL */ }; +#define page_private(page) ((page)->u.private) +#define set_page_private(page, v) ((page)->u.private = (v)) + /* * FIXME: take this include out, include page-flags.h in * files which need it (119 of them) @@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *)); #ifdef CONFIG_HUGETLB_PAGE -static inline int page_count(struct page *p) +static inline int page_count(struct page *page) { - if (PageCompound(p)) - p = (struct page *)p->private; - return atomic_read(&(p)->_count) + 1; + if (PageCompound(page)) + page = (struct page *)page_private(page); + return atomic_read(&page->_count) + 1; } static inline void get_page(struct page *page) { if (unlikely(PageCompound(page))) - page = (struct page *)page->private; + page = (struct page *)page_private(page); atomic_inc(&page->_count); } @@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page) static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) - return page->private; + return page_private(page); return page->index; } @@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ +#if NR_CPUS 
>= CONFIG_SPLIT_PTLOCK_CPUS +/* + * We tuck a spinlock to guard each pagetable page into its struct page, + * at page->private, with BUILD_BUG_ON to make sure that this will not + * overflow into the next struct page (as it might with DEBUG_SPINLOCK). + * When freeing, reset page->mapping so free_pages_check won't complain. + */ +#define __pte_lockptr(page) &((page)->u.ptl) +#define pte_lock_init(_page) do { \ + spin_lock_init(__pte_lockptr(_page)); \ +} while (0) +#define pte_lock_deinit(page) ((page)->mapping = NULL) +#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +#else +/* + * We use mm->page_table_lock to guard all pagetable pages of the mm. + */ +#define pte_lock_init(page) do {} while (0) +#define pte_lock_deinit(page) do {} while (0) +#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + #define pte_offset_map_lock(mm, pmd, address, ptlp) \ ({ \ - spinlock_t *__ptl = &(mm)->page_table_lock; \ + spinlock_t *__ptl = pte_lockptr(mm, pmd); \ pte_t *__pte = pte_offset_map(pmd, address); \ *(ptlp) = __ptl; \ spin_lock(__ptl); \ diff --git a/kernel/kexec.c b/kernel/kexec.c index 36c5d9cd4cc1..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); @@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + order = page_private(page); count = 1 << order; for (i = 0; i < count; i++) ClearPageReserved(page + i); diff --git a/mm/Kconfig b/mm/Kconfig index 391ffc54d136..f35a550ba4b9 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -111,3 +111,16 @@ config SPARSEMEM_STATIC config SPARSEMEM_EXTREME def_bool y depends on 
SPARSEMEM && !SPARSEMEM_STATIC + +# Heavily threaded applications may benefit from splitting the mm-wide +# page_table_lock, so that faults on different parts of the user address +# space can be handled with less contention: split it at this NR_CPUS. +# Default to 4 for wider testing, though 8 might be more appropriate. +# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. +# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. +# +config SPLIT_PTLOCK_CPUS + int + default "4096" if ARM && !CPU_CACHE_VIPT + default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT + default "4" diff --git a/mm/filemap.c b/mm/filemap.c index 8aa344e88489..f560b41c8f61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -152,7 +152,7 @@ static int sync_page(void *word) * in the ->sync_page() methods make essential use of the * page_mapping(), merely passing the page down to the backing * device's unplug functions when it's non-NULL, which in turn - * ignore it for all cases but swap, where only page->private is + * ignore it for all cases but swap, where only page_private(page) is * of interest. When page_mapping() does go NULL, the entire * call stack gracefully ignores the page and returns. 
* -- wli diff --git a/mm/memory.c b/mm/memory.c index 8461e2dd91d7..e9ef599498b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { struct page *page = pmd_page(*pmd); pmd_clear(pmd); + pte_lock_deinit(page); pte_free_tlb(tlb, page); dec_page_state(nr_page_table_pages); tlb->mm->nr_ptes--; @@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) if (!new) return -ENOMEM; + pte_lock_init(new); spin_lock(&mm->page_table_lock); - if (pmd_present(*pmd)) /* Another has populated it */ + if (pmd_present(*pmd)) { /* Another has populated it */ + pte_lock_deinit(new); pte_free(new); - else { + } else { mm->nr_ptes++; inc_page_state(nr_page_table_pages); pmd_populate(mm, pmd, new); @@ -432,7 +435,7 @@ again: if (!dst_pte) return -ENOMEM; src_pte = pte_offset_map_nested(src_pmd, addr); - src_ptl = &src_mm->page_table_lock; + src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock(src_ptl); do { @@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range); * (but do_wp_page is only called after already making such a check; * and do_anonymous_page and do_no_page can safely check later on). 
*/ -static inline int pte_unmap_same(struct mm_struct *mm, +static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) if (sizeof(pte_t) > sizeof(unsigned long)) { - spin_lock(&mm->page_table_lock); + spinlock_t *ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); same = pte_same(*page_table, orig_pte); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } #endif pte_unmap(page_table); @@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte; int ret = VM_FAULT_MINOR; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); @@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_get(page); entry = mk_pte(page, vma->vm_page_prot); - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (!pte_none(*page_table)) goto release; @@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff; int err; - if (!pte_unmap_same(mm, page_table, orig_pte)) + if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) return VM_FAULT_MINOR; if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { @@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte, pmd, write_access, entry); } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; diff --git a/mm/mremap.c b/mm/mremap.c index 8de77b632a20..b535438c363c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct address_space *mapping = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; - spinlock_t *old_ptl; + spinlock_t *old_ptl, *new_ptl; if (vma->vm_file) { /* @@ -88,8 +88,15 @@ 
static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_vma->vm_truncate_count = 0; } + /* + * We don't have to worry about the ordering of src and dst + * pte locks because exclusive mmap_sem prevents deadlock. + */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_ptl = pte_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock(new_ptl); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { @@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, set_pte_at(mm, new_addr, new_pte, pte); } + if (new_ptl != old_ptl) + spin_unlock(new_ptl); pte_unmap_nested(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0541288ebf4b..a2995a5d012c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order) struct page *p = page + i; SetPageCompound(p); - p->private = (unsigned long)page; + set_page_private(p, (unsigned long)page); } } @@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (!PageCompound(p)) bad_page(__FUNCTION__, page); - if (p->private != (unsigned long)page) + if (page_private(p) != (unsigned long)page) bad_page(__FUNCTION__, page); ClearPageCompound(p); } @@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order) * So, we don't need atomic page->flags operations here. 
*/ static inline unsigned long page_order(struct page *page) { - return page->private; + return page_private(page); } static inline void set_page_order(struct page *page, int order) { - page->private = order; + set_page_private(page, order); __SetPagePrivate(page); } static inline void rmv_page_order(struct page *page) { __ClearPagePrivate(page); - page->private = 0; + set_page_private(page, 0); } /* @@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (a) the buddy is free && * (b) the buddy is on the buddy system && * (c) a page and its buddy have the same order. - * for recording page's order, we use page->private and PG_private. + * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) @@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order) * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous * free pages of length of (1 << order) and marked with PG_Private.Page's - * order is recorded in page->private field. + * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. 
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); - page->private = 0; + set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); } diff --git a/mm/page_io.c b/mm/page_io.c index 330e00d6db00..bb2b0d53889c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page_private(page), page, + end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page) BUG_ON(!PageLocked(page)); ClearPageUptodate(page); - bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page_private(page), page, + end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/rmap.c b/mm/rmap.c index a84bdfe582c0..a33e779d1bd8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } - ptl = &mm->page_table_lock; + ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) update_hiwater_rss(mm); if (PageAnon(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; /* * Store the swap location in the pte. * See handle_pte_fault() ... 
diff --git a/mm/shmem.c b/mm/shmem.c index 37777f4c11f8..dc25565a61e9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -71,9 +71,6 @@ /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 -/* Keep swapped page count in private field of indirect struct page */ -#define nr_swapped private - /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ enum sgp_type { SGP_QUICK, /* don't try more than file page cache lookup */ @@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns entry->val = value; info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) - kmap_atomic_to_page(entry)->nr_swapped += incdec; + if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { + struct page *page = kmap_atomic_to_page(entry); + set_page_private(page, page_private(page) + incdec); + } } /* @@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO); - if (page) { - page->nr_swapped = 0; - } + if (page) + set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode) diroff = 0; } subdir = dir[diroff]; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { size = limit - idx; if (size > ENTRIES_PER_PAGE) size = ENTRIES_PER_PAGE; @@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode) nr_swaps_freed += freed; if (offset) spin_lock(&info->lock); - subdir->nr_swapped -= freed; + set_page_private(subdir, page_private(subdir) - freed); if (offset) spin_unlock(&info->lock); - BUG_ON(subdir->nr_swapped > offset); + BUG_ON(page_private(subdir) > offset); } if (offset) offset = 0; @@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s dir = shmem_dir_map(subdir); } 
subdir = *dir; - if (subdir && subdir->nr_swapped) { + if (subdir && page_private(subdir)) { ptr = shmem_swp_map(subdir); size = limit - idx; if (size > ENTRIES_PER_PAGE) diff --git a/mm/swap.c b/mm/swap.c index 21d15f99805c..b89512877ec2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -39,7 +39,7 @@ int page_cluster; void put_page(struct page *page) { if (unlikely(PageCompound(page))) { - page = (struct page *)page->private; + page = (struct page *)page_private(page); if (put_page_testzero(page)) { void (*dtor)(struct page *page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 132164f7d0a7..cafc1edcbeba 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, page_cache_get(page); SetPageLocked(page); SetPageSwapCache(page); - page->private = entry.val; + set_page_private(page, entry.val); total_swapcache_pages++; pagecache_acct(1); } @@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page) BUG_ON(PageWriteback(page)); BUG_ON(PagePrivate(page)); - radix_tree_delete(&swapper_space.page_tree, page->private); - page->private = 0; + radix_tree_delete(&swapper_space.page_tree, page_private(page)); + set_page_private(page, 0); ClearPageSwapCache(page); total_swapcache_pages--; pagecache_acct(-1); @@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); write_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 510f0039b000..8970c0b74194 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) swp_entry_t entry; down_read(&swap_unplug_sem); - entry.val = page->private; + entry.val = page_private(page); if (PageSwapCache(page)) { struct block_device *bdev = swap_info[swp_type(entry)].bdev; struct backing_dev_info *bdi; @@ -69,8 +69,8 @@ 
void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) /* * If the page is removed from swapcache from under us (with a * racy try_to_unuse/swapoff) we need an additional reference - * count to avoid reading garbage from page->private above. If - * the WARN_ON triggers during a swapoff it maybe the race + * count to avoid reading garbage from page_private(page) above. + * If the WARN_ON triggers during a swapoff it maybe the race * condition and it's harmless. However if it triggers without * swapoff it signals a problem. */ @@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page) struct swap_info_struct *p; swp_entry_t entry; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (p) { /* Subtract the 1 for the swap cache itself */ @@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page) if (page_count(page) != 2) /* 2: us + cache */ return 0; - entry.val = page->private; + entry.val = page_private(page); p = swap_info_get(entry); if (!p) return 0; @@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page) BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */ if (PageSwapCache(page)) { - swp_entry_t entry = { .val = page->private }; + swp_entry_t entry = { .val = page_private(page) }; struct swap_info_struct *sis; sis = get_swap_info_struct(swp_type(entry)); diff --git a/mm/vmscan.c b/mm/vmscan.c index 41d1064aabfb..135bf8ca96ee 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) #ifdef CONFIG_SWAP if (PageSwapCache(page)) { - swp_entry_t swap = { .val = page->private }; + swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); write_unlock_irq(&mapping->tree_lock); swap_free(swap); From f412ac08c9861b4791af0145934c22f1458686da Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH 67/98] [PATCH] mm: fix rss 
and mmlist locking A couple of oddities were guarded by page_table_lock, no longer properly guarded when that is split. The mm_counters of file_rss and anon_rss: make those an atomic_t, or an atomic64_t if the architecture supports it, in such a case. Definitions by courtesy of Christoph Lameter: who spent considerable effort on more scalable ways of counting, but found insufficient benefit in practice. And adding an mm with swap to the mmlist for swapoff: the list is well-guarded by its own lock, but the list_empty check now has to be repeated inside it. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 42 ++++++++++++++++++++++++++++++++++++++---- mm/memory.c | 4 +++- mm/rmap.c | 3 ++- 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 292cb57ce38f..1c30bc308ef1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -249,13 +249,47 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, extern void arch_unmap_area(struct mm_struct *, unsigned long); extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); +#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +#ifdef ATOMIC64_INIT +#define set_mm_counter(mm, member, value) atomic64_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic64_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic64_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic64_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic64_dec(&(mm)->_##member) +typedef atomic64_t mm_counter_t; +#else /* !ATOMIC64_INIT */ +/* + * The counters wrap back to 0 at 2^32 * PAGE_SIZE, + * that is, at 16TB if using 4kB page size. 
+ */ +#define set_mm_counter(mm, member, value) atomic_set(&(mm)->_##member, value) +#define get_mm_counter(mm, member) ((unsigned long)atomic_read(&(mm)->_##member)) +#define add_mm_counter(mm, member, value) atomic_add(value, &(mm)->_##member) +#define inc_mm_counter(mm, member) atomic_inc(&(mm)->_##member) +#define dec_mm_counter(mm, member) atomic_dec(&(mm)->_##member) +typedef atomic_t mm_counter_t; +#endif /* !ATOMIC64_INIT */ + +#else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ #define set_mm_counter(mm, member, value) (mm)->_##member = (value) #define get_mm_counter(mm, member) ((mm)->_##member) #define add_mm_counter(mm, member, value) (mm)->_##member += (value) #define inc_mm_counter(mm, member) (mm)->_##member++ #define dec_mm_counter(mm, member) (mm)->_##member-- -#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss) +typedef unsigned long mm_counter_t; +#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ + +#define get_mm_rss(mm) \ + (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) #define update_hiwater_rss(mm) do { \ unsigned long _rss = get_mm_rss(mm); \ if ((mm)->hiwater_rss < _rss) \ @@ -266,8 +300,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); (mm)->hiwater_vm = (mm)->total_vm; \ } while (0) -typedef unsigned long mm_counter_t; - struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -291,7 +323,9 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters protected by the page_table_lock */ + /* Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. 
+ */ mm_counter_t _file_rss; mm_counter_t _anon_rss; diff --git a/mm/memory.c b/mm/memory.c index e9ef599498b5..d68421dd64ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -372,7 +372,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); - list_add(&dst_mm->mmlist, &src_mm->mmlist); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); spin_unlock(&mmlist_lock); } } diff --git a/mm/rmap.c b/mm/rmap.c index a33e779d1bd8..a7427bbf57e4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -559,7 +559,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) swap_duplicate(entry); if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); - list_add(&mm->mmlist, &init_mm.mmlist); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); From b8072f099b7829a6ff3eba618e1d079a81f753f8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 29 Oct 2005 18:16:41 -0700 Subject: [PATCH 68/98] [PATCH] mm: update comments to pte lock Updated several references to page_table_lock in common code comments. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 2 +- include/linux/mempolicy.h | 3 +-- mm/filemap.c | 6 +++--- mm/rmap.c | 10 +++++----- mm/swap_state.c | 3 +-- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index ff28c8b31f58..7dca30a26c53 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -8,7 +8,7 @@ * - update the page tables * - inform the TLB about the new one * - * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock. + * We hold the mm semaphore for reading, and the pte lock. 
* * Note: the old pte is known to not be writable, so we don't need to * worry about dirty bits etc getting lost. diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 38e60a099399..7af8cb836e78 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -47,8 +47,7 @@ struct vm_area_struct; * Locking policy for interlave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on - * mmap_sem. For allocating in the interleave policy the page_table_lock - * must be also aquired to protect il_next. + * mmap_sem. * * Freeing policy: * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. diff --git a/mm/filemap.c b/mm/filemap.c index f560b41c8f61..036599d1177e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * * ->mmap_sem * ->i_mmap_lock - * ->page_table_lock (various places, mainly in mmap.c) + * ->page_table_lock or pte_lock (various, mainly in memory.c) * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem @@ -86,9 +86,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->anon_vma.lock (vma_adjust) * * ->anon_vma.lock - * ->page_table_lock (anon_vma_prepare and various) + * ->page_table_lock or pte_lock (anon_vma_prepare and various) * - * ->page_table_lock + * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->tree_lock (try_to_unmap_one) diff --git a/mm/rmap.c b/mm/rmap.c index a7427bbf57e4..914d04b98bee 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,7 +32,7 @@ * page->flags PG_locked (lock_page) * mapping->i_mmap_lock * anon_vma->lock - * mm->page_table_lock + * mm->page_table_lock or pte_lock * zone->lru_lock (in mark_page_accessed) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and 
others) @@ -244,7 +244,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * - * On success returns with mapped pte and locked mm->page_table_lock. + * On success returns with pte mapped and locked. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp) @@ -445,7 +445,7 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) @@ -468,7 +468,7 @@ void page_add_anon_rmap(struct page *page, * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to * - * The caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. */ void page_add_file_rmap(struct page *page) { @@ -483,7 +483,7 @@ void page_add_file_rmap(struct page *page) * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from * - * Caller needs to hold the mm->page_table_lock. + * The caller needs to hold the pte lock. */ void page_remove_rmap(struct page *page) { diff --git a/mm/swap_state.c b/mm/swap_state.c index cafc1edcbeba..dfd9a46755b8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -259,8 +259,7 @@ static inline void free_swap_cache(struct page *page) /* * Perform a free_page(), also freeing any swap cache associated with - * this page if it is the last user of the page. Can not do a lock_page, - * as we are holding the page_table_lock spinlock. + * this page if it is the last user of the page. 
*/ void free_page_and_swap_cache(struct page *page) { From 96527980d4cb8f65fe49efdbc4ab92c0837d42f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Oct 2005 18:16:42 -0700 Subject: [PATCH 69/98] [PATCH] hugetlbfs: move free_inodes accounting Move hugetlbfs accounting into ->alloc_inode / ->destroy_inode. This keeps the code simpler, fixes a leak where a failing inode allocation wouldn't decrement the counter and moves hugetlbfs_delete_inode and hugetlbfs_forget_inode closer to their generic counterparts. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 80 ++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a826a8add5e3..8e9d43633365 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -224,8 +224,6 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb); - hlist_del_init(&inode->i_hash); list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); @@ -238,12 +236,6 @@ static void hugetlbfs_delete_inode(struct inode *inode) security_inode_delete(inode); - if (sbinfo->free_inodes >= 0) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); - } - clear_inode(inode); destroy_inode(inode); } @@ -251,7 +243,6 @@ static void hugetlbfs_delete_inode(struct inode *inode) static void hugetlbfs_forget_inode(struct inode *inode) { struct super_block *super_block = inode->i_sb; - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(super_block); if (hlist_unhashed(&inode->i_hash)) goto out_truncate; @@ -278,12 +269,6 @@ out_truncate: if (inode->i_data.nrpages) truncate_hugepages(&inode->i_data, 0); - if (sbinfo->free_inodes >= 0) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_inodes++; -
spin_unlock(&sbinfo->stat_lock); - } - clear_inode(inode); destroy_inode(inode); } @@ -375,17 +360,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, gid_t gid, int mode, dev_t dev) { struct inode *inode; - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); - - if (sbinfo->free_inodes >= 0) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return NULL; - } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); - } inode = new_inode(sb); if (inode) { @@ -527,29 +501,51 @@ static void hugetlbfs_put_super(struct super_block *sb) } } +static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + if (unlikely(!sbinfo->free_inodes)) { + spin_unlock(&sbinfo->stat_lock); + return 0; + } + sbinfo->free_inodes--; + spin_unlock(&sbinfo->stat_lock); + } + + return 1; +} + +static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo) +{ + if (sbinfo->free_inodes >= 0) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_inodes++; + spin_unlock(&sbinfo->stat_lock); + } +} + + static kmem_cache_t *hugetlbfs_inode_cachep; static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) { + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); struct hugetlbfs_inode_info *p; - p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); - if (!p) + if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo))) return NULL; + p = kmem_cache_alloc(hugetlbfs_inode_cachep, SLAB_KERNEL); + if (unlikely(!p)) { + hugetlbfs_inc_free_inodes(sbinfo); + return NULL; + } return &p->vfs_inode; } -static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) -{ - struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; - - if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) - inode_init_once(&ei->vfs_inode); -} - static void hugetlbfs_destroy_inode(struct inode *inode) { 
+ hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -561,6 +557,16 @@ static struct address_space_operations hugetlbfs_aops = { .set_page_dirty = hugetlbfs_set_page_dirty, }; + +static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) +{ + struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + struct file_operations hugetlbfs_file_operations = { .mmap = hugetlbfs_file_mmap, .fsync = simple_sync_file, From 149f4211afda85743e3a3db3fa3abbd81506cf2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Oct 2005 18:16:43 -0700 Subject: [PATCH 70/98] [PATCH] hugetlbfs: clean up hugetlbfs_delete_inode Make hugetlbfs_delete_inode look the same as generic_delete_inode, fixing a bunch of missing updates to it at the same time. Rename it to hugetlbfs_do_delete_inode and add a real hugetlbfs_delete_inode that implements ->delete_inode.
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8e9d43633365..2b9d1bee9220 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -224,19 +224,44 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) static void hugetlbfs_delete_inode(struct inode *inode) { - hlist_del_init(&inode->i_hash); + if (inode->i_data.nrpages) + truncate_hugepages(&inode->i_data, 0); + clear_inode(inode); +} + +static void hugetlbfs_do_delete_inode(struct inode *inode) +{ + struct super_operations *op = inode->i_sb->s_op; + list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); - if (inode->i_data.nrpages) - truncate_hugepages(&inode->i_data, 0); - security_inode_delete(inode); - clear_inode(inode); + if (op->delete_inode) { + void (*delete)(struct inode *) = op->delete_inode; + if (!is_bad_inode(inode)) + DQUOT_INIT(inode); + /* Filesystems implementing their own + * s_op->delete_inode are required to call + * truncate_inode_pages and clear_inode() + * internally + */ + delete(inode); + } else { + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); + } + + spin_lock(&inode_lock); + hlist_del_init(&inode->i_hash); + spin_unlock(&inode_lock); + wake_up_inode(inode); + if (inode->i_state != I_CLEAR) + BUG(); destroy_inode(inode); } @@ -276,7 +301,7 @@ out_truncate: static void hugetlbfs_drop_inode(struct inode *inode) { if (!inode->i_nlink) - hugetlbfs_delete_inode(inode); + hugetlbfs_do_delete_inode(inode); else hugetlbfs_forget_inode(inode); } @@ -594,6 +619,7 @@ static struct super_operations hugetlbfs_ops = { .alloc_inode = hugetlbfs_alloc_inode, .destroy_inode = hugetlbfs_destroy_inode, .statfs = hugetlbfs_statfs, + .delete_inode = 
hugetlbfs_delete_inode, .drop_inode = hugetlbfs_drop_inode, .put_super = hugetlbfs_put_super, }; From 6b09b9df05f319ec27e0dae1721efe097b8b23ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Oct 2005 18:16:44 -0700 Subject: [PATCH 71/98] [PATCH] kill hugelbfs_do_delete_inode hugetlbfs_do_delete_inode is the same as generic_delete_inode now, so remove it in favour of the latter. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2b9d1bee9220..ffdad4e64671 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -229,42 +229,6 @@ static void hugetlbfs_delete_inode(struct inode *inode) clear_inode(inode); } -static void hugetlbfs_do_delete_inode(struct inode *inode) -{ - struct super_operations *op = inode->i_sb->s_op; - - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); - inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); - - security_inode_delete(inode); - - if (op->delete_inode) { - void (*delete)(struct inode *) = op->delete_inode; - if (!is_bad_inode(inode)) - DQUOT_INIT(inode); - /* Filesystems implementing their own - * s_op->delete_inode are required to call - * truncate_inode_pages and clear_inode() - * internally - */ - delete(inode); - } else { - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); - } - - spin_lock(&inode_lock); - hlist_del_init(&inode->i_hash); - spin_unlock(&inode_lock); - wake_up_inode(inode); - if (inode->i_state != I_CLEAR) - BUG(); - destroy_inode(inode); -} - static void hugetlbfs_forget_inode(struct inode *inode) { struct super_block *super_block = inode->i_sb; @@ -301,7 +265,7 @@ out_truncate: static void hugetlbfs_drop_inode(struct inode *inode) { if (!inode->i_nlink) - hugetlbfs_do_delete_inode(inode); + 
generic_delete_inode(inode); else hugetlbfs_forget_inode(inode); } From 0b1533f67cc1a595457af6d05ab3510294e2ca9c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 29 Oct 2005 18:16:45 -0700 Subject: [PATCH 72/98] [PATCH] cleanup hugelbfs_forget_inode Reformat hugelbfs_forget_inode and add the missing but harmless write_inode_now call. It looks the same as generic_forget_inode now except for the call to truncate_hugepages instead of truncate_inode_pages. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ffdad4e64671..8f94feb24c0a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -231,25 +231,28 @@ static void hugetlbfs_delete_inode(struct inode *inode) static void hugetlbfs_forget_inode(struct inode *inode) { - struct super_block *super_block = inode->i_sb; + struct super_block *sb = inode->i_sb; - if (hlist_unhashed(&inode->i_hash)) - goto out_truncate; - - if (!(inode->i_state & (I_DIRTY|I_LOCK))) { - list_del(&inode->i_list); - list_add(&inode->i_list, &inode_unused); - } - inodes_stat.nr_unused++; - if (!super_block || (super_block->s_flags & MS_ACTIVE)) { + if (!hlist_unhashed(&inode->i_hash)) { + if (!(inode->i_state & (I_DIRTY|I_LOCK))) + list_move(&inode->i_list, &inode_unused); + inodes_stat.nr_unused++; + if (!sb || (sb->s_flags & MS_ACTIVE)) { + spin_unlock(&inode_lock); + return; + } + inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); - return; + /* + * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK + * in our backing_dev_info. + */ + write_inode_now(inode, 1); + spin_lock(&inode_lock); + inode->i_state &= ~I_WILL_FREE; + inodes_stat.nr_unused--; + hlist_del_init(&inode->i_hash); } - - /* write_inode_now() ? 
*/ - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); -out_truncate: list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); inode->i_state |= I_FREEING; @@ -257,7 +260,6 @@ out_truncate: spin_unlock(&inode_lock); if (inode->i_data.nrpages) truncate_hugepages(&inode->i_data, 0); - clear_inode(inode); destroy_inode(inode); } From 551110a94aa15890d1709b179c4be1e66ff6db53 Mon Sep 17 00:00:00 2001 From: Krishnakumar R Date: Sat, 29 Oct 2005 18:16:45 -0700 Subject: [PATCH 73/98] [PATCH] hugetlb: remove repeated code Clean up some repeated code related to HugeTLB. hugetlb_zero_setup would have already allocated the file->f_op. Signed-off-by: Krishnakumar. R Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index dca90489e3b0..b58c651d31ae 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -233,10 +233,11 @@ static int newseg (key_t key, int shmflg, size_t size) shp->id = shm_buildid(id,shp->shm_perm.seq); shp->shm_file = file; file->f_dentry->d_inode->i_ino = shp->id; - if (shmflg & SHM_HUGETLB) - set_file_hugepages(file); - else + + /* Hugetlb ops would have already been assigned. */ + if (!(shmflg & SHM_HUGETLB)) file->f_op = &shm_file_operations; + shm_tot += numpages; shm_unlock(shp); return shp->id; From 4c887265977213985091476be40ab11dfdcb4caf Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 29 Oct 2005 18:16:46 -0700 Subject: [PATCH 74/98] [PATCH] hugetlb: demand fault handler Below is a patch to implement demand faulting for huge pages. The main motivation for changing from prefaulting to demand faulting is so that huge page memory areas can be allocated according to NUMA policy. Thanks to consolidated hugetlb code, switching the behavior requires changing only one fault handler. The bulk of the patch just moves the logic from hugetlb_prefault() to hugetlb_pte_fault() and find_get_huge_page().
Signed-off-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 7 +- mm/hugetlb.c | 176 +++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 88 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8f94feb24c0a..2627efe767cf 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group; static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; - struct address_space *mapping = inode->i_mapping; loff_t len, vma_len; int ret; @@ -79,10 +78,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) goto out; - ret = hugetlb_prefault(mapping, vma); - if (ret) - goto out; - + ret = 0; + hugetlb_prefault_arch_hook(vma->vm_mm); if (inode->i_size < len) inode->i_size = len; out: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f29b7dc02c39..c9b43360fd33 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -321,10 +321,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); - if (! 
ptep) - /* This can happen on truncate, or if an - * mmap() is aborted due to an error before - * the prefault */ + if (!ptep) continue; pte = huge_ptep_get_and_clear(mm, address, ptep); @@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); } -int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) +static struct page *find_lock_huge_page(struct address_space *mapping, + unsigned long idx) { - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; + struct page *page; + int err; + struct inode *inode = mapping->host; + unsigned long size; - WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(vma->vm_start & ~HPAGE_MASK); - BUG_ON(vma->vm_end & ~HPAGE_MASK); +retry: + page = find_lock_page(mapping, idx); + if (page) + goto out; - hugetlb_prefault_arch_hook(mm); + /* Check to make sure the mapping hasn't been truncated */ + size = i_size_read(inode) >> HPAGE_SHIFT; + if (idx >= size) + goto out; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { - unsigned long idx; - pte_t *pte = huge_pte_alloc(mm, addr); - struct page *page; + if (hugetlb_get_quota(mapping)) + goto out; + page = alloc_huge_page(); + if (!page) { + hugetlb_put_quota(mapping); + goto out; + } - if (!pte) { - ret = -ENOMEM; - goto out; - } - - idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); - page = find_get_page(mapping, idx); - if (!page) { - /* charge the fs quota first */ - if (hugetlb_get_quota(mapping)) { - ret = -ENOMEM; - goto out; - } - page = alloc_huge_page(); - if (!page) { - hugetlb_put_quota(mapping); - ret = -ENOMEM; - goto out; - } - ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); - if (! 
ret) { - unlock_page(page); - } else { - hugetlb_put_quota(mapping); - free_huge_page(page); - goto out; - } - } - spin_lock(&mm->page_table_lock); - add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); - set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); - spin_unlock(&mm->page_table_lock); + err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); + if (err) { + put_page(page); + hugetlb_put_quota(mapping); + if (err == -EEXIST) + goto retry; + page = NULL; } out: - return ret; + return page; } -/* - * On ia64 at least, it is possible to receive a hugetlb fault from a - * stale zero entry left in the TLB from earlier hardware prefetching. - * Low-level arch code should already have flushed the stale entry as - * part of its fault handling, but we do need to accept this minor fault - * and return successfully. Whereas the "normal" case is that this is - * an access to a hugetlb page which has been truncated off since mmap. - */ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access) { int ret = VM_FAULT_SIGBUS; + unsigned long idx; + unsigned long size; pte_t *pte; + struct page *page; + struct address_space *mapping; + + pte = huge_pte_alloc(mm, address); + if (!pte) + goto out; + + mapping = vma->vm_file->f_mapping; + idx = ((address - vma->vm_start) >> HPAGE_SHIFT) + + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + + /* + * Use page lock to guard against racing truncation + * before we get page_table_lock. 
+ */ + page = find_lock_huge_page(mapping, idx); + if (!page) + goto out; spin_lock(&mm->page_table_lock); - pte = huge_pte_offset(mm, address); - if (pte && !pte_none(*pte)) - ret = VM_FAULT_MINOR; + size = i_size_read(mapping->host) >> HPAGE_SHIFT; + if (idx >= size) + goto backout; + + ret = VM_FAULT_MINOR; + if (!pte_none(*pte)) + goto backout; + + add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); + set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); spin_unlock(&mm->page_table_lock); + unlock_page(page); +out: return ret; + +backout: + spin_unlock(&mm->page_table_lock); + hugetlb_put_quota(mapping); + unlock_page(page); + put_page(page); + goto out; } int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vpfn, vaddr = *position; int remainder = *length; - BUG_ON(!is_vm_hugetlb_page(vma)); - vpfn = vaddr/PAGE_SIZE; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { + pte_t *pte; + struct page *page; + + /* + * Some archs (sparc64, sh*) have multiple pte_ts to + * each hugepage. We have to make * sure we get the + * first, for the page indexing below to work. + */ + pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + + if (!pte || pte_none(*pte)) { + int ret; + + spin_unlock(&mm->page_table_lock); + ret = hugetlb_fault(mm, vma, vaddr, 0); + spin_lock(&mm->page_table_lock); + if (ret == VM_FAULT_MINOR) + continue; + + remainder = 0; + if (!i) + i = -EFAULT; + break; + } if (pages) { - pte_t *pte; - struct page *page; - - /* Some archs (sparc64, sh*) have multiple - * pte_ts to each hugepage. We have to make - * sure we get the first, for the page - * indexing below to work. 
*/ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); - - /* the hugetlb file might have been truncated */ - if (!pte || pte_none(*pte)) { - remainder = 0; - if (!i) - i = -EFAULT; - break; - } - page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; - - WARN_ON(!PageCompound(page)); - get_page(page); pages[i] = page; } From 2e9b367c2273ed21c9852a04d90944d472c4f3e6 Mon Sep 17 00:00:00 2001 From: Adam Litke Date: Sat, 29 Oct 2005 18:16:47 -0700 Subject: [PATCH 75/98] [PATCH] hugetlb: overcommit accounting check Basic overcommit checking for hugetlb_file_map() based on an implementation used with demand faulting in SLES9. Since demand faulting can't guarantee the availability of pages at mmap time, this patch implements a basic sanity check to ensure that the number of huge pages required to satisfy the mmap are currently available. Despite the obvious race, I think it is a good start on doing proper accounting. I'd like to work towards an accounting system that mimics the semantics of normal pages (especially for the MAP_PRIVATE/COW case). That work is underway and builds on what this patch starts. Huge page shared memory segments are simpler and still maintain their commit on shmget semantics. Signed-off-by: Adam Litke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 63 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2627efe767cf..e026c807e6b3 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -45,9 +45,58 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = { int sysctl_hugetlb_shm_group; +static void huge_pagevec_release(struct pagevec *pvec) +{ + int i; + + for (i = 0; i < pagevec_count(pvec); ++i) + put_page(pvec->pages[i]); + + pagevec_reinit(pvec); +} + +/* + * huge_pages_needed tries to determine the number of new huge pages that + * will be required to fully populate this VMA. 
This will be equal to + * the size of the VMA in huge pages minus the number of huge pages + * (covered by this VMA) that are found in the page cache. + * + * Result is in bytes to be compatible with is_hugepage_mem_enough() + */ +unsigned long +huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) +{ + int i; + struct pagevec pvec; + unsigned long start = vma->vm_start; + unsigned long end = vma->vm_end; + unsigned long hugepages = (end - start) >> HPAGE_SHIFT; + pgoff_t next = vma->vm_pgoff; + pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); + + pagevec_init(&pvec, 0); + while (next < endpg) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + break; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + if (page->index > next) + next = page->index; + if (page->index >= endpg) + break; + next++; + hugepages--; + } + huge_pagevec_release(&pvec); + } + return hugepages << HPAGE_SHIFT; +} + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_dentry->d_inode; + struct address_space *mapping = inode->i_mapping; + unsigned long bytes; loff_t len, vma_len; int ret; @@ -66,6 +115,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_end - vma->vm_start < HPAGE_SIZE) return -EINVAL; + bytes = huge_pages_needed(mapping, vma); + if (!is_hugepage_mem_enough(bytes)) + return -ENOMEM; + vma_len = (loff_t)(vma->vm_end - vma->vm_start); down(&inode->i_sem); @@ -168,16 +221,6 @@ static int hugetlbfs_commit_write(struct file *file, return -EINVAL; } -static void huge_pagevec_release(struct pagevec *pvec) -{ - int i; - - for (i = 0; i < pagevec_count(pvec); ++i) - put_page(pvec->pages[i]); - - pagevec_reinit(pvec); -} - static void truncate_huge_page(struct page *page) { clear_page_dirty(page); From 1a44e149084d772a1bcf4cdbdde8a013a8a1cfde Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sat, 29 Oct 2005 
18:16:48 -0700 Subject: [PATCH 76/98] [PATCH] .text page fault SMP scalability optimization We had a problem on ppc64 where with more than 4 threads a large system wouldn't scale well while faulting in the .text (most of the time was spent in the kernel despite it was an userland compute intensive app). The reason is the useless overwrite of the same pte from all cpu. I fixed it this way (verified on an older kernel but the forward port is almost identical). This will benefit all archs not just ppc64. Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d68421dd64ef..0f60baf6f69b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1980,9 +1980,10 @@ static inline int handle_pte_fault(struct mm_struct *mm, pte_t *pte, pmd_t *pmd, int write_access) { pte_t entry; + pte_t old_entry; spinlock_t *ptl; - entry = *pte; + old_entry = entry = *pte; if (!pte_present(entry)) { if (pte_none(entry)) { if (!vma->vm_ops || !vma->vm_ops->nopage) @@ -2009,9 +2010,20 @@ static inline int handle_pte_fault(struct mm_struct *mm, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - ptep_set_access_flags(vma, address, pte, entry, write_access); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + if (!pte_same(old_entry, entry)) { + ptep_set_access_flags(vma, address, pte, entry, write_access); + update_mmu_cache(vma, address, entry); + lazy_mmu_prot_update(entry); + } else { + /* + * This is needed only for protection faults but the arch code + * is not yet telling us if this is a protection fault or not. + * This still avoids useless tlb flushes for .text page faults + * with threads. 
+ */ + if (write_access) + flush_tlb_page(vma, address); + } unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_MINOR; From 2774812f417db562f0d659d2c1b5755ba35d2770 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:49 -0700 Subject: [PATCH 77/98] [PATCH] memory hotplug prep: kill local_mapnr The following series implements memory hot-add for ppc64 and i386. There are x86_64 and ia64 implementations that will be submitted shortly as well, through the normal maintainers. This patch: local_mapnr is unused, except for in an alpha header. Keep the alpha one, kill the rest. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-i386/mmzone.h | 6 ------ include/asm-m32r/mmzone.h | 6 ------ include/asm-parisc/mmzone.h | 6 ------ include/asm-ppc64/mmzone.h | 3 --- 4 files changed, 21 deletions(-) diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index 348fe3a4879d..620a90641ea8 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -88,12 +88,6 @@ static inline int pfn_to_nid(unsigned long pfn) __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - /* XXX: FIXME -- wli */ #define kern_addr_valid(kaddr) (0) diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h index d58878ec899e..adc7970a77ec 100644 --- a/include/asm-m32r/mmzone.h +++ b/include/asm-m32r/mmzone.h @@ -21,12 +21,6 @@ extern struct pglist_data *node_data[]; __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ }) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = pfn; \ diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h index 595d3dce120a..ae039f4fd711 100644 --- 
a/include/asm-parisc/mmzone.h +++ b/include/asm-parisc/mmzone.h @@ -27,12 +27,6 @@ extern struct node_map_data node_data[]; }) #define node_localnr(pfn, nid) ((pfn) - node_start_pfn(nid)) -#define local_mapnr(kvaddr) \ -({ \ - unsigned long __pfn = __pa(kvaddr) >> PAGE_SHIFT; \ - (__pfn - node_start_pfn(pfn_to_nid(__pfn))); \ -}) - #define pfn_to_page(pfn) \ ({ \ unsigned long __pfn = (pfn); \ diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h index ed473f4b0152..80a708e7093a 100644 --- a/include/asm-ppc64/mmzone.h +++ b/include/asm-ppc64/mmzone.h @@ -67,9 +67,6 @@ static inline int pa_to_nid(unsigned long pa) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) -#define local_mapnr(kvaddr) \ - ( (__pa(kvaddr) >> PAGE_SHIFT) - node_start_pfn(kvaddr_to_nid(kvaddr)) - #ifdef CONFIG_DISCONTIGMEM /* From ed8ece2ec8d3c2031b1a1a0737568bb0d49454e0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:50 -0700 Subject: [PATCH 78/98] [PATCH] memory hotplug prep: break out zone initialization If a zone is empty at boot-time and then hot-added to later, it needs to run the same init code that would have been run on it at boot. This patch breaks out zone table and per-cpu-pages functions for use by the hotplug code. You can almost see all of the free_area_init_core() function on one page now. 
:) Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 98 +++++++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 40 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2995a5d012c..9a2fa8110afc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1875,6 +1875,60 @@ void __init setup_per_cpu_pageset() #endif +static __devinit +void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +{ + int i; + struct pglist_data *pgdat = zone->zone_pgdat; + + /* + * The per-page waitqueue mechanism uses hashed waitqueues + * per zone. + */ + zone->wait_table_size = wait_table_size(zone_size_pages); + zone->wait_table_bits = wait_table_bits(zone->wait_table_size); + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, zone->wait_table_size + * sizeof(wait_queue_head_t)); + + for(i = 0; i < zone->wait_table_size; ++i) + init_waitqueue_head(zone->wait_table + i); +} + +static __devinit void zone_pcp_init(struct zone *zone) +{ + int cpu; + unsigned long batch = zone_batchsize(zone); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { +#ifdef CONFIG_NUMA + /* Early boot. 
Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif + } + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); +} + +static __devinit void init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, unsigned long size) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + + zone_wait_table_init(zone, size); + pgdat->nr_zones = zone_idx(zone) + 1; + + zone->zone_mem_map = pfn_to_page(zone_start_pfn); + zone->zone_start_pfn = zone_start_pfn; + + memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); + + zone_init_free_lists(pgdat, zone, zone->spanned_pages); +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -1884,8 +1938,8 @@ void __init setup_per_cpu_pageset() static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long i, j; - int cpu, nid = pgdat->node_id; + unsigned long j; + int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; @@ -1895,7 +1949,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize; - unsigned long batch; realsize = size = zones_size[j]; if (zholes_size) @@ -1915,19 +1968,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - batch = zone_batchsize(zone); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. 
Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif - } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone_names[j], realsize, batch); + zone_pcp_init(zone); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; @@ -1938,32 +1979,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat, if (!size) continue; - /* - * The per-page waitqueue mechanism uses hashed waitqueues - * per zone. - */ - zone->wait_table_size = wait_table_size(size); - zone->wait_table_bits = - wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); - - for(i = 0; i < zone->wait_table_size; ++i) - init_waitqueue_head(zone->wait_table + i); - - pgdat->nr_zones = j+1; - - zone->zone_mem_map = pfn_to_page(zone_start_pfn); - zone->zone_start_pfn = zone_start_pfn; - - memmap_init(size, nid, j, zone_start_pfn); - zonetable_add(zone, nid, j, zone_start_pfn, size); - + init_currently_empty_zone(zone, zone_start_pfn, size); zone_start_pfn += size; - - zone_init_free_lists(pgdat, zone, zone->spanned_pages); } } From 4ca644d970bf2542623228a4624af356d20ca267 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:51 -0700 Subject: [PATCH 79/98] [PATCH] memory hotplug prep: __section_nr helper A little helper that we use in the hotplug code. 
Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/sparse.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7519eb4191e7..4674145bb63d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -509,6 +509,7 @@ static inline struct mem_section *__nr_to_section(unsigned long nr) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } +extern int __section_nr(struct mem_section* ms); /* * We use the lower bits of the mem_map pointer to store diff --git a/mm/sparse.c b/mm/sparse.c index 347249a4917a..0d3bd4bf3aaa 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -72,6 +72,31 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) } #endif +/* + * Although written for the SPARSEMEM_EXTREME case, this happens + * to also work for the flat array case becase + * NR_SECTION_ROOTS==NR_MEM_SECTIONS. + */ +int __section_nr(struct mem_section* ms) +{ + unsigned long root_nr; + struct mem_section* root; + + for (root_nr = 0; + root_nr < NR_MEM_SECTIONS; + root_nr += SECTIONS_PER_ROOT) { + root = __nr_to_section(root_nr); + + if (!root) + continue; + + if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT))) + break; + } + + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); +} + /* Record a memory area against a node. */ void memory_present(int nid, unsigned long start, unsigned long end) { From c6a57e19e464db118dc4ab9cfe9e9748c6d630a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:52 -0700 Subject: [PATCH 80/98] [PATCH] memory hotplug prep: fixup bad_range() When doing memory hotplug operations, the size of existing zones can obviously change. This means that zone->zone_{start_pfn,spanned_pages} can change. There are currently no locks that protect these structure members. However, they are rarely accessed at runtime. 
Outside of swsusp, the only place that I can find is bad_range(). So, split bad_range() up into two pieces: one that needs to be locked and another that doesn't. Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9a2fa8110afc..a51ef94eec33 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -78,21 +78,37 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; -/* - * Temporary debugging check for pages not lying within a given zone. - */ -static int bad_range(struct zone *zone, struct page *page) +static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) return 1; if (page_to_pfn(page) < zone->zone_start_pfn) return 1; + + return 0; +} + +static int page_is_consistent(struct zone *zone, struct page *page) +{ #ifdef CONFIG_HOLES_IN_ZONE if (!pfn_valid(page_to_pfn(page))) - return 1; + return 0; #endif if (zone != page_zone(page)) + return 0; + + return 1; +} +/* + * Temporary debugging check for pages not lying within a given zone. + */ +static int bad_range(struct zone *zone, struct page *page) +{ + if (page_outside_zone_boundaries(zone, page)) return 1; + if (!page_is_consistent(zone, page)) + return 1; + return 0; } From 208d54e5513c0c02d85af0990901354c74364d5c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:52 -0700 Subject: [PATCH 81/98] [PATCH] memory hotplug locking: node_size_lock pgdat->node_size_lock is basically only needed in one place in the normal code: show_mem(), which is the arch-specific sysrq-m printing function. Strictly speaking, the architectures not doing memory hotplug do not need this locking in show_mem(). However, they are all included for completeness.
This should also make any future consolidation of all of the implementations a little more straightforward. This lock is also held in the sparsemem code during a memory removal, as sections are invalidated. This is the place where pfn_valid() is made false for a memory area that's being removed. The lock is only required when doing pfn_valid() operations on memory which the user does not already have a reference on the page, such as in show_mem(). Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/numa.c | 3 +++ arch/i386/mm/pgtable.c | 3 +++ arch/ia64/mm/discontig.c | 7 ++++++- arch/m32r/mm/init.c | 9 ++++++++- arch/parisc/mm/init.c | 3 +++ arch/ppc64/mm/init.c | 6 ++++++ include/linux/memory_hotplug.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/mmzone.h | 12 ++++++++++++ mm/page_alloc.c | 1 + 9 files changed, 76 insertions(+), 2 deletions(-) create mode 100644 include/linux/memory_hotplug.h diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index c7481d59b6df..6d5251254f68 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -371,6 +371,8 @@ show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); i = node_spanned_pages(nid); while (i-- > 0) { struct page *page = nid_page_nr(nid, i); @@ -384,6 +386,7 @@ show_mem(void) else shared += page_count(page) - 1; } + pgdat_resize_unlock(NODE_DATA(nid), &flags); } printk("%ld pages of RAM\n",total); printk("%ld free pages\n",free); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 39c099f15b5f..9db3242103be 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -31,11 +31,13 @@ void show_mem(void) pg_data_t *pgdat; unsigned long i; struct page_state ps; + unsigned long flags; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); printk(KERN_INFO "Free swap: %6ldkB\n",
nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -48,6 +50,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk(KERN_INFO "%d pages of RAM\n", total); printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index a3788fb84809..a88cdb7232f8 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -555,9 +555,13 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { - unsigned long present = pgdat->node_present_pages; + unsigned long present; + unsigned long flags; int shared = 0, cached = 0, reserved = 0; + printk("Node ID: %d\n", pgdat->node_id); + pgdat_resize_lock(pgdat, &flags); + present = pgdat->node_present_pages; for(i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page; if (pfn_valid(pgdat->node_start_pfn + i)) @@ -571,6 +575,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page)-1; } + pgdat_resize_unlock(pgdat, &flags); total_present += present; total_reserved += reserved; total_cached += cached; diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index d9a40b1fe8ba..6facf15b04f3 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -48,6 +48,8 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; ++i) { page = pgdat_page_nr(pgdat, i); total++; @@ -60,6 +62,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%d pages of RAM\n", total); printk("%d pages of HIGHMEM\n",highmem); @@ -150,10 +153,14 @@ 
int __init reservedpages_count(void) int reservedpages, nid, i; reservedpages = 0; - for_each_online_node(nid) + for_each_online_node(nid) { + unsigned long flags; + pgdat_resize_lock(NODE_DATA(nid), &flags); for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++) if (PageReserved(nid_page_nr(nid, i))) reservedpages++; + pgdat_resize_unlock(NODE_DATA(nid), &flags); + } return reservedpages; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 2886ad70db48..29b998e430e6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -505,7 +505,9 @@ void show_mem(void) for (j = node_start_pfn(i); j < node_end_pfn(i); j++) { struct page *p; + unsigned long flags; + pgdat_resize_lock(NODE_DATA(i), &flags); p = nid_page_nr(i, j) - node_start_pfn(i); total++; @@ -517,6 +519,7 @@ void show_mem(void) free++; else shared += page_count(p) - 1; + pgdat_resize_unlock(NODE_DATA(i), &flags); } } #endif diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index a45584b3440c..975b26de34d6 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -104,6 +104,8 @@ void show_mem(void) show_free_areas(); printk("Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); total++; @@ -114,6 +116,7 @@ void show_mem(void) else if (page_count(page)) shared += page_count(page) - 1; } + pgdat_resize_unlock(pgdat, &flags); } printk("%ld pages of RAM\n", total); printk("%ld reserved pages\n", reserved); @@ -647,11 +650,14 @@ void __init mem_init(void) #endif for_each_pgdat(pgdat) { + unsigned long flags; + pgdat_resize_lock(pgdat, &flags); for (i = 0; i < pgdat->node_spanned_pages; i++) { page = pgdat_page_nr(pgdat, i); if (PageReserved(page)) reservedpages++; } + pgdat_resize_unlock(pgdat, &flags); } codesize = (unsigned long)&_etext - (unsigned long)&_stext; diff --git a/include/linux/memory_hotplug.h 
b/include/linux/memory_hotplug.h new file mode 100644 index 000000000000..e8103be9d528 --- /dev/null +++ b/include/linux/memory_hotplug.h @@ -0,0 +1,34 @@ +#ifndef __LINUX_MEMORY_HOTPLUG_H +#define __LINUX_MEMORY_HOTPLUG_H + +#include +#include + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * pgdat resizing functions + */ +static inline +void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqsave(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) +{ + spin_lock_irqrestore(&pgdat->node_size_lock, *flags); +} +static inline +void pgdat_resize_init(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->node_size_lock); +} +#else /* ! CONFIG_MEMORY_HOTPLUG */ +/* + * Stub functions for when hotplug is off + */ +static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} +static inline void pgdat_resize_init(struct pglist_data *pgdat) {} +#endif +#endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4674145bb63d..e050d68963a1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -273,6 +273,16 @@ typedef struct pglist_data { struct page *node_mem_map; #endif struct bootmem_data *bdata; +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Must be held any time you expect node_start_pfn, node_present_pages + * or node_spanned_pages stay constant. Holding this will also + * guarantee that any pfn_valid() stays that way. + * + * Nests above zone->lock and zone->size_seqlock. 
+ */ + spinlock_t node_size_lock; +#endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page @@ -293,6 +303,8 @@ typedef struct pglist_data { #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) +#include + extern struct pglist_data *pgdat_list; void __get_zone_counts(unsigned long *active, unsigned long *inactive, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a51ef94eec33..32fad6d23200 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1958,6 +1958,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; From bdc8cb984576ab5b550c8b24c6fa111a873503e3 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:53 -0700 Subject: [PATCH 82/98] [PATCH] memory hotplug locking: zone span seqlock See the "fixup bad_range()" patch for more information, but this actually creates the lock to protect things making assumptions about a zone's size staying constant at runtime.
Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 39 ++++++++++++++++++++++++++++++++-- include/linux/mmzone.h | 15 +++++++++++++ mm/page_alloc.c | 19 ++++++++++++----- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index e8103be9d528..4b08bc947578 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -16,13 +16,36 @@ void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags) static inline void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags) { - spin_lock_irqrestore(&pgdat->node_size_lock, *flags); + spin_unlock_irqrestore(&pgdat->node_size_lock, *flags); } static inline void pgdat_resize_init(struct pglist_data *pgdat) { spin_lock_init(&pgdat->node_size_lock); } +/* + * Zone resizing functions + */ +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return read_seqbegin(&zone->span_seqlock); +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return read_seqretry(&zone->span_seqlock, iv); +} +static inline void zone_span_writelock(struct zone *zone) +{ + write_seqlock(&zone->span_seqlock); +} +static inline void zone_span_writeunlock(struct zone *zone) +{ + write_sequnlock(&zone->span_seqlock); +} +static inline void zone_seqlock_init(struct zone *zone) +{ + seqlock_init(&zone->span_seqlock); +} #else /* ! 
CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -30,5 +53,17 @@ void pgdat_resize_init(struct pglist_data *pgdat) static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {} static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {} static inline void pgdat_resize_init(struct pglist_data *pgdat) {} -#endif + +static inline unsigned zone_span_seqbegin(struct zone *zone) +{ + return 0; +} +static inline int zone_span_seqretry(struct zone *zone, unsigned iv) +{ + return 0; +} +static inline void zone_span_writelock(struct zone *zone) {} +static inline void zone_span_writeunlock(struct zone *zone) {} +static inline void zone_seqlock_init(struct zone *zone) {} +#endif /* ! CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e050d68963a1..f5fa3082fd6a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -12,6 +12,7 @@ #include #include #include +#include #include /* Free memory management - zoned buddy allocator. */ @@ -137,6 +138,10 @@ struct zone { * free areas of different sizes */ spinlock_t lock; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif struct free_area free_area[MAX_ORDER]; @@ -220,6 +225,16 @@ struct zone { /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; + /* + * zone_start_pfn, spanned_pages and present_pages are all + * protected by span_seqlock. It is a seqlock because it has + * to be read outside of zone->lock, and it is done in the main + * allocator path. But, it is written quite infrequently. + * + * The lock is declared along with zone->lock because it is + * frequently read in proximity to zone->lock. It's good to + * give them a chance of being in the same cacheline. 
+ */ unsigned long spanned_pages; /* total size, including holes */ unsigned long present_pages; /* amount of memory (excluding holes) */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 32fad6d23200..817635f2ab62 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -80,12 +81,19 @@ unsigned long __initdata nr_all_pages; static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) - return 1; - if (page_to_pfn(page) < zone->zone_start_pfn) - return 1; + int ret = 0; + unsigned seq; + unsigned long pfn = page_to_pfn(page); - return 0; + do { + seq = zone_span_seqbegin(zone); + if (pfn >= zone->zone_start_pfn + zone->spanned_pages) + ret = 1; + else if (pfn < zone->zone_start_pfn) + ret = 1; + } while (zone_span_seqretry(zone, seq)); + + return ret; } static int page_is_consistent(struct zone *zone, struct page *page) @@ -1980,6 +1988,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); + zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->free_pages = 0; From 3947be1969a9ce455ec30f60ef51efb10e4323d1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:54 -0700 Subject: [PATCH 83/98] [PATCH] memory hotplug: sysfs and add/remove functions This adds generic memory add/remove and supporting functions for memory hotplug into a new file as well as a memory hotplug kernel config option. Individual architecture patches will follow. For now, disable memory hotplug when swsusp is enabled. There's a lot of churn there right now. We'll fix it up properly once it calms down. 
Signed-off-by: Matt Tolentino Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/Makefile | 1 + drivers/base/init.c | 2 + drivers/base/memory.c | 455 +++++++++++++++++++++++++++++++++ include/linux/memory.h | 94 +++++++ include/linux/memory_hotplug.h | 35 +++ include/linux/mm.h | 1 + mm/Kconfig | 8 + mm/Makefile | 2 +- mm/memory_hotplug.c | 178 +++++++++++++ mm/page_alloc.c | 4 +- 10 files changed, 777 insertions(+), 3 deletions(-) create mode 100644 drivers/base/memory.c create mode 100644 include/linux/memory.h create mode 100644 mm/memory_hotplug.c diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 66d9c4643fc1..f12898d53078 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -7,6 +7,7 @@ obj-y := core.o sys.o bus.o dd.o \ obj-y += power/ obj-$(CONFIG_FW_LOADER) += firmware_class.o obj-$(CONFIG_NUMA) += node.o +obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG diff --git a/drivers/base/init.c b/drivers/base/init.c index 84e604e25c4f..c648914b9cde 100644 --- a/drivers/base/init.c +++ b/drivers/base/init.c @@ -9,6 +9,7 @@ #include #include +#include #include "base.h" @@ -33,5 +34,6 @@ void __init driver_init(void) platform_bus_init(); system_bus_init(); cpu_dev_init(); + memory_dev_init(); attribute_container_init(); } diff --git a/drivers/base/memory.c b/drivers/base/memory.c new file mode 100644 index 000000000000..785cb6e6b91c --- /dev/null +++ b/drivers/base/memory.c @@ -0,0 +1,455 @@ +/* + * drivers/base/memory.c - basic Memory class support + * + * Written by Matt Tolentino + * Dave Hansen + * + * This file provides the necessary infrastructure to represent + * a SPARSEMEM-memory-model system's physical memory in /sysfs. + * All arch-independent code that assumes MEMORY_HOTPLUG requires + * SPARSEMEM should be contained here, or in mm/memory_hotplug.c. 
+ */ + +#include +#include +#include +#include /* capable() */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define MEMORY_CLASS_NAME "memory" + +static struct sysdev_class memory_sysdev_class = { + set_kset_name(MEMORY_CLASS_NAME), +}; +EXPORT_SYMBOL(memory_sysdev_class); + +static char *memory_hotplug_name(struct kset *kset, struct kobject *kobj) +{ + return MEMORY_CLASS_NAME; +} + +static int memory_hotplug(struct kset *kset, struct kobject *kobj, char **envp, + int num_envp, char *buffer, int buffer_size) +{ + int retval = 0; + + return retval; +} + +static struct kset_hotplug_ops memory_hotplug_ops = { + .name = memory_hotplug_name, + .hotplug = memory_hotplug, +}; + +static struct notifier_block *memory_chain; + +static int register_memory_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&memory_chain, nb); +} + +static void unregister_memory_notifier(struct notifier_block *nb) +{ + notifier_chain_unregister(&memory_chain, nb); +} + +/* + * register_memory - Setup a sysfs device for a memory block + */ +static int +register_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + int error; + + memory->sysdev.cls = &memory_sysdev_class; + memory->sysdev.id = __section_nr(section); + + error = sysdev_register(&memory->sysdev); + + if (root && !error) + error = sysfs_create_link(&root->sysdev.kobj, + &memory->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); + + return error; +} + +static void +unregister_memory(struct memory_block *memory, struct mem_section *section, + struct node *root) +{ + BUG_ON(memory->sysdev.cls != &memory_sysdev_class); + BUG_ON(memory->sysdev.id != __section_nr(section)); + + sysdev_unregister(&memory->sysdev); + if (root) + sysfs_remove_link(&root->sysdev.kobj, + kobject_name(&memory->sysdev.kobj)); +} + +/* + * use this as the physical section index that this memsection + * uses. 
+ */ + +static ssize_t show_mem_phys_index(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%08lx\n", mem->phys_index); +} + +/* + * online, offline, going offline, etc. + */ +static ssize_t show_mem_state(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + ssize_t len = 0; + + /* + * We can probably put these states in a nice little array + * so that they're not open-coded + */ + switch (mem->state) { + case MEM_ONLINE: + len = sprintf(buf, "online\n"); + break; + case MEM_OFFLINE: + len = sprintf(buf, "offline\n"); + break; + case MEM_GOING_OFFLINE: + len = sprintf(buf, "going-offline\n"); + break; + default: + len = sprintf(buf, "ERROR-UNKNOWN-%ld\n", + mem->state); + WARN_ON(1); + break; + } + + return len; +} + +static inline int memory_notify(unsigned long val, void *v) +{ + return notifier_call_chain(&memory_chain, val, v); +} + +/* + * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is + * OK to have direct references to sparsemem variables in here. + */ +static int +memory_block_action(struct memory_block *mem, unsigned long action) +{ + int i; + unsigned long psection; + unsigned long start_pfn, start_paddr; + struct page *first_page; + int ret; + int old_state = mem->state; + + psection = mem->phys_index; + first_page = pfn_to_page(psection << PFN_SECTION_SHIFT); + + /* + * The probe routines leave the pages reserved, just + * as the bootmem code does. Make sure they're still + * that way. + */ + if (action == MEM_ONLINE) { + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageReserved(first_page+i)) + continue; + + printk(KERN_WARNING "section number %ld page number %d " + "not reserved, was it already online? 
\n", + psection, i); + return -EBUSY; + } + } + + switch (action) { + case MEM_ONLINE: + start_pfn = page_to_pfn(first_page); + ret = online_pages(start_pfn, PAGES_PER_SECTION); + break; + case MEM_OFFLINE: + mem->state = MEM_GOING_OFFLINE; + memory_notify(MEM_GOING_OFFLINE, NULL); + start_paddr = page_to_pfn(first_page) << PAGE_SHIFT; + ret = remove_memory(start_paddr, + PAGES_PER_SECTION << PAGE_SHIFT); + if (ret) { + mem->state = old_state; + break; + } + memory_notify(MEM_MAPPING_INVALID, NULL); + break; + default: + printk(KERN_WARNING "%s(%p, %ld) unknown action: %ld\n", + __FUNCTION__, mem, action, action); + WARN_ON(1); + ret = -EINVAL; + } + /* + * For now, only notify on successful memory operations + */ + if (!ret) + memory_notify(action, NULL); + + return ret; +} + +static int memory_block_change_state(struct memory_block *mem, + unsigned long to_state, unsigned long from_state_req) +{ + int ret = 0; + down(&mem->state_sem); + + if (mem->state != from_state_req) { + ret = -EINVAL; + goto out; + } + + ret = memory_block_action(mem, to_state); + if (!ret) + mem->state = to_state; + +out: + up(&mem->state_sem); + return ret; +} + +static ssize_t +store_mem_state(struct sys_device *dev, const char *buf, size_t count) +{ + struct memory_block *mem; + unsigned int phys_section_nr; + int ret = -EINVAL; + + mem = container_of(dev, struct memory_block, sysdev); + phys_section_nr = mem->phys_index; + + if (!valid_section_nr(phys_section_nr)) + goto out; + + if (!strncmp(buf, "online", min((int)count, 6))) + ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); + else if(!strncmp(buf, "offline", min((int)count, 7))) + ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE); +out: + if (ret) + return ret; + return count; +} + +/* + * phys_device is a bad name for this. What I really want + * is a way to differentiate between memory ranges that + * are part of physical devices that constitute + * a complete removable unit or fru. + * i.e. 
do these ranges belong to the same physical device, + * s.t. if I offline all of these sections I can then + * remove the physical device? + */ +static ssize_t show_phys_device(struct sys_device *dev, char *buf) +{ + struct memory_block *mem = + container_of(dev, struct memory_block, sysdev); + return sprintf(buf, "%d\n", mem->phys_device); +} + +static SYSDEV_ATTR(phys_index, 0444, show_mem_phys_index, NULL); +static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state); +static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL); + +#define mem_create_simple_file(mem, attr_name) \ + sysdev_create_file(&mem->sysdev, &attr_##attr_name) +#define mem_remove_simple_file(mem, attr_name) \ + sysdev_remove_file(&mem->sysdev, &attr_##attr_name) + +/* + * Block size attribute stuff + */ +static ssize_t +print_block_size(struct class *class, char *buf) +{ + return sprintf(buf, "%lx\n", (unsigned long)PAGES_PER_SECTION * PAGE_SIZE); +} + +static CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL); + +static int block_size_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_block_size_bytes.attr); + return 0; +} + +/* + * Some architectures will have custom drivers to do this, and + * will not need to do it from userspace. The fake hot-add code + * as well as ppc64 will do all of their discovery in userspace + * and will require this interface. + */ +#ifdef CONFIG_ARCH_MEMORY_PROBE +static ssize_t +memory_probe_store(struct class *class, const char __user *buf, size_t count) +{ + u64 phys_addr; + int ret; + + phys_addr = simple_strtoull(buf, NULL, 0); + + ret = add_memory(phys_addr, PAGES_PER_SECTION << PAGE_SHIFT); + + if (ret) + count = ret; + + return count; +} +static CLASS_ATTR(probe, 0700, NULL, memory_probe_store); + +static int memory_probe_init(void) +{ + sysfs_create_file(&memory_sysdev_class.kset.kobj, + &class_attr_probe.attr); + return 0; +} +#else +#define memory_probe_init(...) 
do {} while (0) +#endif + +/* + * Note that phys_device is optional. It is here to allow for + * differentiation between which *physical* devices each + * section belongs to... + */ + +static int add_memory_block(unsigned long node_id, struct mem_section *section, + unsigned long state, int phys_device) +{ + size_t size = sizeof(struct memory_block); + struct memory_block *mem = kmalloc(size, GFP_KERNEL); + int ret = 0; + + if (!mem) + return -ENOMEM; + + memset(mem, 0, size); + + mem->phys_index = __section_nr(section); + mem->state = state; + init_MUTEX(&mem->state_sem); + mem->phys_device = phys_device; + + ret = register_memory(mem, section, NULL); + if (!ret) + ret = mem_create_simple_file(mem, phys_index); + if (!ret) + ret = mem_create_simple_file(mem, state); + if (!ret) + ret = mem_create_simple_file(mem, phys_device); + + return ret; +} + +/* + * For now, we have a linear search to go find the appropriate + * memory_block corresponding to a particular phys_index. If + * this gets to be a real problem, we can always use a radix + * tree or something here. + * + * This could be made generic for all sysdev classes. 
+ */ +static struct memory_block *find_memory_block(struct mem_section *section) +{ + struct kobject *kobj; + struct sys_device *sysdev; + struct memory_block *mem; + char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1]; + + /* + * This only works because we know that section == sysdev->id + * slightly redundant with sysdev_register() + */ + sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, __section_nr(section)); + + kobj = kset_find_obj(&memory_sysdev_class.kset, name); + if (!kobj) + return NULL; + + sysdev = container_of(kobj, struct sys_device, kobj); + mem = container_of(sysdev, struct memory_block, sysdev); + + return mem; +} + +int remove_memory_block(unsigned long node_id, struct mem_section *section, + int phys_device) +{ + struct memory_block *mem; + + mem = find_memory_block(section); + mem_remove_simple_file(mem, phys_index); + mem_remove_simple_file(mem, state); + mem_remove_simple_file(mem, phys_device); + unregister_memory(mem, section, NULL); + + return 0; +} + +/* + * need an interface for the VM to add new memory regions, + * but without onlining it. + */ +int register_new_memory(struct mem_section *section) +{ + return add_memory_block(0, section, MEM_OFFLINE, 0); +} + +int unregister_memory_section(struct mem_section *section) +{ + if (!valid_section(section)) + return -EINVAL; + + return remove_memory_block(0, section, 0); +} + +/* + * Initialize the sysfs support for memory devices... 
+ */ +int __init memory_dev_init(void) +{ + unsigned int i; + int ret; + + memory_sysdev_class.kset.hotplug_ops = &memory_hotplug_ops; + ret = sysdev_class_register(&memory_sysdev_class); + + /* + * Create entries for memory sections that were found + * during boot and have been initialized + */ + for (i = 0; i < NR_MEM_SECTIONS; i++) { + if (!valid_section_nr(i)) + continue; + add_memory_block(0, __nr_to_section(i), MEM_ONLINE, 0); + } + + memory_probe_init(); + block_size_init(); + + return ret; +} diff --git a/include/linux/memory.h b/include/linux/memory.h new file mode 100644 index 000000000000..0def328ab5cf --- /dev/null +++ b/include/linux/memory.h @@ -0,0 +1,94 @@ +/* + * include/linux/memory.h - generic memory definition + * + * This is mainly for topological representation. We define the + * basic "struct memory_block" here, which can be embedded in per-arch + * definitions or NUMA information. + * + * Basic handling of the devices is done in drivers/base/memory.c + * and system devices are handled in drivers/base/sys.c. + * + * Memory block are exported via sysfs in the class/memory/devices/ + * directory. + * + */ +#ifndef _LINUX_MEMORY_H_ +#define _LINUX_MEMORY_H_ + +#include +#include +#include + +#include + +struct memory_block { + unsigned long phys_index; + unsigned long state; + /* + * This serializes all state change requests. It isn't + * held during creation because the control files are + * created long after the critical areas during + * initialization. + */ + struct semaphore state_sem; + int phys_device; /* to which fru does this belong? 
*/ + void *hw; /* optional pointer to fw/hw data */ + int (*phys_callback)(struct memory_block *); + struct sys_device sysdev; +}; + +/* These states are exposed to userspace as text strings in sysfs */ +#define MEM_ONLINE (1<<0) /* exposed to userspace */ +#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ +#define MEM_OFFLINE (1<<2) /* exposed to userspace */ + +/* + * All of these states are currently kernel-internal for notifying + * kernel components and architectures. + * + * For MEM_MAPPING_INVALID, all notifier chains with priority >0 + * are called before pfn_to_page() becomes invalid. The priority=0 + * entry is reserved for the function that actually makes + * pfn_to_page() stop working. Any notifiers that want to be called + * after that should have priority <0. + */ +#define MEM_MAPPING_INVALID (1<<3) + +#ifndef CONFIG_MEMORY_HOTPLUG +static inline int memory_dev_init(void) +{ + return 0; +} +static inline int register_memory_notifier(struct notifier_block *nb) +{ + return 0; +} +static inline void unregister_memory_notifier(struct notifier_block *nb) +{ +} +#else +extern int register_memory(struct memory_block *, struct mem_section *section, struct node *); +extern int register_new_memory(struct mem_section *); +extern int unregister_memory_section(struct mem_section *); +extern int memory_dev_init(void); +extern int register_memory_notifier(struct notifier_block *nb); +extern void unregister_memory_notifier(struct notifier_block *nb); + +#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION< #include +#include +#include #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -46,6 +48,19 @@ static inline void zone_seqlock_init(struct zone *zone) { seqlock_init(&zone->span_seqlock); } +extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); +extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); +extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); +/* need some defines for these for archs that don't 
support it */ +extern void online_page(struct page *page); +/* VM interface that may be used by firmware interface */ +extern int add_memory(u64 start, u64 size); +extern int remove_memory(u64 start, u64 size); +extern int online_pages(unsigned long, unsigned long); + +/* reasonably generic interface to expand the physical pages in a zone */ +extern int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages); #else /* ! CONFIG_MEMORY_HOTPLUG */ /* * Stub functions for when hotplug is off @@ -65,5 +80,25 @@ static inline int zone_span_seqretry(struct zone *zone, unsigned iv) static inline void zone_span_writelock(struct zone *zone) {} static inline void zone_span_writeunlock(struct zone *zone) {} static inline void zone_seqlock_init(struct zone *zone) {} + +static inline int mhp_notimplemented(const char *func) +{ + printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func); + dump_stack(); + return -ENOSYS; +} + +static inline int __add_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + return mhp_notimplemented(__FUNCTION__); +} #endif /* ! 
CONFIG_MEMORY_HOTPLUG */ +static inline int __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) +{ + printk(KERN_WARNING "%s() called, not yet supported\n", __FUNCTION__); + dump_stack(); + return -ENOSYS; +} #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8a514eca40d5..5c1fb0a2e806 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -840,6 +840,7 @@ extern void free_area_init_node(int nid, pg_data_t *pgdat, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long); +extern void setup_per_zone_pages_min(void); extern void mem_init(void); extern void show_mem(void); extern void si_meminfo(struct sysinfo * val); diff --git a/mm/Kconfig b/mm/Kconfig index f35a550ba4b9..1a4473fcb2ca 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -112,6 +112,14 @@ config SPARSEMEM_EXTREME def_bool y depends on SPARSEMEM && !SPARSEMEM_STATIC +# eventually, we can have this option just 'select SPARSEMEM' +config MEMORY_HOTPLUG + bool "Allow for memory hot-add" + depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND + +comment "Memory hotplug is currently incompatible with Software Suspend" + depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. 
diff --git a/mm/Makefile b/mm/Makefile index 4cd69e3ce421..2fa6d2ca9f28 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o - +obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c new file mode 100644 index 000000000000..855e0fc928b3 --- /dev/null +++ b/mm/memory_hotplug.c @@ -0,0 +1,178 @@ +/* + * linux/mm/memory_hotplug.c + * + * Copyright (C) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size); +static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + int nid = pgdat->node_id; + int zone_type; + + zone_type = zone - pgdat->node_zones; + memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); + zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); +} + +extern int sparse_add_one_section(struct zone *, unsigned long, + struct page *mem_map); +static int __add_section(struct zone *zone, unsigned long phys_start_pfn) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + int nr_pages = PAGES_PER_SECTION; + struct 
page *memmap; + int ret; + + /* + * This can potentially allocate memory, and does its own + * internal locking. + */ + sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id); + + pgdat_resize_lock(pgdat, &flags); + memmap = __kmalloc_section_memmap(nr_pages); + ret = sparse_add_one_section(zone, phys_start_pfn, memmap); + pgdat_resize_unlock(pgdat, &flags); + + if (ret <= 0) { + /* the mem_map didn't get used */ + if (memmap >= (struct page *)VMALLOC_START && + memmap < (struct page *)VMALLOC_END) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); + } + + if (ret < 0) + return ret; + + __add_zone(zone, phys_start_pfn); + return register_new_memory(__pfn_to_section(phys_start_pfn)); +} + +/* + * Reasonably generic function for adding memory. It is + * expected that archs that support memory hotplug will + * call this function after deciding the zone to which to + * add the new pages. + */ +int __add_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages) +{ + unsigned long i; + int err = 0; + + for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { + err = __add_section(zone, phys_start_pfn + i); + + if (err) + break; + } + + return err; +} + +static void grow_zone_span(struct zone *zone, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_zone_end_pfn; + + zone_span_writelock(zone); + + old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + if (start_pfn < zone->zone_start_pfn) + zone->zone_start_pfn = start_pfn; + + if (end_pfn > old_zone_end_pfn) + zone->spanned_pages = end_pfn - zone->zone_start_pfn; + + zone_span_writeunlock(zone); +} + +static void grow_pgdat_span(struct pglist_data *pgdat, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (start_pfn < pgdat->node_start_pfn) + pgdat->node_start_pfn = start_pfn; + + if (end_pfn > 
old_pgdat_end_pfn) + pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; +} + +int online_pages(unsigned long pfn, unsigned long nr_pages) +{ + unsigned long i; + unsigned long flags; + unsigned long onlined_pages = 0; + struct zone *zone; + + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_sem. + */ + zone = page_zone(pfn_to_page(pfn)); + pgdat_resize_lock(zone->zone_pgdat, &flags); + grow_zone_span(zone, pfn, pfn + nr_pages); + grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); + pgdat_resize_unlock(zone->zone_pgdat, &flags); + + for (i = 0; i < nr_pages; i++) { + struct page *page = pfn_to_page(pfn + i); + online_page(page); + onlined_pages++; + } + zone->present_pages += onlined_pages; + + return 0; +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 817635f2ab62..183abf39b445 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1686,7 +1686,7 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; @@ -2407,7 +2407,7 @@ static void setup_per_zone_lowmem_reserve(void) * that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. 
*/ -static void setup_per_zone_pages_min(void) +void setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; From 0b0acbec1bed75ec1e1daa7f7006323a2a2b2844 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:55 -0700 Subject: [PATCH 84/98] [PATCH] memory hotplug: move section_mem_map alloc to sparse.c This basically keeps us from having to extern __kmalloc_section_memmap(). The vaddr_in_vmalloc_area() helper could go in a vmalloc header, but that header gets hard to work with, because it needs some arch-specific macros. Just stick it in here for now, instead of creating another header. Signed-off-by: Dave Hansen Signed-off-by: Lion Vollnhals Signed-off-by: Jiri Slaby Signed-off-by: Yasunori Goto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/acpi/acpi_memhotplug.c | 5 +-- drivers/base/memory.c | 5 +-- mm/memory_hotplug.c | 48 ++-------------------- mm/sparse.c | 74 +++++++++++++++++++++++++++++++--- 4 files changed, 75 insertions(+), 57 deletions(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 01a1bd239263..2143609d2936 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -200,8 +200,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) * Note: Assume that this function returns zero on success */ result = add_memory(mem_device->start_addr, - (mem_device->end_addr - mem_device->start_addr) + 1, - mem_device->read_write_attribute); + (mem_device->end_addr - mem_device->start_addr) + 1); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "\nadd_memory failed\n")); mem_device->state = MEMORY_INVALID_STATE; @@ -259,7 +258,7 @@ static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) * Ask the VM to offline this memory range. 
* Note: Assume that this function returns zero on success */ - result = remove_memory(start, len, attr); + result = remove_memory(start, len); if (result) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Hot-Remove failed.\n")); return_VALUE(result); diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 785cb6e6b91c..b7ddd651d664 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -340,15 +340,12 @@ static int memory_probe_init(void) static int add_memory_block(unsigned long node_id, struct mem_section *section, unsigned long state, int phys_device) { - size_t size = sizeof(struct memory_block); - struct memory_block *mem = kmalloc(size, GFP_KERNEL); + struct memory_block *mem = kzalloc(sizeof(*mem), GFP_KERNEL); int ret = 0; if (!mem) return -ENOMEM; - memset(mem, 0, size); - mem->phys_index = __section_nr(section); mem->state = state; init_MUTEX(&mem->state_sem); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 855e0fc928b3..2e916c308ae6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -24,28 +24,6 @@ #include -static struct page *__kmalloc_section_memmap(unsigned long nr_pages) -{ - struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * nr_pages; - - page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); - if (page) - goto got_map_page; - - ret = vmalloc(memmap_size); - if (ret) - goto got_map_ptr; - - return NULL; -got_map_page: - ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); -got_map_ptr: - memset(ret, 0, memmap_size); - - return ret; -} - extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, unsigned long size); static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) @@ -60,35 +38,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); } -extern int sparse_add_one_section(struct zone *, unsigned long, - struct page *mem_map); +extern int 
sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages); static int __add_section(struct zone *zone, unsigned long phys_start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; - struct page *memmap; int ret; - /* - * This can potentially allocate memory, and does its own - * internal locking. - */ - sparse_index_init(pfn_to_section_nr(phys_start_pfn), pgdat->node_id); - - pgdat_resize_lock(pgdat, &flags); - memmap = __kmalloc_section_memmap(nr_pages); - ret = sparse_add_one_section(zone, phys_start_pfn, memmap); - pgdat_resize_unlock(pgdat, &flags); - - if (ret <= 0) { - /* the mem_map didn't get used */ - if (memmap >= (struct page *)VMALLOC_START && - memmap < (struct page *)VMALLOC_END) - vfree(memmap); - else - free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * nr_pages)); - } + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) return ret; diff --git a/mm/sparse.c b/mm/sparse.c index 0d3bd4bf3aaa..72079b538e2d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -5,8 +5,10 @@ #include #include #include +#include #include #include +#include #include /* @@ -187,6 +189,45 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) return NULL; } +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +{ + struct page *page, *ret; + unsigned long memmap_size = sizeof(struct page) * nr_pages; + + page = alloc_pages(GFP_KERNEL, get_order(memmap_size)); + if (page) + goto got_map_page; + + ret = vmalloc(memmap_size); + if (ret) + goto got_map_ptr; + + return NULL; +got_map_page: + ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); +got_map_ptr: + memset(ret, 0, memmap_size); + + return ret; +} + +static int vaddr_in_vmalloc_area(void *addr) +{ + if (addr >= (void *)VMALLOC_START && + addr < (void *)VMALLOC_END) + return 1; + return 0; +} + +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +{ + if 
(vaddr_in_vmalloc_area(memmap)) + vfree(memmap); + else + free_pages((unsigned long)memmap, + get_order(sizeof(struct page) * nr_pages)); +} + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -212,14 +253,37 @@ void sparse_init(void) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map) +int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages) { - struct mem_section *ms = __pfn_to_section(start_pfn); + unsigned long section_nr = pfn_to_section_nr(start_pfn); + struct pglist_data *pgdat = zone->zone_pgdat; + struct mem_section *ms; + struct page *memmap; + unsigned long flags; + int ret; - if (ms->section_mem_map & SECTION_MARKED_PRESENT) - return -EEXIST; + /* + * no locking for this, because it does its own + * plus, it does a kmalloc + */ + sparse_index_init(section_nr, pgdat->node_id); + memmap = __kmalloc_section_memmap(nr_pages); + pgdat_resize_lock(pgdat, &flags); + + ms = __pfn_to_section(start_pfn); + if (ms->section_mem_map & SECTION_MARKED_PRESENT) { + ret = -EEXIST; + goto out; + } ms->section_mem_map |= SECTION_MARKED_PRESENT; - return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map); + ret = sparse_init_one_section(ms, section_nr, memmap); + + if (ret <= 0) + __kfree_section_memmap(memmap, nr_pages); +out: + pgdat_resize_unlock(pgdat, &flags); + return ret; } From 61b13993a81866fc1d4830dfab80530c9c061e37 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:56 -0700 Subject: [PATCH 85/98] [PATCH] memory hotplug: call setup_per_zone_pages_min after hotplug From: IWAMOTO Toshihiro > I found the tests does not work well with Dave's patchset. 
> I've found the followings: > > - setup_per_zone_pages_min() calls should be added in > capture_page_range() and online_pages() > - lru_add_drain() should be called before try_to_migrate_pages() The following patch deals with the first item. Signed-off-by: IWAMOTO Toshihiro Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2e916c308ae6..431a64f021c0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -132,5 +132,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) } zone->present_pages += onlined_pages; + setup_per_zone_pages_min(); + return 0; } From 05039b926374212b2d861860cf54b9e839d4dd76 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:57 -0700 Subject: [PATCH 86/98] [PATCH] memory hotplug: i386 addition functions Adds the necessary for non-NUMA hot-add of highmem to an existing zone on i386. 
Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/discontig.c | 4 +-- arch/i386/mm/init.c | 62 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 7 deletions(-) diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c index 244d8ec66be2..c4af9638dbfa 100644 --- a/arch/i386/mm/discontig.c +++ b/arch/i386/mm/discontig.c @@ -98,7 +98,7 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long find_max_low_pfn(void); extern void find_max_pfn(void); -extern void one_highpage_init(struct page *, int, int); +extern void add_one_highpage_init(struct page *, int, int); extern struct e820map e820; extern unsigned long init_pg_tables_end; @@ -427,7 +427,7 @@ void __init set_highmem_pages_init(int bad_ppro) if (!pfn_valid(node_pfn)) continue; page = pfn_to_page(node_pfn); - one_highpage_init(page, node_pfn, bad_ppro); + add_one_highpage_init(page, node_pfn, bad_ppro); } } totalram_pages += totalhigh_pages; diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2ebaf75f732e..542d9298da5e 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -266,17 +267,46 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) +void __devinit free_new_highpage(struct page *page) +{ + set_page_count(page, 1); + __free_page(page); + totalhigh_pages++; +} + +void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - set_page_count(page, 1); - __free_page(page); - totalhigh_pages++; + free_new_highpage(page); } else SetPageReserved(page); } +static int add_one_highpage_hotplug(struct page *page, unsigned long pfn) +{ + free_new_highpage(page); + totalram_pages++; +#ifdef 
CONFIG_FLATMEM + max_mapnr = max(pfn, max_mapnr); +#endif + num_physpages++; + return 0; +} + +/* + * Not currently handling the NUMA case. + * Assuming single node and all memory that + * has been added dynamically that would be + * onlined here is in HIGHMEM + */ +void online_page(struct page *page) +{ + ClearPageReserved(page); + add_one_highpage_hotplug(page, page_to_pfn(page)); +} + + #ifdef CONFIG_NUMA extern void set_highmem_pages_init(int); #else @@ -284,7 +314,7 @@ static void __init set_highmem_pages_init(int bad_ppro) { int pfn; for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) - one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); + add_one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro); totalram_pages += totalhigh_pages; } #endif /* CONFIG_FLATMEM */ @@ -615,6 +645,28 @@ void __init mem_init(void) #endif } +/* + * this is for the non-NUMA, single node SMP system case. + * Specifically, in the case of x86, we will always add + * memory to the highmem for now. + */ +#ifndef CONFIG_NEED_MULTIPLE_NODES +int add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = &contig_page_data; + struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + return __add_pages(zone, start_pfn, nr_pages); +} + +int remove_memory(u64 start, u64 size) +{ + return -EINVAL; +} +#endif + kmem_cache_t *pgd_cache; kmem_cache_t *pmd_cache; From bb7e7e032d2cb8e0e9a88a2be209de5e61033b39 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Sat, 29 Oct 2005 18:16:58 -0700 Subject: [PATCH 87/98] [PATCH] memory hotplug: ppc64 specific hot-add functions Here is a set of ppc64 specific patches that at least allow compilation/booting with the following configurations: FLATMEM SPARSEMEM SPARSEMEM + MEMORY_HOTPLUG Signed-off-by: Mike Kravetz Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc64/mm/init.c | 77 
++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 975b26de34d6..e2bd7776622f 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -871,3 +871,80 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr, return vma_prot; } EXPORT_SYMBOL(phys_mem_access_prot); + +#ifdef CONFIG_MEMORY_HOTPLUG + +void online_page(struct page *page) +{ + ClearPageReserved(page); + free_cold_page(page); + totalram_pages++; + num_physpages++; +} + +/* + * This works only for the non-NUMA case. Later, we'll need a lookup + * to convert from real physical addresses to nid, that doesn't use + * pfn_to_nid(). + */ +int __devinit add_memory(u64 start, u64 size) +{ + struct pglist_data *pgdata = NODE_DATA(0); + struct zone *zone; + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long nr_pages = size >> PAGE_SHIFT; + + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones; + + return __add_pages(zone, start_pfn, nr_pages); + + return 0; +} + +/* + * First pass at this code will check to determine if the remove + * request is within the RMO. Do not allow removal within the RMO. 
+ */ +int __devinit remove_memory(u64 start, u64 size) +{ + struct zone *zone; + unsigned long start_pfn, end_pfn, nr_pages; + + start_pfn = start >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; + end_pfn = start_pfn + nr_pages; + + printk("%s(): Attempting to remove memoy in range " + "%lx to %lx\n", __func__, start, start+size); + /* + * check for range within RMO + */ + zone = page_zone(pfn_to_page(start_pfn)); + + printk("%s(): memory will be removed from " + "the %s zone\n", __func__, zone->name); + + /* + * not handling removing memory ranges that + * overlap multiple zones yet + */ + if (end_pfn > (zone->zone_start_pfn + zone->spanned_pages)) + goto overlap; + + /* make sure it is NOT in RMO */ + if ((start < lmb.rmo_size) || ((start+size) < lmb.rmo_size)) { + printk("%s(): range to be removed must NOT be in RMO!\n", + __func__); + goto in_rmo; + } + + return __remove_pages(zone, start_pfn, nr_pages); + +overlap: + printk("%s(): memory range to be removed overlaps " + "multiple zones!!!\n", __func__); +in_rmo: + return -1; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ From 8bccd85ffbaf8ff1448d1235fa6594e207695531 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:16:59 -0700 Subject: [PATCH 88/98] [PATCH] Implement sys_* do_* layering in the memory policy layer. - Do a separation between do_xxx and sys_xxx functions. sys_xxx functions take variable sized bitmaps from user space as arguments. do_xxx functions take fixed sized nodemask_t as arguments and may be used from inside the kernel. Doing so simplifies the initialization code. There is no fs = kernel_ds assumption anymore. - Split up get_nodes into get_nodes (which gets the node list) and contextualize_policy which restricts the nodes to those accessible to the task and updates cpusets. 
- Add comments explaining limitations of bind policy Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 276 +++++++++++++++++++++++++++++-------------------- 1 file changed, 162 insertions(+), 114 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 902d4c9eccdc..123925f50f86 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2,6 +2,7 @@ * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. + * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * Subject to the GNU Public License, version 2. * * NUMA policy allows the user to give hints in which node(s) memory should @@ -17,13 +18,19 @@ * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. + * * bind Only allocate memory on a specific set of nodes, * no fallback. + * FIXME: memory is allocated starting with the first node + * to the last. It would be better if bind would truly restrict + * the allocation to memory nodes instead + * * preferred Try a specific node first before normal fallback. * As a special case node -1 here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. + * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. @@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } - -/* Copy a node mask from user space. 
*/ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, - unsigned long maxnode, int mode) -{ - unsigned long k; - unsigned long nlongs; - unsigned long endmask; - - --maxnode; - nodes_clear(*nodes); - if (maxnode == 0 || !nmask) - return 0; - - nlongs = BITS_TO_LONGS(maxnode); - if ((maxnode % BITS_PER_LONG) == 0) - endmask = ~0UL; - else - endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ - if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; - for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; - if (get_user(t, nmask + k)) - return -EFAULT; - if (k == nlongs - 1) { - if (t & endmask) - return -EINVAL; - } else if (t) - return -EINVAL; - } - nlongs = BITS_TO_LONGS(MAX_NUMNODES); - endmask = ~0UL; - } - - if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) - return -EFAULT; - nodes_addr(*nodes)[nlongs-1] &= endmask; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - /* AK: shouldn't this error out instead? */ - cpuset_restrict_to_mems_allowed(nodes_addr(*nodes)); - return mpol_check_policy(mode, nodes); -} - /* Generate a custom zonelist for the BIND policy. 
*/ static struct zonelist *bind_zonelist(nodemask_t *nodes) { @@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -/* Change policy for a memory range */ -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +static int contextualize_policy(int mode, nodemask_t *nodes) +{ + if (!nodes) + return 0; + + /* Update current mems_allowed */ + cpuset_update_current_mems_allowed(); + /* Ignore nodes not set in current->mems_allowed */ + cpuset_restrict_to_mems_allowed(nodes->bits); + return mpol_check_policy(mode, nodes); +} + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; - nodemask_t nodes; int err; if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) @@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - - new = mpol_new(mode, &nodes); + if (contextualize_policy(mode, nmask)) + return -EINVAL; + new = mpol_new(mode, nmask); if (IS_ERR(new)) return PTR_ERR(new); @@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, mode,nodes_addr(nodes)[0]); down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, &nodes, flags); + vma = check_range(mm, start, end, nmask, flags); err = PTR_ERR(vma); if (!IS_ERR(vma)) err = mbind_range(vma, start, end, new); @@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +long do_set_mempolicy(int mode, nodemask_t *nodes) { - int err; struct 
mempolicy *new; - nodemask_t nodes; - if (mode < 0 || mode > MPOL_MAX) + if (contextualize_policy(mode, nodes)) return -EINVAL; - err = get_nodes(&nodes, nmask, maxnode, mode); - if (err) - return err; - new = mpol_new(mode, &nodes); + new = mpol_new(mode, nodes); if (IS_ERR(new)) return PTR_ERR(new); mpol_free(current->mempolicy); @@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes); + node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + *nodes); break; case MPOL_DEFAULT: break; @@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) return err; } -/* Copy a kernel node mask to user space */ -static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, - nodemask_t *nodes) -{ - unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); - - if (copy > nbytes) { - if (copy > PAGE_SIZE) - return -EINVAL; - if (clear_user((char __user *)mask + nbytes, copy - nbytes)) - return -EFAULT; - copy = nbytes; - } - return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; -} - /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { - int err, pval; + int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; - if (nmask != NULL && maxnode < MAX_NUMNODES) - return -EINVAL; if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy, err = lookup_node(mm, addr); if (err < 0) goto out; - pval = err; + *policy = err; } else if (pol == current->mempolicy && pol->policy == MPOL_INTERLEAVE) { - pval = current->il_next; + *policy = current->il_next; } else { err = -EINVAL; goto out; } } else - pval = pol->policy; + *policy = pol->policy; if (vma) { up_read(¤t->mm->mmap_sem); vma = NULL; } - if (policy && put_user(pval, policy)) - return -EFAULT; - err = 0; - if (nmask) { - nodemask_t nodes; - get_zonemask(pol, &nodes); - err = copy_nodes_to_user(nmask, maxnode, &nodes); - } + if (nmask) + get_zonemask(pol, nmask); out: if (vma) @@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy, return err; } +/* + * User space interface with variable sized bitmaps for nodelists. + */ + +/* Copy a node mask from user space. 
*/ +static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, + unsigned long maxnode) +{ + unsigned long k; + unsigned long nlongs; + unsigned long endmask; + + --maxnode; + nodes_clear(*nodes); + if (maxnode == 0 || !nmask) + return 0; + + nlongs = BITS_TO_LONGS(maxnode); + if ((maxnode % BITS_PER_LONG) == 0) + endmask = ~0UL; + else + endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; + + /* When the user specified more nodes than supported just check + if the non supported part is all zero. */ + if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { + if (nlongs > PAGE_SIZE/sizeof(long)) + return -EINVAL; + for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { + unsigned long t; + if (get_user(t, nmask + k)) + return -EFAULT; + if (k == nlongs - 1) { + if (t & endmask) + return -EINVAL; + } else if (t) + return -EINVAL; + } + nlongs = BITS_TO_LONGS(MAX_NUMNODES); + endmask = ~0UL; + } + + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) + return -EFAULT; + nodes_addr(*nodes)[nlongs-1] &= endmask; + return 0; +} + +/* Copy a kernel node mask to user space */ +static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, + nodemask_t *nodes) +{ + unsigned long copy = ALIGN(maxnode-1, 64) / 8; + const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + + if (copy > nbytes) { + if (copy > PAGE_SIZE) + return -EINVAL; + if (clear_user((char __user *)mask + nbytes, copy - nbytes)) + return -EFAULT; + copy = nbytes; + } + return copy_to_user(mask, nodes_addr(*nodes), copy) ? 
-EFAULT : 0; +} + +asmlinkage long sys_mbind(unsigned long start, unsigned long len, + unsigned long mode, + unsigned long __user *nmask, unsigned long maxnode, + unsigned flags) +{ + nodemask_t nodes; + int err; + + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_mbind(start, len, mode, &nodes, flags); +} + +/* Set the process memory policy */ +asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, + unsigned long maxnode) +{ + int err; + nodemask_t nodes; + + if (mode < 0 || mode > MPOL_MAX) + return -EINVAL; + err = get_nodes(&nodes, nmask, maxnode); + if (err) + return err; + return do_set_mempolicy(mode, &nodes); +} + +/* Retrieve NUMA policy */ +asmlinkage long sys_get_mempolicy(int __user *policy, + unsigned long __user *nmask, + unsigned long maxnode, + unsigned long addr, unsigned long flags) +{ + int err, pval; + nodemask_t nodes; + + if (nmask != NULL && maxnode < MAX_NUMNODES) + return -EINVAL; + + err = do_get_mempolicy(&pval, &nodes, addr, flags); + + if (err) + return err; + + if (policy && put_user(pval, policy)) + return -EFAULT; + + if (nmask) + err = copy_nodes_to_user(nmask, maxnode, &nodes); + + return err; +} + #ifdef CONFIG_COMPAT asmlinkage long compat_sys_get_mempolicy(int __user *policy, @@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); + pol = vma->vm_ops->get_policy(vma, addr); else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; @@ -1147,14 +1197,12 @@ void __init numa_policy_init(void) /* Set interleaving policy for system init. This way not all the data structures allocated at system boot end up in node zero. 
*/ - if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map), - MAX_NUMNODES) < 0) + if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) printk("numa_policy_init: interleaving failed\n"); } -/* Reset policy of current process to default. - * Assumes fs == KERNEL_DS */ +/* Reset policy of current process to default */ void numa_default_policy(void) { - sys_set_mempolicy(MPOL_DEFAULT, NULL, 0); + do_set_mempolicy(MPOL_DEFAULT, NULL); } From 5fcbb23050936d69de8087d4b311eaf55cb42740 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 29 Oct 2005 18:17:00 -0700 Subject: [PATCH 89/98] [PATCH] Remove policy contextualization from mbind Policy contextualization is only useful for task based policies and not for vma based policies. It may be useful to define allowed nodes that are not accessible from this thread because other threads may have access to these nodes. Without this patch strange memory policy situations may cause an application to fail with out of memory. Example: Let's say we have two threads A and B that share the same address space and a huge computational array X. Thread A is restricted by its cpuset to nodes 0 and 1 and thread B is restricted by its cpuset to nodes 2 and 3. Thread A now wants to restrict allocations to the first node and thus applies a BIND policy on X to node 0 and 2. The cpuset limits this to node 0. Thus pages for X must be allocated on node 0 now. Thread B now touches a page that has never been used in X and faults in a page. According to the BIND policy of the vma for X the page must be allocated on node 0. However, the cpuset of B does not allow allocation on 0 and 1. Now the application fails in alloc_pages with out of memory. 
Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 123925f50f86..2076b1542b8a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -370,7 +370,7 @@ long do_mbind(unsigned long start, unsigned long len, return -EINVAL; if (end == start) return 0; - if (contextualize_policy(mode, nmask)) + if (mpol_check_policy(mode, nmask)) return -EINVAL; new = mpol_new(mode, nmask); if (IS_ERR(new)) From 2f96996de0eda378df2a5f857ee1ef615ae10a4f Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Sat, 29 Oct 2005 18:17:01 -0700 Subject: [PATCH 90/98] [PATCH] mm: wider use of for_each_*cpu() In 'mm' change the explicit use of a for-loop using NR_CPUS into the general for_each_cpu() constructs. This widens the scope of potential future optimizations of the general constructs, as well as takes advantage of the existing optimizations of first_cpu() and next_cpu(), which is advantageous when the true CPU count is much smaller than NR_CPUS. Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 183abf39b445..2dbdd98426fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1331,12 +1331,9 @@ void show_free_areas(void) } else printk("\n"); - for (cpu = 0; cpu < NR_CPUS; ++cpu) { + for_each_cpu(cpu) { struct per_cpu_pageset *pageset; - if (!cpu_possible(cpu)) - continue; - pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) From b1459461f1e0abd5c28317d6bff6f2ca612a719d Mon Sep 17 00:00:00 2001 From: Nikita Danilov Date: Sat, 29 Oct 2005 18:17:02 -0700 Subject: [PATCH 91/98] [PATCH] mm/filemap.c:filemap_populate(): move export. 
move EXPORT_SYMBOL(filemap_populate) to the proper place: just after function itself: it's easy to miss that function is exported otherwise. Signed-off-by: Nikita Danilov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 036599d1177e..768687f1d46b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1537,6 +1537,7 @@ repeat: return 0; } +EXPORT_SYMBOL(filemap_populate); struct vm_operations_struct generic_file_vm_ops = { .nopage = filemap_nopage, @@ -1555,7 +1556,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &generic_file_vm_ops; return 0; } -EXPORT_SYMBOL(filemap_populate); /* * This is for filesystems which do not implement ->writepage. From a7dac447bb9cef27d4d29cdf63e2d7809c50b1f4 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 30 Oct 2005 04:44:42 -0500 Subject: [PATCH 92/98] [libata] change ata_qc_complete() to take error mask as second arg The second argument to ata_qc_complete() was being used for two purposes: communicate the ATA Status register to the completion function, and indicate an error. On legacy PCI IDE hardware, the latter is often implicit in the former. On more modern hardware, the driver often completely emulated a Status register value, passing ATA_ERR as an indication that something went wrong. Now that previous code changes have eliminated the need to use drv_stat arg to communicate the ATA Status register value, we can convert it to a mask of possible error classes. This will lead to more flexible error handling in the future. 
--- drivers/scsi/ahci.c | 4 ++-- drivers/scsi/libata-core.c | 31 ++++++++++++------------- drivers/scsi/libata-scsi.c | 46 ++++++++++++++++++++++--------------- drivers/scsi/libata.h | 2 +- drivers/scsi/pdc_adma.c | 13 +++++++---- drivers/scsi/sata_mv.c | 11 +++++---- drivers/scsi/sata_promise.c | 16 ++++++------- drivers/scsi/sata_qstor.c | 7 +++--- drivers/scsi/sata_sil24.c | 12 ++++++---- drivers/scsi/sata_sx4.c | 10 ++++---- include/linux/libata.h | 28 ++++++++++++++++++++-- 11 files changed, 109 insertions(+), 71 deletions(-) diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index 03829aedfd39..5efb3c50aa8a 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -600,7 +600,7 @@ static void ahci_eng_timeout(struct ata_port *ap) * not being called from the SCSI EH. */ qc->scsidone = scsi_finish_command; - ata_qc_complete(qc, ATA_ERR); + ata_qc_complete(qc, AC_ERR_OTHER); } spin_unlock_irqrestore(&host_set->lock, flags); @@ -629,7 +629,7 @@ static inline int ahci_host_intr(struct ata_port *ap, struct ata_queued_cmd *qc) if (status & PORT_IRQ_FATAL) { ahci_intr_error(ap, status); if (qc) - ata_qc_complete(qc, ATA_ERR); + ata_qc_complete(qc, AC_ERR_OTHER); } return 1; diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index 771bc7d376bc..cc089f1fb114 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -2663,7 +2663,7 @@ static int ata_sg_setup(struct ata_queued_cmd *qc) * None. 
(grabs host lock) */ -void ata_poll_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) +void ata_poll_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask) { struct ata_port *ap = qc->ap; unsigned long flags; @@ -2671,7 +2671,7 @@ void ata_poll_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) spin_lock_irqsave(&ap->host_set->lock, flags); ap->flags &= ~ATA_FLAG_NOINTR; ata_irq_on(ap); - ata_qc_complete(qc, drv_stat); + ata_qc_complete(qc, err_mask); spin_unlock_irqrestore(&ap->host_set->lock, flags); } @@ -2768,7 +2768,7 @@ static int ata_pio_complete (struct ata_port *ap) ap->hsm_task_state = HSM_ST_IDLE; - ata_poll_qc_complete(qc, drv_stat); + ata_poll_qc_complete(qc, 0); /* another command may start at this point */ @@ -3136,18 +3136,15 @@ static void ata_pio_block(struct ata_port *ap) static void ata_pio_error(struct ata_port *ap) { struct ata_queued_cmd *qc; - u8 drv_stat; + + printk(KERN_WARNING "ata%u: PIO error\n", ap->id); qc = ata_qc_from_tag(ap, ap->active_tag); assert(qc != NULL); - drv_stat = ata_chk_status(ap); - printk(KERN_WARNING "ata%u: PIO error, drv_stat 0x%x\n", - ap->id, drv_stat); - ap->hsm_task_state = HSM_ST_IDLE; - ata_poll_qc_complete(qc, drv_stat | ATA_ERR); + ata_poll_qc_complete(qc, AC_ERR_ATA_BUS); } static void ata_pio_task(void *_data) @@ -3270,7 +3267,7 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc) ap->id, qc->tf.command, drv_stat, host_stat); /* complete taskfile transaction */ - ata_qc_complete(qc, drv_stat); + ata_qc_complete(qc, ac_err_mask(drv_stat)); break; } @@ -3375,7 +3372,7 @@ struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap, return qc; } -int ata_qc_complete_noop(struct ata_queued_cmd *qc, u8 drv_stat) +int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask) { return 0; } @@ -3434,7 +3431,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) * spin_lock_irqsave(host_set lock) */ -void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) +void ata_qc_complete(struct 
ata_queued_cmd *qc, unsigned int err_mask) { int rc; @@ -3451,7 +3448,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) qc->flags &= ~ATA_QCFLAG_ACTIVE; /* call completion callback */ - rc = qc->complete_fn(qc, drv_stat); + rc = qc->complete_fn(qc, err_mask); /* if callback indicates not to complete command (non-zero), * return immediately @@ -3889,7 +3886,7 @@ inline unsigned int ata_host_intr (struct ata_port *ap, ap->ops->irq_clear(ap); /* complete taskfile transaction */ - ata_qc_complete(qc, status); + ata_qc_complete(qc, ac_err_mask(status)); break; default: @@ -3984,7 +3981,7 @@ static void atapi_packet_task(void *_data) /* sleep-wait for BSY to clear */ DPRINTK("busy wait\n"); if (ata_busy_sleep(ap, ATA_TMOUT_CDB_QUICK, ATA_TMOUT_CDB)) - goto err_out; + goto err_out_status; /* make sure DRQ is set */ status = ata_chk_status(ap); @@ -4021,8 +4018,10 @@ static void atapi_packet_task(void *_data) return; +err_out_status: + status = ata_chk_status(ap); err_out: - ata_poll_qc_complete(qc, ATA_ERR); + ata_poll_qc_complete(qc, __ac_err_mask(status)); } diff --git a/drivers/scsi/libata-scsi.c b/drivers/scsi/libata-scsi.c index 89a04b1a5a0e..1e3792f86fcf 100644 --- a/drivers/scsi/libata-scsi.c +++ b/drivers/scsi/libata-scsi.c @@ -560,7 +560,7 @@ void ata_gen_ata_desc_sense(struct ata_queued_cmd *qc) * Use ata_to_sense_error() to map status register bits * onto sense key, asc & ascq. */ - if (unlikely(tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ))) { + if (tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) { ata_to_sense_error(qc->ap->id, tf->command, tf->feature, &sb[1], &sb[2], &sb[3]); sb[1] &= 0x0f; @@ -635,7 +635,7 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc) * Use ata_to_sense_error() to map status register bits * onto sense key, asc & ascq. 
*/ - if (unlikely(tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ))) { + if (tf->command & (ATA_BUSY | ATA_DF | ATA_ERR | ATA_DRQ)) { ata_to_sense_error(qc->ap->id, tf->command, tf->feature, &sb[2], &sb[12], &sb[13]); sb[2] &= 0x0f; @@ -644,7 +644,11 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc) sb[0] = 0x70; sb[7] = 0x0a; - if (tf->flags & ATA_TFLAG_LBA && !(tf->flags & ATA_TFLAG_LBA48)) { + if (tf->flags & ATA_TFLAG_LBA48) { + /* TODO: find solution for LBA48 descriptors */ + } + + else if (tf->flags & ATA_TFLAG_LBA) { /* A small (28b) LBA will fit in the 32b info field */ sb[0] |= 0x80; /* set valid bit */ sb[3] = tf->device & 0x0f; @@ -652,6 +656,10 @@ void ata_gen_fixed_sense(struct ata_queued_cmd *qc) sb[5] = tf->lbam; sb[6] = tf->lbal; } + + else { + /* TODO: C/H/S */ + } } /** @@ -1199,10 +1207,12 @@ nothing_to_do: return 1; } -static int ata_scsi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) +static int ata_scsi_qc_complete(struct ata_queued_cmd *qc, + unsigned int err_mask) { struct scsi_cmnd *cmd = qc->scsicmd; - int need_sense = drv_stat & (ATA_ERR | ATA_BUSY | ATA_DRQ); + u8 *cdb = cmd->cmnd; + int need_sense = (err_mask != 0); /* For ATA pass thru (SAT) commands, generate a sense block if * user mandated it or if there's an error. Note that if we @@ -1211,8 +1221,8 @@ static int ata_scsi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) * whether the command completed successfully or not. If there * was no error, SK, ASC and ASCQ will all be zero. 
*/ - if (((cmd->cmnd[0] == ATA_16) || (cmd->cmnd[0] == ATA_12)) && - ((cmd->cmnd[2] & 0x20) || need_sense)) { + if (((cdb[0] == ATA_16) || (cdb[0] == ATA_12)) && + ((cdb[2] & 0x20) || need_sense)) { ata_gen_ata_desc_sense(qc); } else { if (!need_sense) { @@ -1995,21 +2005,13 @@ void atapi_request_sense(struct ata_port *ap, struct ata_device *dev, DPRINTK("EXIT\n"); } -static int atapi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) +static int atapi_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask) { struct scsi_cmnd *cmd = qc->scsicmd; - VPRINTK("ENTER, drv_stat == 0x%x\n", drv_stat); + VPRINTK("ENTER, err_mask 0x%X\n", err_mask); - if (unlikely(drv_stat & (ATA_BUSY | ATA_DRQ))) - /* FIXME: not quite right; we don't want the - * translation of taskfile registers into - * a sense descriptors, since that's only - * correct for ATA, not ATAPI - */ - ata_gen_ata_desc_sense(qc); - - else if (unlikely(drv_stat & ATA_ERR)) { + if (unlikely(err_mask & AC_ERR_DEV)) { DPRINTK("request check condition\n"); /* FIXME: command completion with check condition @@ -2026,6 +2028,14 @@ static int atapi_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat) return 1; } + else if (unlikely(err_mask)) + /* FIXME: not quite right; we don't want the + * translation of taskfile registers into + * a sense descriptors, since that's only + * correct for ATA, not ATAPI + */ + ata_gen_ata_desc_sense(qc); + else { u8 *scsicmd = cmd->cmnd; diff --git a/drivers/scsi/libata.h b/drivers/scsi/libata.h index 65c264b91136..10ecd9e15e4f 100644 --- a/drivers/scsi/libata.h +++ b/drivers/scsi/libata.h @@ -39,7 +39,7 @@ struct ata_scsi_args { /* libata-core.c */ extern int atapi_enabled; -extern int ata_qc_complete_noop(struct ata_queued_cmd *qc, u8 drv_stat); +extern int ata_qc_complete_noop(struct ata_queued_cmd *qc, unsigned int err_mask); extern struct ata_queued_cmd *ata_qc_new_init(struct ata_port *ap, struct ata_device *dev); extern void ata_rwcmd_protocol(struct ata_queued_cmd *qc); 
diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c index af99feb9d237..74b3b2c629b6 100644 --- a/drivers/scsi/pdc_adma.c +++ b/drivers/scsi/pdc_adma.c @@ -451,7 +451,7 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set) struct adma_port_priv *pp; struct ata_queued_cmd *qc; void __iomem *chan = ADMA_REGS(mmio_base, port_no); - u8 drv_stat = 0, status = readb(chan + ADMA_STATUS); + u8 status = readb(chan + ADMA_STATUS); if (status == 0) continue; @@ -464,11 +464,14 @@ static inline unsigned int adma_intr_pkt(struct ata_host_set *host_set) continue; qc = ata_qc_from_tag(ap, ap->active_tag); if (qc && (!(qc->tf.ctl & ATA_NIEN))) { + unsigned int err_mask = 0; + if ((status & (aPERR | aPSD | aUIRQ))) - drv_stat = ATA_ERR; + err_mask = AC_ERR_OTHER; else if (pp->pkt[0] != cDONE) - drv_stat = ATA_ERR; - ata_qc_complete(qc, drv_stat); + err_mask = AC_ERR_OTHER; + + ata_qc_complete(qc, err_mask); } } return handled; @@ -498,7 +501,7 @@ static inline unsigned int adma_intr_mmio(struct ata_host_set *host_set) /* complete taskfile transaction */ pp->state = adma_state_idle; - ata_qc_complete(qc, status); + ata_qc_complete(qc, ac_err_mask(status)); handled = 1; } } diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c index dcef5fe8600b..936d1ce5575f 100644 --- a/drivers/scsi/sata_mv.c +++ b/drivers/scsi/sata_mv.c @@ -1065,6 +1065,7 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant, struct ata_queued_cmd *qc; u32 hc_irq_cause; int shift, port, port0, hard_port, handled; + unsigned int err_mask; u8 ata_status = 0; if (hc == 0) { @@ -1100,15 +1101,15 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant, handled++; } + err_mask = ac_err_mask(ata_status); + shift = port << 1; /* (port * 2) */ if (port >= MV_PORTS_PER_HC) { shift++; /* skip bit 8 in the HC Main IRQ reg */ } if ((PORT0_ERR << shift) & relevant) { mv_err_intr(ap); - /* OR in ATA_ERR to ensure libata knows we took one */ - ata_status = 
readb((void __iomem *) - ap->ioaddr.status_addr) | ATA_ERR; + err_mask |= AC_ERR_OTHER; handled++; } @@ -1118,7 +1119,7 @@ static void mv_host_intr(struct ata_host_set *host_set, u32 relevant, VPRINTK("port %u IRQ found for qc, " "ata_status 0x%x\n", port,ata_status); /* mark qc status appropriately */ - ata_qc_complete(qc, ata_status); + ata_qc_complete(qc, err_mask); } } } @@ -1294,7 +1295,7 @@ static void mv_eng_timeout(struct ata_port *ap) */ spin_lock_irqsave(&ap->host_set->lock, flags); qc->scsidone = scsi_finish_command; - ata_qc_complete(qc, ATA_ERR); + ata_qc_complete(qc, AC_ERR_OTHER); spin_unlock_irqrestore(&ap->host_set->lock, flags); } } diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 63911f16b6ec..8f41702275db 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -399,7 +399,8 @@ static void pdc_eng_timeout(struct ata_port *ap) case ATA_PROT_DMA: case ATA_PROT_NODATA: printk(KERN_ERR "ata%u: command timeout\n", ap->id); - ata_qc_complete(qc, ata_wait_idle(ap) | ATA_ERR); + drv_stat = ata_wait_idle(ap); + ata_qc_complete(qc, __ac_err_mask(drv_stat)); break; default: @@ -408,7 +409,7 @@ static void pdc_eng_timeout(struct ata_port *ap) printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n", ap->id, qc->tf.command, drv_stat); - ata_qc_complete(qc, drv_stat); + ata_qc_complete(qc, ac_err_mask(drv_stat)); break; } @@ -420,24 +421,21 @@ out: static inline unsigned int pdc_host_intr( struct ata_port *ap, struct ata_queued_cmd *qc) { - u8 status; - unsigned int handled = 0, have_err = 0; + unsigned int handled = 0, err_mask = 0; u32 tmp; void __iomem *mmio = (void __iomem *) ap->ioaddr.cmd_addr + PDC_GLOBAL_CTL; tmp = readl(mmio); if (tmp & PDC_ERR_MASK) { - have_err = 1; + err_mask = AC_ERR_DEV; pdc_reset_port(ap); } switch (qc->tf.protocol) { case ATA_PROT_DMA: case ATA_PROT_NODATA: - status = ata_wait_idle(ap); - if (have_err) - status |= ATA_ERR; - ata_qc_complete(qc, status); + err_mask |= 
ac_err_mask(ata_wait_idle(ap)); + ata_qc_complete(qc, err_mask); handled = 1; break; diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c index 1aaf3304d397..d95a02fa7afb 100644 --- a/drivers/scsi/sata_qstor.c +++ b/drivers/scsi/sata_qstor.c @@ -400,11 +400,12 @@ static inline unsigned int qs_intr_pkt(struct ata_host_set *host_set) qc = ata_qc_from_tag(ap, ap->active_tag); if (qc && (!(qc->tf.ctl & ATA_NIEN))) { switch (sHST) { - case 0: /* sucessful CPB */ + case 0: /* successful CPB */ case 3: /* device error */ pp->state = qs_state_idle; qs_enter_reg_mode(qc->ap); - ata_qc_complete(qc, sDST); + ata_qc_complete(qc, + ac_err_mask(sDST)); break; default: break; @@ -441,7 +442,7 @@ static inline unsigned int qs_intr_mmio(struct ata_host_set *host_set) /* complete taskfile transaction */ pp->state = qs_state_idle; - ata_qc_complete(qc, status); + ata_qc_complete(qc, ac_err_mask(status)); handled = 1; } } diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c index e18a1e2bb65e..4afe2b15b803 100644 --- a/drivers/scsi/sata_sil24.c +++ b/drivers/scsi/sata_sil24.c @@ -498,7 +498,7 @@ static void sil24_eng_timeout(struct ata_port *ap) qc = ata_qc_from_tag(ap, ap->active_tag); if (!qc) { - printk(KERN_ERR "ata%u: BUG: tiemout without command\n", + printk(KERN_ERR "ata%u: BUG: timeout without command\n", ap->id); return; } @@ -512,7 +512,7 @@ static void sil24_eng_timeout(struct ata_port *ap) */ printk(KERN_ERR "ata%u: command timeout\n", ap->id); qc->scsidone = scsi_finish_command; - ata_qc_complete(qc, ATA_ERR); + ata_qc_complete(qc, AC_ERR_OTHER); sil24_reset_controller(ap); } @@ -523,6 +523,7 @@ static void sil24_error_intr(struct ata_port *ap, u32 slot_stat) struct sil24_port_priv *pp = ap->private_data; void __iomem *port = (void __iomem *)ap->ioaddr.cmd_addr; u32 irq_stat, cmd_err, sstatus, serror; + unsigned int err_mask; irq_stat = readl(port + PORT_IRQ_STAT); writel(irq_stat, port + PORT_IRQ_STAT); /* clear irq */ @@ -550,17 +551,18 @@ 
static void sil24_error_intr(struct ata_port *ap, u32 slot_stat) * Device is reporting error, tf registers are valid. */ sil24_update_tf(ap); + err_mask = ac_err_mask(pp->tf.command); } else { /* * Other errors. libata currently doesn't have any * mechanism to report these errors. Just turn on * ATA_ERR. */ - pp->tf.command = ATA_ERR; + err_mask = AC_ERR_OTHER; } if (qc) - ata_qc_complete(qc, pp->tf.command); + ata_qc_complete(qc, err_mask); sil24_reset_controller(ap); } @@ -585,7 +587,7 @@ static inline void sil24_host_intr(struct ata_port *ap) sil24_update_tf(ap); if (qc) - ata_qc_complete(qc, pp->tf.command); + ata_qc_complete(qc, ac_err_mask(pp->tf.command)); } else sil24_error_intr(ap, slot_stat); } diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c index af08f4f650c1..d9a8baff0d4d 100644 --- a/drivers/scsi/sata_sx4.c +++ b/drivers/scsi/sata_sx4.c @@ -718,7 +718,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap, VPRINTK("ata%u: read hdma, 0x%x 0x%x\n", ap->id, readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT)); /* get drive status; clear intr; complete txn */ - ata_qc_complete(qc, ata_wait_idle(ap)); + ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap))); pdc20621_pop_hdma(qc); } @@ -756,7 +756,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap, VPRINTK("ata%u: write ata, 0x%x 0x%x\n", ap->id, readl(mmio + 0x104), readl(mmio + PDC_HDMA_CTLSTAT)); /* get drive status; clear intr; complete txn */ - ata_qc_complete(qc, ata_wait_idle(ap)); + ata_qc_complete(qc, ac_err_mask(ata_wait_idle(ap))); pdc20621_pop_hdma(qc); } handled = 1; @@ -766,7 +766,7 @@ static inline unsigned int pdc20621_host_intr( struct ata_port *ap, status = ata_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000); DPRINTK("BUS_NODATA (drv_stat 0x%X)\n", status); - ata_qc_complete(qc, status); + ata_qc_complete(qc, ac_err_mask(status)); handled = 1; } else { @@ -881,7 +881,7 @@ static void pdc_eng_timeout(struct ata_port *ap) case ATA_PROT_DMA: case 
ATA_PROT_NODATA: printk(KERN_ERR "ata%u: command timeout\n", ap->id); - ata_qc_complete(qc, ata_wait_idle(ap) | ATA_ERR); + ata_qc_complete(qc, __ac_err_mask(ata_wait_idle(ap))); break; default: @@ -890,7 +890,7 @@ static void pdc_eng_timeout(struct ata_port *ap) printk(KERN_ERR "ata%u: unknown timeout, cmd 0x%x stat 0x%x\n", ap->id, qc->tf.command, drv_stat); - ata_qc_complete(qc, drv_stat); + ata_qc_complete(qc, ac_err_mask(drv_stat)); break; } diff --git a/include/linux/libata.h b/include/linux/libata.h index a4cce9936a80..0ba3af7a1236 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -172,6 +172,13 @@ enum hsm_task_states { HSM_ST_ERR, }; +enum ata_completion_errors { + AC_ERR_OTHER = (1 << 0), + AC_ERR_DEV = (1 << 1), + AC_ERR_ATA_BUS = (1 << 2), + AC_ERR_HOST_BUS = (1 << 3), +}; + /* forward declarations */ struct scsi_device; struct ata_port_operations; @@ -179,7 +186,7 @@ struct ata_port; struct ata_queued_cmd; /* typedefs */ -typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc, u8 drv_stat); +typedef int (*ata_qc_cb_t) (struct ata_queued_cmd *qc, unsigned int err_mask); struct ata_ioports { unsigned long cmd_addr; @@ -453,7 +460,7 @@ extern void ata_bmdma_start (struct ata_queued_cmd *qc); extern void ata_bmdma_stop(struct ata_queued_cmd *qc); extern u8 ata_bmdma_status(struct ata_port *ap); extern void ata_bmdma_irq_clear(struct ata_port *ap); -extern void ata_qc_complete(struct ata_queued_cmd *qc, u8 drv_stat); +extern void ata_qc_complete(struct ata_queued_cmd *qc, unsigned int err_mask); extern void ata_eng_timeout(struct ata_port *ap); extern void ata_scsi_simulate(u16 *id, struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)); @@ -716,4 +723,21 @@ static inline int ata_try_flush_cache(const struct ata_device *dev) ata_id_has_flush_ext(dev->id); } +static inline unsigned int ac_err_mask(u8 status) +{ + if (status & ATA_BUSY) + return AC_ERR_ATA_BUS; + if (status & (ATA_ERR | ATA_DF)) + return AC_ERR_DEV; + return 0; +} + +static 
inline unsigned int __ac_err_mask(u8 status) +{ + unsigned int mask = ac_err_mask(status); + if (mask == 0) + return AC_ERR_OTHER; + return mask; +} + #endif /* __LINUX_LIBATA_H__ */ From 0f0d5192696eeb3072944a4a813809cc2c85891a Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 30 Oct 2005 06:41:29 -0500 Subject: [PATCH 93/98] [libata] fix legacy IDE probing ata_pci_init_one() receives an array of struct ata_port_info. Recent updates to the code had always obtained port information from array element 0, rather than array element N. Change to avoid hardcoding port_info[0], thereby restoring proper hardware information to secondary legacy ports. --- drivers/scsi/libata-core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/libata-core.c b/drivers/scsi/libata-core.c index cc089f1fb114..8be7dc0b47b8 100644 --- a/drivers/scsi/libata-core.c +++ b/drivers/scsi/libata-core.c @@ -4527,11 +4527,11 @@ ata_pci_init_native_mode(struct pci_dev *pdev, struct ata_port_info **port, int return probe_ent; } -static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev, struct ata_port_info **port, int port_num) +static struct ata_probe_ent *ata_pci_init_legacy_port(struct pci_dev *pdev, struct ata_port_info *port, int port_num) { struct ata_probe_ent *probe_ent; - probe_ent = ata_probe_ent_alloc(pci_dev_to_dev(pdev), port[0]); + probe_ent = ata_probe_ent_alloc(pci_dev_to_dev(pdev), port); if (!probe_ent) return NULL; @@ -4678,9 +4678,9 @@ int ata_pci_init_one (struct pci_dev *pdev, struct ata_port_info **port_info, if (legacy_mode) { if (legacy_mode & (1 << 0)) - probe_ent = ata_pci_init_legacy_port(pdev, port, 0); + probe_ent = ata_pci_init_legacy_port(pdev, port[0], 0); if (legacy_mode & (1 << 1)) - probe_ent2 = ata_pci_init_legacy_port(pdev, port, 1); + probe_ent2 = ata_pci_init_legacy_port(pdev, port[1], 1); } else { if (n_ports == 2) probe_ent = ata_pci_init_native_mode(pdev, port, ATA_PORT_PRIMARY | ATA_PORT_SECONDARY); 
From 6248e64721a4f3db9fbedd207206f47835acce44 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 30 Oct 2005 06:42:18 -0500 Subject: [PATCH 94/98] [libata ata_piix] use dev_printk() where appropriate --- drivers/scsi/ata_piix.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c index be021478f416..b7fbf11f3fae 100644 --- a/drivers/scsi/ata_piix.c +++ b/drivers/scsi/ata_piix.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -625,7 +626,8 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) unsigned int pata_chan = 0, sata_chan = 0; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, + "version " DRV_VERSION "\n"); /* no hotplugging support (FIXME) */ if (!in_module_init) @@ -672,7 +674,9 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) port_info[pata_chan] = &piix_port_info[ich5_pata]; n_ports++; - printk(KERN_WARNING DRV_NAME ": combined mode detected\n"); + dev_printk(KERN_WARNING, &pdev->dev, + "combined mode detected (p=%u, s=%u)\n", + pata_chan, sata_chan); } return ata_pci_init_one(pdev, port_info, n_ports); From fbf30fbaa61595e9026f628f3913888b0df2b288 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 30 Oct 2005 07:57:31 -0500 Subject: [PATCH 95/98] [libata ata_piix] fix native mode probe, after recent updates --- drivers/scsi/ata_piix.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ata_piix.c b/drivers/scsi/ata_piix.c index b7fbf11f3fae..7f8aa1b552ce 100644 --- a/drivers/scsi/ata_piix.c +++ b/drivers/scsi/ata_piix.c @@ -622,7 +622,7 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) { static int printed_version; struct ata_port_info *port_info[2]; - unsigned int combined = 0, n_ports = 1; + unsigned int combined = 0; unsigned int 
pata_chan = 0, sata_chan = 0; if (!printed_version++) @@ -634,7 +634,7 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) return -ENODEV; port_info[0] = &piix_port_info[ent->driver_data]; - port_info[1] = NULL; + port_info[1] = &piix_port_info[ent->driver_data]; if (port_info[0]->host_flags & PIIX_FLAG_AHCI) { u8 tmp; @@ -672,14 +672,13 @@ static int piix_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) port_info[sata_chan] = &piix_port_info[ent->driver_data]; port_info[sata_chan]->host_flags |= ATA_FLAG_SLAVE_POSS; port_info[pata_chan] = &piix_port_info[ich5_pata]; - n_ports++; dev_printk(KERN_WARNING, &pdev->dev, "combined mode detected (p=%u, s=%u)\n", pata_chan, sata_chan); } - return ata_pci_init_one(pdev, port_info, n_ports); + return ata_pci_init_one(pdev, port_info, 2); } static int __init piix_init(void) From a9524a76f70f3343e4be27f95a7e92a8ba5f9009 Mon Sep 17 00:00:00 2001 From: Jeff Garzik Date: Sun, 30 Oct 2005 14:39:11 -0500 Subject: [PATCH 96/98] [libata] use dev_printk() throughout drivers A few drivers were not following the standard meme of printing out their driver name and version at module load time; this is fixed as well. 
--- drivers/scsi/ahci.c | 42 ++++++++++++++++++------------------- drivers/scsi/pdc_adma.c | 13 ++++++------ drivers/scsi/sata_mv.c | 12 +++++------ drivers/scsi/sata_nv.c | 3 ++- drivers/scsi/sata_promise.c | 3 ++- drivers/scsi/sata_qstor.c | 18 +++++++--------- drivers/scsi/sata_sil.c | 7 ++++--- drivers/scsi/sata_sil24.c | 21 +++++++++---------- drivers/scsi/sata_sis.c | 13 +++++++++--- drivers/scsi/sata_svw.c | 3 ++- drivers/scsi/sata_sx4.c | 3 ++- drivers/scsi/sata_uli.c | 5 +++++ drivers/scsi/sata_via.c | 38 ++++++++++++++++++--------------- drivers/scsi/sata_vsc.c | 3 ++- 14 files changed, 101 insertions(+), 83 deletions(-) diff --git a/drivers/scsi/ahci.c b/drivers/scsi/ahci.c index 5efb3c50aa8a..e2a5657d5fdb 100644 --- a/drivers/scsi/ahci.c +++ b/drivers/scsi/ahci.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -674,10 +675,10 @@ static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs * if (!ahci_host_intr(ap, qc)) if (ata_ratelimit()) { struct pci_dev *pdev = - to_pci_dev(ap->host_set->dev); - printk(KERN_WARNING - "ahci(%s): unhandled interrupt on port %u\n", - pci_name(pdev), i); + to_pci_dev(ap->host_set->dev); + dev_printk(KERN_WARNING, &pdev->dev, + "unhandled interrupt on port %u\n", + i); } VPRINTK("port %u\n", i); @@ -685,10 +686,9 @@ static irqreturn_t ahci_interrupt (int irq, void *dev_instance, struct pt_regs * VPRINTK("port %u (no irq)\n", i); if (ata_ratelimit()) { struct pci_dev *pdev = - to_pci_dev(ap->host_set->dev); - printk(KERN_WARNING - "ahci(%s): interrupt on disabled port %u\n", - pci_name(pdev), i); + to_pci_dev(ap->host_set->dev); + dev_printk(KERN_WARNING, &pdev->dev, + "interrupt on disabled port %u\n", i); } } @@ -760,8 +760,8 @@ static int ahci_host_init(struct ata_probe_ent *probe_ent) tmp = readl(mmio + HOST_CTL); if (tmp & HOST_RESET) { - printk(KERN_ERR DRV_NAME "(%s): controller reset failed (0x%x)\n", - pci_name(pdev), tmp); + dev_printk(KERN_ERR, 
&pdev->dev, + "controller reset failed (0x%x)\n", tmp); return -EIO; } @@ -789,22 +789,22 @@ static int ahci_host_init(struct ata_probe_ent *probe_ent) if (rc) { rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME "(%s): 64-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "64-bit DMA enable failed\n"); return rc; } } } else { rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME "(%s): 32-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit DMA enable failed\n"); return rc; } rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME "(%s): 32-bit consistent DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit consistent DMA enable failed\n"); return rc; } } @@ -907,10 +907,10 @@ static void ahci_print_info(struct ata_probe_ent *probe_ent) else scc_s = "unknown"; - printk(KERN_INFO DRV_NAME "(%s) AHCI %02x%02x.%02x%02x " + dev_printk(KERN_INFO, &pdev->dev, + "AHCI %02x%02x.%02x%02x " "%u slots %u ports %s Gbps 0x%x impl %s mode\n" , - pci_name(pdev), (vers >> 24) & 0xff, (vers >> 16) & 0xff, @@ -923,11 +923,11 @@ static void ahci_print_info(struct ata_probe_ent *probe_ent) impl, scc_s); - printk(KERN_INFO DRV_NAME "(%s) flags: " + dev_printk(KERN_INFO, &pdev->dev, + "flags: " "%s%s%s%s%s%s" "%s%s%s%s%s%s%s\n" , - pci_name(pdev), cap & (1 << 31) ? "64bit " : "", cap & (1 << 30) ? 
"ncq " : "", @@ -960,7 +960,7 @@ static int ahci_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) VPRINTK("ENTER\n"); if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) diff --git a/drivers/scsi/pdc_adma.c b/drivers/scsi/pdc_adma.c index 74b3b2c629b6..665017eda8a6 100644 --- a/drivers/scsi/pdc_adma.c +++ b/drivers/scsi/pdc_adma.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -626,16 +627,14 @@ static int adma_set_dma_masks(struct pci_dev *pdev, void __iomem *mmio_base) rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME - "(%s): 32-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit DMA enable failed\n"); return rc; } rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME - "(%s): 32-bit consistent DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit consistent DMA enable failed\n"); return rc; } return 0; @@ -651,7 +650,7 @@ static int adma_ata_init_one(struct pci_dev *pdev, int rc, port_no; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) diff --git a/drivers/scsi/sata_mv.c b/drivers/scsi/sata_mv.c index 936d1ce5575f..46dbdee79f77 100644 --- a/drivers/scsi/sata_mv.c +++ b/drivers/scsi/sata_mv.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -1437,9 +1438,9 @@ static void mv_print_info(struct ata_probe_ent *probe_ent) else scc_s = "unknown"; - printk(KERN_INFO DRV_NAME - "(%s) %u slots %u ports %s mode IRQ via %s\n", - pci_name(pdev), (unsigned)MV_MAX_Q_DEPTH, probe_ent->n_ports, + dev_printk(KERN_INFO, &pdev->dev, + "%u slots %u ports %s mode 
IRQ via %s\n", + (unsigned)MV_MAX_Q_DEPTH, probe_ent->n_ports, scc_s, (MV_HP_FLAG_MSI & hpriv->hp_flags) ? "MSI" : "INTx"); } @@ -1460,9 +1461,8 @@ static int mv_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) void __iomem *mmio_base; int pci_dev_busy = 0, rc; - if (!printed_version++) { - printk(KERN_INFO DRV_NAME " version " DRV_VERSION "\n"); - } + if (!printed_version++) + dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) { diff --git a/drivers/scsi/sata_nv.c b/drivers/scsi/sata_nv.c index 1a56d6c79ddd..d573888eda76 100644 --- a/drivers/scsi/sata_nv.c +++ b/drivers/scsi/sata_nv.c @@ -61,6 +61,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -383,7 +384,7 @@ static int nv_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) return -ENODEV; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) diff --git a/drivers/scsi/sata_promise.c b/drivers/scsi/sata_promise.c index 8f41702275db..b41c977d6fab 100644 --- a/drivers/scsi/sata_promise.c +++ b/drivers/scsi/sata_promise.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -633,7 +634,7 @@ static int pdc_ata_init_one (struct pci_dev *pdev, const struct pci_device_id *e int rc; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); /* * If this driver happens to only be useful on Apple's K2, then diff --git a/drivers/scsi/sata_qstor.c b/drivers/scsi/sata_qstor.c index d95a02fa7afb..9938dae782b6 100644 --- a/drivers/scsi/sata_qstor.c +++ b/drivers/scsi/sata_qstor.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -600,25 +601,22 @@ static int qs_set_dma_masks(struct pci_dev *pdev, void __iomem 
*mmio_base) if (rc) { rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME - "(%s): 64-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "64-bit DMA enable failed\n"); return rc; } } } else { rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME - "(%s): 32-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit DMA enable failed\n"); return rc; } rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME - "(%s): 32-bit consistent DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit consistent DMA enable failed\n"); return rc; } } @@ -635,7 +633,7 @@ static int qs_ata_init_one(struct pci_dev *pdev, int rc, port_no; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) diff --git a/drivers/scsi/sata_sil.c b/drivers/scsi/sata_sil.c index 3a056173fb95..435f7e0085ec 100644 --- a/drivers/scsi/sata_sil.c +++ b/drivers/scsi/sata_sil.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -386,7 +387,7 @@ static int sil_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) u8 cls; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); /* * If this driver happens to only be useful on Apple's K2, then @@ -463,8 +464,8 @@ static int sil_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) writeb(cls, mmio_base + SIL_FIFO_W3); } } else - printk(KERN_WARNING DRV_NAME "(%s): cache line size not set. Driver may not function\n", - pci_name(pdev)); + dev_printk(KERN_WARNING, &pdev->dev, + "cache line size not set. 
Driver may not function\n"); if (ent->driver_data == sil_3114) { irq_mask = SIL_MASK_4PORT; diff --git a/drivers/scsi/sata_sil24.c b/drivers/scsi/sata_sil24.c index 4afe2b15b803..c66548025657 100644 --- a/drivers/scsi/sata_sil24.c +++ b/drivers/scsi/sata_sil24.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "scsi.h" #include @@ -690,7 +691,7 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) int i, rc; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) @@ -750,14 +751,14 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) */ rc = pci_set_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME "(%s): 32-bit DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit DMA enable failed\n"); goto out_free; } rc = pci_set_consistent_dma_mask(pdev, DMA_32BIT_MASK); if (rc) { - printk(KERN_ERR DRV_NAME "(%s): 32-bit consistent DMA enable failed\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "32-bit consistent DMA enable failed\n"); goto out_free; } @@ -793,9 +794,8 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) break; } if (tmp & PORT_CS_PORT_RST) - printk(KERN_ERR DRV_NAME - "(%s): failed to clear port RST\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "failed to clear port RST\n"); } /* Zero error counters. 
*/ @@ -824,9 +824,8 @@ static int sil24_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Reset itself */ if (__sil24_reset_controller(port)) - printk(KERN_ERR DRV_NAME - "(%s): failed to reset controller\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, + "failed to reset controller\n"); } /* Turn on interrupts */ diff --git a/drivers/scsi/sata_sis.c b/drivers/scsi/sata_sis.c index 057f7b98b6c4..42288be0e561 100644 --- a/drivers/scsi/sata_sis.c +++ b/drivers/scsi/sata_sis.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -237,6 +238,7 @@ static void sis_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val) static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) { + static int printed_version; struct ata_probe_ent *probe_ent = NULL; int rc; u32 genctl; @@ -245,6 +247,9 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) u8 pmr; u8 port2_start; + if (!printed_version++) + dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n"); + rc = pci_enable_device(pdev); if (rc) return rc; @@ -288,16 +293,18 @@ static int sis_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) pci_read_config_byte(pdev, SIS_PMR, &pmr); if (ent->device != 0x182) { if ((pmr & SIS_PMR_COMBINED) == 0) { - printk(KERN_INFO "sata_sis: Detected SiS 180/181 chipset in SATA mode\n"); + dev_printk(KERN_INFO, &pdev->dev, + "Detected SiS 180/181 chipset in SATA mode\n"); port2_start = 64; } else { - printk(KERN_INFO "sata_sis: Detected SiS 180/181 chipset in combined mode\n"); + dev_printk(KERN_INFO, &pdev->dev, + "Detected SiS 180/181 chipset in combined mode\n"); port2_start=0; } } else { - printk(KERN_INFO "sata_sis: Detected SiS 182 chipset\n"); + dev_printk(KERN_INFO, &pdev->dev, "Detected SiS 182 chipset\n"); port2_start = 0x20; } diff --git a/drivers/scsi/sata_svw.c b/drivers/scsi/sata_svw.c index 46208f52d0e1..db615ff794d8 100644 --- 
a/drivers/scsi/sata_svw.c +++ b/drivers/scsi/sata_svw.c @@ -44,6 +44,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -362,7 +363,7 @@ static int k2_sata_init_one (struct pci_dev *pdev, const struct pci_device_id *e int i; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); /* * If this driver happens to only be useful on Apple's K2, then diff --git a/drivers/scsi/sata_sx4.c b/drivers/scsi/sata_sx4.c index d9a8baff0d4d..0ec21e09f5d8 100644 --- a/drivers/scsi/sata_sx4.c +++ b/drivers/scsi/sata_sx4.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -1385,7 +1386,7 @@ static int pdc_sata_init_one (struct pci_dev *pdev, const struct pci_device_id * int rc; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); /* * If this driver happens to only be useful on Apple's K2, then diff --git a/drivers/scsi/sata_uli.c b/drivers/scsi/sata_uli.c index d68dc7d3422c..a5e245c098e1 100644 --- a/drivers/scsi/sata_uli.c +++ b/drivers/scsi/sata_uli.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -178,12 +179,16 @@ static void uli_scr_write (struct ata_port *ap, unsigned int sc_reg, u32 val) static int uli_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) { + static int printed_version; struct ata_probe_ent *probe_ent; struct ata_port_info *ppi; int rc; unsigned int board_idx = (unsigned int) ent->driver_data; int pci_dev_busy = 0; + if (!printed_version++) + dev_printk(KERN_INFO, &pdev->dev, "version " DRV_VERSION "\n"); + rc = pci_enable_device(pdev); if (rc) return rc; diff --git a/drivers/scsi/sata_via.c b/drivers/scsi/sata_via.c index 80e291a909a9..b3ecdbe400e9 100644 --- a/drivers/scsi/sata_via.c +++ b/drivers/scsi/sata_via.c @@ -41,6 +41,7 @@ 
#include #include #include +#include #include "scsi.h" #include #include @@ -259,15 +260,15 @@ static void svia_configure(struct pci_dev *pdev) u8 tmp8; pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &tmp8); - printk(KERN_INFO DRV_NAME "(%s): routed to hard irq line %d\n", - pci_name(pdev), + dev_printk(KERN_INFO, &pdev->dev, "routed to hard irq line %d\n", (int) (tmp8 & 0xf0) == 0xf0 ? 0 : tmp8 & 0x0f); /* make sure SATA channels are enabled */ pci_read_config_byte(pdev, SATA_CHAN_ENAB, &tmp8); if ((tmp8 & ALL_PORTS) != ALL_PORTS) { - printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channels (0x%x)\n", - pci_name(pdev), (int) tmp8); + dev_printk(KERN_DEBUG, &pdev->dev, + "enabling SATA channels (0x%x)\n", + (int) tmp8); tmp8 |= ALL_PORTS; pci_write_config_byte(pdev, SATA_CHAN_ENAB, tmp8); } @@ -275,8 +276,9 @@ static void svia_configure(struct pci_dev *pdev) /* make sure interrupts for each channel sent to us */ pci_read_config_byte(pdev, SATA_INT_GATE, &tmp8); if ((tmp8 & ALL_PORTS) != ALL_PORTS) { - printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channel interrupts (0x%x)\n", - pci_name(pdev), (int) tmp8); + dev_printk(KERN_DEBUG, &pdev->dev, + "enabling SATA channel interrupts (0x%x)\n", + (int) tmp8); tmp8 |= ALL_PORTS; pci_write_config_byte(pdev, SATA_INT_GATE, tmp8); } @@ -284,8 +286,9 @@ static void svia_configure(struct pci_dev *pdev) /* make sure native mode is enabled */ pci_read_config_byte(pdev, SATA_NATIVE_MODE, &tmp8); if ((tmp8 & NATIVE_MODE_ALL) != NATIVE_MODE_ALL) { - printk(KERN_DEBUG DRV_NAME "(%s): enabling SATA channel native mode (0x%x)\n", - pci_name(pdev), (int) tmp8); + dev_printk(KERN_DEBUG, &pdev->dev, + "enabling SATA channel native mode (0x%x)\n", + (int) tmp8); tmp8 |= NATIVE_MODE_ALL; pci_write_config_byte(pdev, SATA_NATIVE_MODE, tmp8); } @@ -303,7 +306,7 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) u8 tmp8; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + 
dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = pci_enable_device(pdev); if (rc) @@ -318,8 +321,9 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) if (board_id == vt6420) { pci_read_config_byte(pdev, SATA_PATA_SHARING, &tmp8); if (tmp8 & SATA_2DEV) { - printk(KERN_ERR DRV_NAME "(%s): SATA master/slave not supported (0x%x)\n", - pci_name(pdev), (int) tmp8); + dev_printk(KERN_ERR, &pdev->dev, + "SATA master/slave not supported (0x%x)\n", + (int) tmp8); rc = -EIO; goto err_out_regions; } @@ -332,10 +336,11 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) for (i = 0; i < ARRAY_SIZE(svia_bar_sizes); i++) if ((pci_resource_start(pdev, i) == 0) || (pci_resource_len(pdev, i) < bar_sizes[i])) { - printk(KERN_ERR DRV_NAME "(%s): invalid PCI BAR %u (sz 0x%lx, val 0x%lx)\n", - pci_name(pdev), i, - pci_resource_start(pdev, i), - pci_resource_len(pdev, i)); + dev_printk(KERN_ERR, &pdev->dev, + "invalid PCI BAR %u (sz 0x%lx, val 0x%lx)\n", + i, + pci_resource_start(pdev, i), + pci_resource_len(pdev, i)); rc = -ENODEV; goto err_out_regions; } @@ -353,8 +358,7 @@ static int svia_init_one (struct pci_dev *pdev, const struct pci_device_id *ent) probe_ent = vt6421_init_probe_ent(pdev); if (!probe_ent) { - printk(KERN_ERR DRV_NAME "(%s): out of memory\n", - pci_name(pdev)); + dev_printk(KERN_ERR, &pdev->dev, "out of memory\n"); rc = -ENOMEM; goto err_out_regions; } diff --git a/drivers/scsi/sata_vsc.c b/drivers/scsi/sata_vsc.c index 54273e0063c7..bb84ba0c7e83 100644 --- a/drivers/scsi/sata_vsc.c +++ b/drivers/scsi/sata_vsc.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "scsi.h" #include #include @@ -295,7 +296,7 @@ static int __devinit vsc_sata_init_one (struct pci_dev *pdev, const struct pci_d int rc; if (!printed_version++) - printk(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n"); + dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); rc = 
pci_enable_device(pdev); if (rc) From 08db2a701bd63d0e36f06a29137bf016a907cf71 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 30 Oct 2005 14:40:07 -0800 Subject: [PATCH 97/98] Fix PIIX4 SMB region size Petr Vandrovec correctly points out that the SMB region of the PIIX4 is just 16 bytes, not 32. Signed-off-by: Linus Torvalds --- drivers/pci/quirks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index bbd9c2323d8c..5627ce1d2b32 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -356,7 +356,7 @@ static void piix4_mem_quirk(struct pci_dev *dev, const char *name, unsigned int /* * PIIX4 ACPI: Two IO regions pointed to by longwords at * 0x40 (64 bytes of ACPI registers) - * 0x90 (32 bytes of SMB registers) + * 0x90 (16 bytes of SMB registers) * and a few strange programmable PIIX4 device resources. */ static void __devinit quirk_piix4_acpi(struct pci_dev *dev) @@ -366,7 +366,7 @@ static void __devinit quirk_piix4_acpi(struct pci_dev *dev) pci_read_config_dword(dev, 0x40, &region); quirk_io_region(dev, region, 64, PCI_BRIDGE_RESOURCES, "PIIX4 ACPI"); pci_read_config_dword(dev, 0x90, &region); - quirk_io_region(dev, region, 32, PCI_BRIDGE_RESOURCES+1, "PIIX4 SMB"); + quirk_io_region(dev, region, 16, PCI_BRIDGE_RESOURCES+1, "PIIX4 SMB"); /* Device resource A has enables for some of the other ones */ pci_read_config_dword(dev, 0x5c, &res_a); From d3f8cf489993658702b7e58ff37162246263de53 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 30 Oct 2005 17:38:25 -0500 Subject: [PATCH 98/98] [PATCH] NFS: Remove unbalanced spin_unlock() calls from nfs_refresh_inode() Doh!
Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- fs/nfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f2781ca42761..fc0f12ba89cc 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1274,14 +1274,12 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat } if ((fattr->valid & NFS_ATTR_FATTR) == 0) { - spin_unlock(&inode->i_lock); return 0; } /* Has the inode gone and changed behind our back? */ if (nfsi->fileid != fattr->fileid || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) { - spin_unlock(&inode->i_lock); return -EIO; }