cef3970381
Stefan Agner reported a bug when using zram on 32-bit Arm machines with RAM above the 4GB address boundary: Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = a27bd01c [00000000] *pgd=236a0003, *pmd=1ffa64003 Internal error: Oops: 207 [#1] SMP ARM Modules linked in: mdio_bcm_unimac(+) brcmfmac cfg80211 brcmutil raspberrypi_hwmon hci_uart crc32_arm_ce bcm2711_thermal phy_generic genet CPU: 0 PID: 123 Comm: mkfs.ext4 Not tainted 5.9.6 #1 Hardware name: BCM2711 PC is at zs_map_object+0x94/0x338 LR is at zram_bvec_rw.constprop.0+0x330/0xa64 pc : [<c0602b38>] lr : [<c0bda6a0>] psr: 60000013 sp : e376bbe0 ip : 00000000 fp : c1e2921c r10: 00000002 r9 : c1dda730 r8 : 00000000 r7 : e8ff7a00 r6 : 00000000 r5 : 02f9ffa0 r4 : e3710000 r3 : 000fdffe r2 : c1e0ce80 r1 : ebf979a0 r0 : 00000000 Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user Control: 30c5383d Table: 235c2a80 DAC: fffffffd Process mkfs.ext4 (pid: 123, stack limit = 0x495a22e6) Stack: (0xe376bbe0 to 0xe376c000) As it turns out, zsmalloc needs to know the maximum memory size, which is defined in MAX_PHYSMEM_BITS when CONFIG_SPARSEMEM is set, or in MAX_POSSIBLE_PHYSMEM_BITS on the x86 architecture. The same problem will be hit on all 32-bit architectures that have a physical address space larger than 4GB and happen to not enable sparsemem and include asm/sparsemem.h from asm/pgtable.h. After the initial discussion, I suggested just always defining MAX_POSSIBLE_PHYSMEM_BITS whenever CONFIG_PHYS_ADDR_T_64BIT is set, or provoking a build error otherwise. This addresses all configurations that can currently have this runtime bug, but leaves all other configurations unchanged. 
I looked up the possible number of bits in source code and datasheets, here is what I found: - on ARC, CONFIG_ARC_HAS_PAE40 controls whether 32 or 40 bits are used - on ARM, CONFIG_LPAE enables 40 bit addressing, without it we never support more than 32 bits, even though supersections in theory allow up to 40 bits as well. - on MIPS, some MIPS32r1 or later chips support 36 bits, and MIPS32r5 XPA supports up to 60 bits in theory, but 40 bits are more than anyone will ever ship - On PowerPC, there are three different implementations of 36 bit addressing, but 32-bit is used without CONFIG_PTE_64BIT - On RISC-V, the normal page table format can support 34 bit addressing. There is no highmem support on RISC-V, so anything above 2GB is unused, but it might be useful to eventually support CONFIG_ZRAM for high pages. Fixes: 61989a80fb3a ("staging: zsmalloc: zsmalloc memory allocation library") Fixes: 02390b87a945 ("mm/zsmalloc: Prepare to variable MAX_PHYSMEM_BITS") Acked-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de> Reviewed-by: Stefan Agner <stefan@agner.ch> Tested-by: Stefan Agner <stefan@agner.ch> Acked-by: Mike Rapoport <rppt@linux.ibm.com> Link: https://lore.kernel.org/linux-mm/bdfa44bf1c570b05d6c70898e2bbb0acf234ecdf.1604762181.git.stefan@agner.ch/ Signed-off-by: Arnd Bergmann <arnd@arndb.de>
228 lines
8.3 KiB
C
228 lines
8.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  arch/arm/include/asm/pgtable-2level.h
 *
 *  Copyright (C) 1995-2002 Russell King
 */
#ifndef _ASM_PGTABLE_2LEVEL_H
|
|
#define _ASM_PGTABLE_2LEVEL_H
|
|
|
|
/*
 * Marks the PMD level as folded for this two-level layout; this matches
 * PTRS_PER_PMD being 1 further down in this header.
 */
#define __PAGETABLE_PMD_FOLDED 1

/*
 * Hardware-wise, we have a two level page table structure, where the first
 * level has 4096 entries, and the second level has 256 entries. Each entry
 * is one 32-bit word. Most of the bits in the second level entry are used
 * by hardware, and there aren't any "accessed" and "dirty" bits.
 *
 * Linux on the other hand has a three level page table structure, which can
 * be wrapped to fit a two level page table structure easily - using the PGD
 * and PTE only. However, Linux also expects one "PTE" table per page, and
 * at least a "dirty" bit.
 *
 * Therefore, we tweak the implementation slightly - we tell Linux that we
 * have 2048 entries in the first level, each of which is 8 bytes (iow, two
 * hardware pointers to the second level.) The second level contains two
 * hardware PTE tables arranged contiguously, preceded by Linux versions
 * which contain the state information Linux needs. We, therefore, end up
 * with 512 entries in the "PTE" level.
 *
 * This leads to the page tables having the following layout:
 *
 *    pgd             pte
 * |        |
 * +--------+
 * |        |       +------------+ +0
 * +- - - - +       | Linux pt 0 |
 * |        |       +------------+ +1024
 * +--------+ +0    | Linux pt 1 |
 * |        |-----> +------------+ +2048
 * +- - - - + +4    |  h/w pt 0  |
 * |        |-----> +------------+ +3072
 * +--------+ +8    |  h/w pt 1  |
 * |        |       +------------+ +4096
 *
 * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
 * PTE_xxx for definitions of bits appearing in the "h/w pt".
 *
 * PMD_xxx definitions refer to bits in the first level page table.
 *
 * The "dirty" bit is emulated by only granting hardware write permission
 * iff the page is marked "writable" and "dirty" in the Linux PTE. This
 * means that a write to a clean page will cause a permission fault, and
 * the Linux MM layer will mark the page dirty via handle_pte_fault().
 * For the hardware to notice the permission change, the TLB entry must
 * be flushed, and ptep_set_access_flags() does that for us.
 *
 * The "accessed" or "young" bit is emulated by a similar method; we only
 * allow accesses to the page if the "young" bit is set. Accesses to the
 * page will cause a fault, and handle_pte_fault() will set the young bit
 * for us as long as the page is marked present in the corresponding Linux
 * PTE entry. Again, ptep_set_access_flags() will ensure that the TLB is
 * up to date.
 *
 * However, when the "young" bit is cleared, we deny access to the page
 * by clearing the hardware PTE. Currently Linux does not flush the TLB
 * for us in this case, which means the TLB will retain the translation
 * until either the TLB entry is evicted under pressure, or a context
 * switch which changes the user space mapping occurs.
 */
/*
 * Table geometry as presented to Linux: 2048 first-level entries, a folded
 * (single-entry) PMD level and 512 entries in the "PTE" level.
 */
#define PTRS_PER_PTE		512
#define PTRS_PER_PMD		1
#define PTRS_PER_PGD		2048

/*
 * Placement of the hardware PTE tables relative to the Linux PTE tables:
 * PTE_HWTABLE_OFF is PTE_HWTABLE_PTRS entries of pte_t in bytes, and
 * PTE_HWTABLE_SIZE is PTRS_PER_PTE 32-bit words in bytes.
 */
#define PTE_HWTABLE_PTRS	(PTRS_PER_PTE)
#define PTE_HWTABLE_OFF		(PTE_HWTABLE_PTRS * sizeof(pte_t))
#define PTE_HWTABLE_SIZE	(PTRS_PER_PTE * sizeof(u32))

/* Upper bound on physical address width for this page table format. */
#define MAX_POSSIBLE_PHYSMEM_BITS	32

/*
 * PMD_SHIFT determines the size of the area a second-level page table can map
 * PGDIR_SHIFT determines what a third-level page table entry can map
 *
 * Both shifts are 21, so PMD_SIZE and PGDIR_SIZE are both 2MiB.
 */
#define PMD_SHIFT		21
#define PGDIR_SHIFT		21

/* Sizes in bytes, and masks that clear the offset within one entry's range. */
#define PMD_SIZE		(1UL << PMD_SHIFT)
#define PMD_MASK		(~(PMD_SIZE-1))
#define PGDIR_SIZE		(1UL << PGDIR_SHIFT)
#define PGDIR_MASK		(~(PGDIR_SIZE-1))

/*
 * section address mask and size definitions.
 *
 * A section is 1 << 20 bytes (1MiB); SECTION_MASK clears the offset
 * within a section.
 */
#define SECTION_SHIFT		20
#define SECTION_SIZE		(1UL << SECTION_SHIFT)
#define SECTION_MASK		(~(SECTION_SIZE-1))

/*
 * ARMv6 supersection address mask and size definitions.
 *
 * A supersection is 1 << 24 bytes (16MiB); SUPERSECTION_MASK clears the
 * offset within a supersection.
 */
#define SUPERSECTION_SHIFT	24
#define SUPERSECTION_SIZE	(1UL << SUPERSECTION_SHIFT)
#define SUPERSECTION_MASK	(~(SUPERSECTION_SIZE-1))

/* Number of PGDIR_SIZE-sized first-level entries needed to span TASK_SIZE. */
#define USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)

/*
 * "Linux" PTE definitions.
 *
 * We keep two sets of PTEs - the hardware and the linux version.
 * This allows greater flexibility in the way we map the Linux bits
 * onto the hardware tables, and allows us to have YOUNG and DIRTY
 * bits.
 *
 * The PTE table pointer refers to the hardware entries; the "Linux"
 * entries are stored 2048 bytes below (the page table layout diagram
 * earlier in this file places "Linux pt 0" at +0 and "h/w pt 0" at +2048).
 *
 * L_PTE_VALID and L_PTE_PRESENT share bit 0. Bits 2-5 are not defined
 * here; they hold the memory type field covered by L_PTE_MT_MASK.
 */
#define L_PTE_VALID		(_AT(pteval_t, 1) << 0)		/* Valid */
#define L_PTE_PRESENT		(_AT(pteval_t, 1) << 0)
#define L_PTE_YOUNG		(_AT(pteval_t, 1) << 1)
#define L_PTE_DIRTY		(_AT(pteval_t, 1) << 6)
#define L_PTE_RDONLY		(_AT(pteval_t, 1) << 7)
#define L_PTE_USER		(_AT(pteval_t, 1) << 8)
#define L_PTE_XN		(_AT(pteval_t, 1) << 9)
#define L_PTE_SHARED		(_AT(pteval_t, 1) << 10)	/* shared(v6), coherent(xsc3) */
#define L_PTE_NONE		(_AT(pteval_t, 1) << 11)

/*
 * These are the memory types, defined to be compatible with
 * pre-ARMv6 CPUs cacheable and bufferable bits: n/a,n/a,C,B
 * ARMv6+ without TEX remapping, they are a table index.
 * ARMv6+ with TEX remapping, they correspond to n/a,TEX(0),C,B
 *
 * MT type		Pre-ARMv6	ARMv6+ type / cacheable status
 * UNCACHED		Uncached	Strongly ordered
 * BUFFERABLE		Bufferable	Normal memory / non-cacheable
 * WRITETHROUGH		Writethrough	Normal memory / write through
 * WRITEBACK		Writeback	Normal memory / write back, read alloc
 * MINICACHE		Minicache	N/A
 * WRITEALLOC		Writeback	Normal memory / write back, write alloc
 * DEV_SHARED		Uncached	Device memory (shared)
 * DEV_NONSHARED	Uncached	Device memory (non-shared)
 * DEV_WC		Bufferable	Normal memory / non-cacheable
 * DEV_CACHED		Writeback	Normal memory / write back, read alloc
 * VECTORS		Variable	Normal memory / variable
 *
 * All normal memory mappings have the following properties:
 * - reads can be repeated with no side effects
 * - repeated reads return the last value written
 * - reads can fetch additional locations without side effects
 * - writes can be repeated (in certain cases) with no side effects
 * - writes can be merged before accessing the target
 * - unaligned accesses can be supported
 *
 * All device mappings have the following properties:
 * - no access speculation
 * - no repetition (eg, on return from an exception)
 * - number, order and size of accesses are maintained
 * - unaligned accesses are "unpredictable"
 *
 * Each value is a 4-bit code placed at bits 2-5 of the Linux PTE;
 * L_PTE_MT_MASK selects the whole field.
 */
#define L_PTE_MT_UNCACHED	(_AT(pteval_t, 0x00) << 2)	/* 0000 */
#define L_PTE_MT_BUFFERABLE	(_AT(pteval_t, 0x01) << 2)	/* 0001 */
#define L_PTE_MT_WRITETHROUGH	(_AT(pteval_t, 0x02) << 2)	/* 0010 */
#define L_PTE_MT_WRITEBACK	(_AT(pteval_t, 0x03) << 2)	/* 0011 */
#define L_PTE_MT_MINICACHE	(_AT(pteval_t, 0x06) << 2)	/* 0110 (sa1100, xscale) */
#define L_PTE_MT_WRITEALLOC	(_AT(pteval_t, 0x07) << 2)	/* 0111 */
#define L_PTE_MT_DEV_SHARED	(_AT(pteval_t, 0x04) << 2)	/* 0100 */
#define L_PTE_MT_DEV_NONSHARED	(_AT(pteval_t, 0x0c) << 2)	/* 1100 */
#define L_PTE_MT_DEV_WC		(_AT(pteval_t, 0x09) << 2)	/* 1001 */
#define L_PTE_MT_DEV_CACHED	(_AT(pteval_t, 0x0b) << 2)	/* 1011 */
#define L_PTE_MT_VECTORS	(_AT(pteval_t, 0x0f) << 2)	/* 1111 */
#define L_PTE_MT_MASK		(_AT(pteval_t, 0x0f) << 2)

#ifndef __ASSEMBLY__
|
|
|
|
/*
 * The "pud_xxx()" functions here are trivial when the pmd is folded into
 * the pud: the pud entry is never bad, always exists, and can't be set or
 * cleared.
 *
 * The predicates expand to constants and the setters to empty statements,
 * so none of them evaluates its arguments.
 */
#define pud_none(pud)		(0)
#define pud_bad(pud)		(0)
#define pud_present(pud)	(1)
#define pud_clear(pudp)		do { } while (0)
#define set_pud(pud,pudp)	do { } while (0)

/*
 * Return the pmd entry for @addr under @pud.
 *
 * The pud pointer is returned re-typed as a pmd pointer; @addr is not
 * used.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
	return (pmd_t *)pud;
}
/*
 * NOTE(review): self-referential define, presumably so that generic code
 * can detect with #ifdef that this header supplies pmd_offset() - confirm.
 */
#define pmd_offset pmd_offset

/*
 * First-level entry predicates. pmd_large(), pmd_leaf() and pmd_bad() all
 * test bit 1 (value 2) of the entry; pmd_present() is true for any
 * non-zero entry.
 * NOTE(review): bit 1 presumably marks a section mapping - confirm against
 * the PMD_xxx hardware definitions.
 */
#define pmd_large(pmd)		(pmd_val(pmd) & 2)
#define pmd_leaf(pmd)		(pmd_val(pmd) & 2)
#define pmd_bad(pmd)		(pmd_val(pmd) & 2)
#define pmd_present(pmd)	(pmd_val(pmd))

/*
 * Copy a first-level entry pair from @pmdps to @pmdpd.
 *
 * Both slots ([0] and [1]) are copied and flush_pmd_entry() is then called
 * on the destination. The arguments are expanded more than once, so they
 * must not have side effects.
 */
#define copy_pmd(pmdpd,pmdps)		\
	do {				\
		pmdpd[0] = pmdps[0];	\
		pmdpd[1] = pmdps[1];	\
		flush_pmd_entry(pmdpd);	\
	} while (0)

/*
 * Clear a first-level entry pair.
 *
 * Both slots ([0] and [1]) are set to __pmd(0) and clean_pmd_entry() is
 * then called on @pmdp. The argument is expanded more than once, so it
 * must not have side effects.
 */
#define pmd_clear(pmdp)			\
	do {				\
		pmdp[0] = __pmd(0);	\
		pmdp[1] = __pmd(0);	\
		clean_pmd_entry(pmdp);	\
	} while (0)

/*
 * we don't need complex calculations here as the pmd is folded into the pgd:
 * the result is always @end, and @addr is not evaluated.
 */
#define pmd_addr_end(addr,end) (end)

/* Forwards unchanged to cpu_set_pte_ext(), which is defined outside this header. */
#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)

/*
 * We don't have huge page support for short descriptors, for the moment
 * define empty stubs for use by pin_page_for_write.
 *
 * Both stubs are the constant 0 and do not evaluate their argument.
 */
#define pmd_hugewillfault(pmd)	(0)
#define pmd_thp_or_huge(pmd)	(0)

#endif /* __ASSEMBLY__ */
|
|
|
|
#endif /* _ASM_PGTABLE_2LEVEL_H */
|