Initial commit; kernel source import

This commit is contained in:
Nathan
2025-04-06 23:50:55 -05:00
commit 25c6d769f4
45093 changed files with 18199410 additions and 0 deletions

30
arch/x86/mm/Makefile Normal file
View File

@@ -0,0 +1,30 @@
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_physaddr.o := $(nostackp)
CFLAGS_setup_nx.o := $(nostackp)
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck/
obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_MEMTEST) += memtest.o

196
arch/x86/mm/amdtopology.c Normal file
View File

@@ -0,0 +1,196 @@
/*
* AMD NUMA support.
* Discover the memory map and associated nodes.
*
* This version reads it directly from the AMD northbridge.
*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/io.h>
#include <linux/pci_ids.h>
#include <linux/acpi.h>
#include <asm/types.h>
#include <asm/mmzone.h>
#include <asm/proto.h>
#include <asm/e820.h>
#include <asm/pci-direct.h>
#include <asm/numa.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/amd_nb.h>
static unsigned char __initdata nodeids[8];
static __init int find_northbridge(void)
{
int num;
for (num = 0; num < 32; num++) {
u32 header;
header = read_pci_config(0, num, 0, 0x00);
if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1300<<16)))
continue;
header = read_pci_config(0, num, 1, 0x00);
if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) &&
header != (PCI_VENDOR_ID_AMD | (0x1301<<16)))
continue;
return num;
}
return -ENOENT;
}
static __init void early_get_boot_cpu_id(void)
{
/*
* need to get the APIC ID of the BSP so can use that to
* create apicid_to_node in amd_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
*/
if (smp_found_config)
early_get_smp_config();
#endif
}
int __init amd_numa_init(void)
{
u64 start = PFN_PHYS(0);
u64 end = PFN_PHYS(max_pfn);
unsigned numnodes;
u64 prevbase;
int i, j, nb;
u32 nodeid, reg;
unsigned int bits, cores, apicid_base;
if (!early_pci_allowed())
return -EINVAL;
nb = find_northbridge();
if (nb < 0)
return nb;
pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
return -ENOENT;
pr_info("Number of physical nodes %d\n", numnodes);
prevbase = 0;
for (i = 0; i < 8; i++) {
u64 base, limit;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
continue;
}
if (nodeid >= numnodes) {
pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
pr_info("Skipping node entry %d (base %Lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
pr_err("Node %d using interleaving mode %Lx/%Lx\n",
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
return -EINVAL;
}
if (node_isset(nodeid, numa_nodes_parsed)) {
pr_info("Node %d already present, skipping\n",
nodeid);
continue;
}
limit >>= 16;
limit++;
limit <<= 24;
if (limit > end)
limit = end;
if (limit <= base)
continue;
base >>= 16;
base <<= 24;
if (base < start)
base = start;
if (limit > end)
limit = end;
if (limit == base) {
pr_err("Empty node %d\n", nodeid);
continue;
}
if (limit < base) {
pr_err("Node %d bogus settings %Lx-%Lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
pr_err("Node map not sorted %Lx,%Lx\n",
prevbase, base);
return -EINVAL;
}
pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
nodeid, base, limit);
prevbase = base;
numa_add_memblk(nodeid, base, limit);
node_set(nodeid, numa_nodes_parsed);
}
if (!nodes_weight(numa_nodes_parsed))
return -ENOENT;
/*
* We seem to have valid NUMA configuration. Map apicids to nodes
* using the coreid bits from early_identify_cpu.
*/
bits = boot_cpu_data.x86_coreid_bits;
cores = 1 << bits;
apicid_base = 0;
/* get the APIC ID of the BSP early for systems with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
}
for_each_node_mask(i, numa_nodes_parsed)
for (j = apicid_base; j < cores + apicid_base; j++)
set_apicid_to_node((i << bits) + j, i);
return 0;
}

View File

@@ -0,0 +1,375 @@
/*
* Debug helper to dump the current kernel pagetables of the system
* so that we can see what the various memory ranges are set to.
*
* (C) Copyright 2008 Intel Corporation
*
* Author: Arjan van de Ven <arjan@linux.intel.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; version 2
* of the License.
*/
#include <linux/debugfs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <asm/pgtable.h>
/*
* The dumper groups pagetable entries of the same type into one, and for
* that it needs to keep some state when walking, and flush this state
* when a "break" in the continuity is found.
*/
struct pg_state {
int level;
pgprot_t current_prot;
unsigned long start_address;
unsigned long current_address;
const struct addr_marker *marker;
};
struct addr_marker {
unsigned long start_address;
const char *name;
};
/* indices for address_markers; keep sync'd w/ address_markers below */
enum address_markers_idx {
USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
KERNEL_SPACE_NR,
LOW_KERNEL_NR,
VMALLOC_START_NR,
VMEMMAP_START_NR,
HIGH_KERNEL_NR,
MODULES_VADDR_NR,
MODULES_END_NR,
#else
KERNEL_SPACE_NR,
VMALLOC_START_NR,
VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
PKMAP_BASE_NR,
# endif
FIXADDR_START_NR,
#endif
};
/* Address space markers hints */
static struct addr_marker address_markers[] = {
{ 0, "User Space" },
#ifdef CONFIG_X86_64
{ 0x8000000000000000UL, "Kernel Space" },
{ PAGE_OFFSET, "Low Kernel Mapping" },
{ VMALLOC_START, "vmalloc() Area" },
{ VMEMMAP_START, "Vmemmap" },
{ __START_KERNEL_map, "High Kernel Mapping" },
{ MODULES_VADDR, "Modules" },
{ MODULES_END, "End Modules" },
#else
{ PAGE_OFFSET, "Kernel Mapping" },
{ 0/* VMALLOC_START */, "vmalloc() Area" },
{ 0/*VMALLOC_END*/, "vmalloc() End" },
# ifdef CONFIG_HIGHMEM
{ 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
# endif
{ 0/*FIXADDR_START*/, "Fixmap Area" },
#endif
{ -1, NULL } /* End of list */
};
/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
/*
* Print a readable form of a pgprot_t to the seq_file
*/
static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
{
pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
{ "cr3", "pgd", "pud", "pmd", "pte" };
if (!pgprot_val(prot)) {
/* Not present */
seq_printf(m, " ");
} else {
if (pr & _PAGE_USER)
seq_printf(m, "USR ");
else
seq_printf(m, " ");
if (pr & _PAGE_RW)
seq_printf(m, "RW ");
else
seq_printf(m, "ro ");
if (pr & _PAGE_PWT)
seq_printf(m, "PWT ");
else
seq_printf(m, " ");
if (pr & _PAGE_PCD)
seq_printf(m, "PCD ");
else
seq_printf(m, " ");
/* Bit 9 has a different meaning on level 3 vs 4 */
if (level <= 3) {
if (pr & _PAGE_PSE)
seq_printf(m, "PSE ");
else
seq_printf(m, " ");
} else {
if (pr & _PAGE_PAT)
seq_printf(m, "pat ");
else
seq_printf(m, " ");
}
if (pr & _PAGE_GLOBAL)
seq_printf(m, "GLB ");
else
seq_printf(m, " ");
if (pr & _PAGE_NX)
seq_printf(m, "NX ");
else
seq_printf(m, "x ");
}
seq_printf(m, "%s\n", level_name[level]);
}
/*
* On 64 bits, sign-extend the 48 bit address to 64 bit
*/
static unsigned long normalize_addr(unsigned long u)
{
#ifdef CONFIG_X86_64
return (signed long)(u << 16) >> 16;
#else
return u;
#endif
}
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
* print what we collected so far.
*/
static void note_page(struct seq_file *m, struct pg_state *st,
pgprot_t new_prot, int level)
{
pgprotval_t prot, cur;
static const char units[] = "KMGTPE";
/*
* If we have a "break" in the series, we need to flush the state that
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
if (!st->level) {
/* First entry */
st->current_prot = new_prot;
st->level = level;
st->marker = address_markers;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
} else if (prot != cur || level != st->level ||
st->current_address >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
/*
* Now print the actual finished series
*/
seq_printf(m, "0x%0*lx-0x%0*lx ",
width, st->start_address,
width, st->current_address);
delta = (st->current_address - st->start_address) >> 10;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
}
seq_printf(m, "%9lu%c ", delta, *unit);
printk_prot(m, st->current_prot, st->level);
/*
* We print markers for special areas of address space,
* such as the start of vmalloc space etc.
* This helps in the interpretation.
*/
if (st->current_address >= st->marker[1].start_address) {
st->marker++;
seq_printf(m, "---[ %s ]---\n", st->marker->name);
}
st->start_address = st->current_address;
st->current_prot = new_prot;
st->level = level;
}
}
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
unsigned long P)
{
int i;
pte_t *start;
start = (pte_t *) pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
pgprot_t prot = pte_pgprot(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
note_page(m, st, prot, 4);
start++;
}
}
#if PTRS_PER_PMD > 1
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
unsigned long P)
{
int i;
pmd_t *start;
start = (pmd_t *) pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
if (pmd_large(*start) || !pmd_present(*start))
note_page(m, st, __pgprot(prot), 3);
else
walk_pte_level(m, st, *start,
P + i * PMD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 3);
start++;
}
}
#else
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a) pmd_none(__pmd(pud_val(a)))
#endif
#if PTRS_PER_PUD > 1
static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
unsigned long P)
{
int i;
pud_t *start;
start = (pud_t *) pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
if (!pud_none(*start)) {
pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
if (pud_large(*start) || !pud_present(*start))
note_page(m, st, __pgprot(prot), 2);
else
walk_pmd_level(m, st, *start,
P + i * PUD_LEVEL_MULT);
} else
note_page(m, st, __pgprot(0), 2);
start++;
}
}
#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
#define pgd_large(a) pud_large(__pud(pgd_val(a)))
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#endif
static void walk_pgd_level(struct seq_file *m)
{
#ifdef CONFIG_X86_64
pgd_t *start = (pgd_t *) &init_level4_pgt;
#else
pgd_t *start = swapper_pg_dir;
#endif
int i;
struct pg_state st;
memset(&st, 0, sizeof(st));
for (i = 0; i < PTRS_PER_PGD; i++) {
st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
if (!pgd_none(*start)) {
pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
if (pgd_large(*start) || !pgd_present(*start))
note_page(m, &st, __pgprot(prot), 1);
else
walk_pud_level(m, &st, *start,
i * PGD_LEVEL_MULT);
} else
note_page(m, &st, __pgprot(0), 1);
start++;
}
/* Flush out the last page */
st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
note_page(m, &st, __pgprot(0), 0);
}
static int ptdump_show(struct seq_file *m, void *v)
{
walk_pgd_level(m);
return 0;
}
static int ptdump_open(struct inode *inode, struct file *filp)
{
return single_open(filp, ptdump_show, NULL);
}
static const struct file_operations ptdump_fops = {
.open = ptdump_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static int pt_dump_init(void)
{
struct dentry *pe;
#ifdef CONFIG_X86_32
/* Not a compile-time constant on x86-32 */
address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif
pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
&ptdump_fops);
if (!pe)
return -ENOMEM;
return 0;
}
__initcall(pt_dump_init);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");

169
arch/x86/mm/extable.c Normal file
View File

@@ -0,0 +1,169 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sort.h>
#include <asm/uaccess.h>
static inline unsigned long
ex_insn_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->insn + x->insn;
}
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
}
int fixup_exception(struct pt_regs *regs)
{
const struct exception_table_entry *fixup;
unsigned long new_ip;
#ifdef CONFIG_PNPBIOS
if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
extern u32 pnp_bios_is_utter_crap;
pnp_bios_is_utter_crap = 1;
printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
__asm__ volatile(
"movl %0, %%esp\n\t"
"jmp *%1\n\t"
: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
panic("do_trap: can't hit this");
}
#endif
fixup = search_exception_tables(regs->ip);
if (fixup) {
new_ip = ex_fixup_addr(fixup);
if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
/* Special hack for uaccess_err */
current_thread_info()->uaccess_err = 1;
new_ip -= 0x7ffffff0;
}
regs->ip = new_ip;
return 1;
}
return 0;
}
/* Restricted version used during very early boot */
int __init early_fixup_exception(unsigned long *ip)
{
const struct exception_table_entry *fixup;
unsigned long new_ip;
fixup = search_exception_tables(*ip);
if (fixup) {
new_ip = ex_fixup_addr(fixup);
if (fixup->fixup - fixup->insn >= 0x7ffffff0 - 4) {
/* uaccess handling not supported during early boot */
return 0;
}
*ip = new_ip;
return 1;
}
return 0;
}
/*
* Search one exception table for an entry corresponding to the
* given instruction address, and return the address of the entry,
* or NULL if none is found.
* We use a binary search, and thus we assume that the table is
* already sorted.
*/
const struct exception_table_entry *
search_extable(const struct exception_table_entry *first,
const struct exception_table_entry *last,
unsigned long value)
{
while (first <= last) {
const struct exception_table_entry *mid;
unsigned long addr;
mid = ((last - first) >> 1) + first;
addr = ex_insn_addr(mid);
if (addr < value)
first = mid + 1;
else if (addr > value)
last = mid - 1;
else
return mid;
}
return NULL;
}
/*
* The exception table needs to be sorted so that the binary
* search that we use to find entries in it works properly.
* This is used both for the kernel exception table and for
* the exception tables of modules that get loaded.
*
*/
static int cmp_ex(const void *a, const void *b)
{
const struct exception_table_entry *x = a, *y = b;
/*
* This value will always end up fittin in an int, because on
* both i386 and x86-64 the kernel symbol-reachable address
* space is < 2 GiB.
*
* This compare is only valid after normalization.
*/
return x->insn - y->insn;
}
void sort_extable(struct exception_table_entry *start,
struct exception_table_entry *finish)
{
struct exception_table_entry *p;
int i;
/* Convert all entries to being relative to the start of the section */
i = 0;
for (p = start; p < finish; p++) {
p->insn += i;
i += 4;
p->fixup += i;
i += 4;
}
sort(start, finish - start, sizeof(struct exception_table_entry),
cmp_ex, NULL);
/* Denormalize all entries */
i = 0;
for (p = start; p < finish; p++) {
p->insn -= i;
i += 4;
p->fixup -= i;
i += 4;
}
}
#ifdef CONFIG_MODULES
/*
* If the exception table is sorted, any referring to the module init
* will be at the beginning or the end.
*/
void trim_init_extable(struct module *m)
{
/*trim the beginning*/
while (m->num_exentries &&
within_module_init(ex_insn_addr(&m->extable[0]), m)) {
m->extable++;
m->num_exentries--;
}
/*trim the end*/
while (m->num_exentries &&
within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
m->num_exentries--;
}
#endif /* CONFIG_MODULES */

1238
arch/x86/mm/fault.c Normal file

File diff suppressed because it is too large Load Diff

393
arch/x86/mm/gup.c Normal file
View File

@@ -0,0 +1,393 @@
/*
* Lockless get_user_pages_fast for x86
*
* Copyright (C) 2008 Nick Piggin
* Copyright (C) 2008 Novell Inc.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
return ACCESS_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
* any locks. For this we would like to load the pointers atomically,
* but that is not possible (without expensive cmpxchg8b) on PAE. What
* we do have is the guarantee that a pte will only either go from not
* present to present, or present to not present or both -- it will not
* switch to a completely different present page without a TLB flush in
* between; something that we are blocking by holding interrupts off.
*
* Setting ptes from not present to present goes:
* ptep->pte_high = h;
* smp_wmb();
* ptep->pte_low = l;
*
* And present to not present goes:
* ptep->pte_low = 0;
* smp_wmb();
* ptep->pte_high = 0;
*
* We must ensure here that the load of pte_low sees l iff pte_high
* sees h. We load pte_high *after* loading pte_low, which ensures we
* don't see an older value of pte_high. *Then* we recheck pte_low,
* which ensures that we haven't picked up a changed pte high. We might
* have got rubbish values from pte_low and pte_high, but we are
* guaranteed that pte_low will not have the present bit set *unless*
* it is 'l'. And get_user_pages_fast only operates on present ptes, so
* we're safe.
*
* gup_get_pte should not be used or copied outside gup.c without being
* very careful -- it does not atomically load the pte or anything that
* is likely to be useful for you.
*/
pte_t pte;
retry:
pte.pte_low = ptep->pte_low;
smp_rmb();
pte.pte_high = ptep->pte_high;
smp_rmb();
if (unlikely(pte.pte_low != ptep->pte_low))
goto retry;
return pte;
#endif
}
/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
*/
static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t *ptep;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
struct page *page;
if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
pte_unmap(ptep);
return 0;
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
} while (ptep++, addr += PAGE_SIZE, addr != end);
pte_unmap(ptep - 1);
return 1;
}
static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
SetPageReferenced(page);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t pte = *(pte_t *)&pmd;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
head = pte_page(pte);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
pmdp = pmd_offset(&pud, addr);
do {
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
/*
* The pmd_trans_splitting() check below explains why
* pmdp_splitting_flush has to flush the tlb, to stop
* this gup-fast code from running while we set the
* splitting bit in the pmd. Returning zero will take
* the slow path that will call wait_split_huge_page()
* if the pmd is still in splitting state. gup-fast
* can't because it has irq disabled and
* wait_split_huge_page() would never return as the
* tlb flush IPI wouldn't run.
*/
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pte_range(pmd, addr, next, write, pages, nr))
return 0;
}
} while (pmdp++, addr = next, addr != end);
return 1;
}
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
unsigned long mask;
pte_t pte = *(pte_t *)&pud;
struct page *head, *page;
int refs;
mask = _PAGE_PRESENT|_PAGE_USER;
if (write)
mask |= _PAGE_RW;
if ((pte_flags(pte) & mask) != mask)
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
refs = 0;
head = pte_page(pte);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
if (PageTail(page))
get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
} while (addr += PAGE_SIZE, addr != end);
get_head_page_multiple(head, refs);
return 1;
}
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset(&pgd, addr);
do {
pud_t pud = *pudp;
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
if (unlikely(pud_large(pud))) {
if (!gup_huge_pud(pud, addr, next, write, pages, nr))
return 0;
} else {
if (!gup_pmd_range(pud, addr, next, write, pages, nr))
return 0;
}
} while (pudp++, addr = next, addr != end);
return 1;
}
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
unsigned long flags;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
(void __user *)start, len)))
return 0;
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
local_irq_save(flags);
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
return nr;
}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @write: whether pages will be written to
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
* Attempt to pin user pages in memory without taking mm->mmap_sem.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
unsigned long next;
pgd_t *pgdp;
int nr = 0;
start &= PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
if (end < start)
goto slow_irqon;
#ifdef CONFIG_X86_64
if (end >> __VIRTUAL_MASK_SHIFT)
goto slow_irqon;
#endif
/*
* XXX: batch / limit 'nr', to avoid large irq off latency
* needs some instrumenting to determine the common sizes used by
* important workloads (eg. DB2), and whether limiting the batch size
* will decrease performance.
*
* It seems like we're in the clear for the moment. Direct-IO is
* the main guy that batches up lots of get_user_pages, and even
* they are limited to 64-at-a-time which is not so many.
*/
/*
* This doesn't prevent pagetable teardown, but does prevent
* the pagetables and pages from being freed on x86.
*
* So long as we atomically load page table pointers versus teardown
* (which we do on x86, with the above PAE exception), we can follow the
* address down to the the page and take a ref on it.
*/
local_irq_disable();
pgdp = pgd_offset(mm, addr);
do {
pgd_t pgd = *pgdp;
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
return nr;
{
int ret;
slow:
local_irq_enable();
slow_irqon:
/* Try to get the remaining pages with get_user_pages */
start += nr << PAGE_SHIFT;
pages += nr;
down_read(&mm->mmap_sem);
ret = get_user_pages(current, mm, start,
(end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
up_read(&mm->mmap_sem);
/* Have to be a bit careful with return values */
if (nr > 0) {
if (ret < 0)
ret = nr;
else
ret += nr;
}
return ret;
}
}

146
arch/x86/mm/highmem_32.c Normal file
View File

@@ -0,0 +1,146 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h> /* for totalram_pages */
#include <linux/bootmem.h>
void *kmap(struct page *page)
{
might_sleep();
if (!PageHighMem(page))
return page_address(page);
return kmap_high(page);
}
EXPORT_SYMBOL(kmap);
void kunmap(struct page *page)
{
if (in_interrupt())
BUG();
if (!PageHighMem(page))
return;
kunmap_high(page);
}
EXPORT_SYMBOL(kunmap);
/*
* kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
* no global lock is needed and because the kmap code must perform a global TLB
* invalidation when the kmap pool wraps.
*
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
pagefault_disable();
if (!PageHighMem(page))
return page_address(page);
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
BUG_ON(!pte_none(*(kmap_pte-idx)));
set_pte(kmap_pte-idx, mk_pte(page, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
EXPORT_SYMBOL(kmap_atomic_prot);
void *kmap_atomic(struct page *page)
{
return kmap_atomic_prot(page, kmap_prot);
}
EXPORT_SYMBOL(kmap_atomic);
/*
* This is the same as kmap_atomic() but can map memory that doesn't
* have a struct page associated with it.
*/
void *kmap_atomic_pfn(unsigned long pfn)
{
return kmap_atomic_prot_pfn(pfn, kmap_prot);
}
EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
void __kunmap_atomic(void *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
int idx, type;
type = kmap_atomic_idx();
idx = type + KM_TYPE_NR * smp_processor_id();
#ifdef CONFIG_DEBUG_HIGHMEM
WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
#endif
/*
* Force other mappings to Oops if they'll try to access this
* pte without first remap it. Keeping stale mappings around
* is a bad idea also, in case the page changes cacheability
* attributes or becomes a protected page in a hypervisor.
*/
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
arch_flush_lazy_mmu_mode();
}
#ifdef CONFIG_DEBUG_HIGHMEM
else {
BUG_ON(vaddr < PAGE_OFFSET);
BUG_ON(vaddr >= (unsigned long)high_memory);
}
#endif
pagefault_enable();
}
EXPORT_SYMBOL(__kunmap_atomic);
struct page *kmap_atomic_to_page(void *ptr)
{
unsigned long idx, vaddr = (unsigned long)ptr;
pte_t *pte;
if (vaddr < FIXADDR_START)
return virt_to_page(ptr);
idx = virt_to_fix(vaddr);
pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
return pte_page(*pte);
}
EXPORT_SYMBOL(kmap_atomic_to_page);
void __init set_highmem_pages_init(void)
{
struct zone *zone;
int nid;
/*
* Explicitly reset zone->managed_pages because set_highmem_pages_init()
* is invoked before free_all_bootmem()
*/
reset_all_zones_managed_pages();
for_each_zone(zone) {
unsigned long zone_start_pfn, zone_end_pfn;
if (!is_highmem(zone))
continue;
zone_start_pfn = zone->zone_start_pfn;
zone_end_pfn = zone_start_pfn + zone->spanned_pages;
nid = zone_to_nid(zone);
printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n",
zone->name, nid, zone_start_pfn, zone_end_pfn);
add_highpages_with_active_regions(nid, zone_start_pfn,
zone_end_pfn);
}
}

373
arch/x86/mm/hugetlbpage.c Normal file
View File

@@ -0,0 +1,373 @@
/*
* IA-32 Huge TLB Page Support for Kernel.
*
* Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
*/
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
{
unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
svma->vm_start;
unsigned long sbase = saddr & PUD_MASK;
unsigned long s_end = sbase + PUD_SIZE;
/* Allow segments to share if only one is marked locked */
unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
/*
* match the virtual addresses, permission and the alignment of the
* page table page.
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
sbase < svma->vm_start || svma->vm_end < s_end)
return 0;
return saddr;
}
static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
{
unsigned long base = addr & PUD_MASK;
unsigned long end = base + PUD_SIZE;
/*
* check on proper vm_flags and page table alignment
*/
if (vma->vm_flags & VM_MAYSHARE &&
vma->vm_start <= base && end <= vma->vm_end)
return 1;
return 0;
}
/*
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
* code much cleaner. pmd allocation is essential for the shared case because
* pud has to be populated inside the same i_mmap_mutex section - otherwise
* racing tasks could either miss the sharing (see huge_pte_offset) or select a
* bad pmd for sharing.
*/
static pte_t *
huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
struct vm_area_struct *vma = find_vma(mm, addr);
struct address_space *mapping = vma->vm_file->f_mapping;
pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
vma->vm_pgoff;
struct vm_area_struct *svma;
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
mutex_lock(&mapping->i_mmap_mutex);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
saddr = page_table_shareable(svma, vma, addr, idx);
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
get_page(virt_to_page(spte));
break;
}
}
}
if (!spte)
goto out;
spin_lock(&mm->page_table_lock);
if (pud_none(*pud))
pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK));
else
put_page(virt_to_page(spte));
spin_unlock(&mm->page_table_lock);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
mutex_unlock(&mapping->i_mmap_mutex);
return pte;
}
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* called with vma->vm_mm->page_table_lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
pud_t *pud = pud_offset(pgd, *addr);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
pud_clear(pud);
put_page(virt_to_page(ptep));
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
{
pgd_t *pgd;
pud_t *pud;
pte_t *pte = NULL;
pgd = pgd_offset(mm, addr);
pud = pud_alloc(mm, pgd, addr);
if (pud) {
if (sz == PUD_SIZE) {
pte = (pte_t *)pud;
} else {
BUG_ON(sz != PMD_SIZE);
if (pud_none(*pud))
pte = huge_pmd_share(mm, addr, pud);
else
pte = (pte_t *)pmd_alloc(mm, pud, addr);
}
}
BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
return pte;
}
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd = NULL;
pgd = pgd_offset(mm, addr);
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, addr);
if (pud_present(*pud)) {
if (pud_large(*pud))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
}
}
return (pte_t *) pmd;
}
#if 0 /* This is just for testing */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
unsigned long start = address;
int length = 1;
int nr;
struct page *page;
struct vm_area_struct *vma;
vma = find_vma(mm, addr);
if (!vma || !is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
pte = huge_pte_offset(mm, address);
/* hugetlb should be locked, and hence, prefaulted */
WARN_ON(!pte || pte_none(*pte));
page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
WARN_ON(!PageHead(page));
return page;
}
int pmd_huge(pmd_t pmd)
{
return 0;
}
int pud_huge(pud_t pud)
{
return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
return NULL;
}
#else
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
return ERR_PTR(-EINVAL);
}
int pmd_huge(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_PSE);
}
int pud_huge(pud_t pud)
{
return !!(pud_val(pud) & _PAGE_PSE);
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
pmd_t *pmd, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pmd);
if (page)
page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
return page;
}
struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write)
{
struct page *page;
page = pte_page(*(pte_t *)pud);
if (page)
page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
return page;
}
#endif
/* x86_64 also uses this file */
#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
info.flags = 0;
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
unsigned long addr0, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
unsigned long addr;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = current->mm->mmap_base;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
* allocations.
*/
if (addr & ~PAGE_MASK) {
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
addr = vm_unmapped_area(&info);
}
return addr;
}
unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
struct hstate *h = hstate_file(file);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
if (len & ~huge_page_mask(h))
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
return addr;
}
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
if (mm->get_unmapped_area == arch_get_unmapped_area)
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
else
return hugetlb_get_unmapped_area_topdown(file, addr, len,
pgoff, flags);
}
#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
#ifdef CONFIG_X86_64
static __init int setup_hugepagesz(char *opt)
{
unsigned long ps = memparse(opt, &opt);
if (ps == PMD_SIZE) {
hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
} else if (ps == PUD_SIZE && cpu_has_gbpages) {
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
} else {
printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
ps >> 20);
return 0;
}
return 1;
}
__setup("hugepagesz=", setup_hugepagesz);
#endif

592
arch/x86/mm/init.c Normal file
View File

@@ -0,0 +1,592 @@
#include <linux/gfp.h>
#include <linux/initrd.h>
#include <linux/ioport.h>
#include <linux/swap.h>
#include <linux/memblock.h>
#include <linux/bootmem.h> /* for max_low_pfn */
#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/init.h>
#include <asm/page.h>
#include <asm/page_types.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/proto.h>
#include <asm/dma.h> /* for MAX_DMA_PFN */
#include <asm/microcode.h>
#include "mm_internal.h"
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
static unsigned long __initdata pgt_buf_top;
static unsigned long min_pfn_mapped;
static bool __initdata can_use_brk_pgt = true;
/*
* Pages returned are already directly mapped.
*
* Changing that is likely to break Xen, see commit:
*
* 279b706 x86,xen: introduce x86_init.mapping.pagetable_reserve
*
* for detailed information.
*/
__ref void *alloc_low_pages(unsigned int num)
{
unsigned long pfn;
int i;
if (after_bootmem) {
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
__GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
unsigned long ret;
if (min_pfn_mapped >= max_pfn_mapped)
panic("alloc_low_page: ran out of memory");
ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
if (!ret)
panic("alloc_low_page: can not alloc memory");
memblock_reserve(ret, PAGE_SIZE * num);
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
for (i = 0; i < num; i++) {
void *adr;
adr = __va((pfn + i) << PAGE_SHIFT);
clear_page(adr);
}
return __va(pfn << PAGE_SHIFT);
}
/* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */
#define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
{
unsigned long tables = INIT_PGT_BUF_SIZE;
phys_addr_t base;
base = __pa(extend_brk(tables, PAGE_SIZE));
pgt_buf_start = base >> PAGE_SHIFT;
pgt_buf_end = pgt_buf_start;
pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
}
int after_bootmem;
int direct_gbpages
#ifdef CONFIG_DIRECT_GBPAGES
= 1
#endif
;
static void __init init_gbpages(void)
{
#ifdef CONFIG_X86_64
if (direct_gbpages && cpu_has_gbpages)
printk(KERN_INFO "Using GB pages for direct mapping\n");
else
direct_gbpages = 0;
#endif
}
struct map_range {
unsigned long start;
unsigned long end;
unsigned page_size_mask;
};
static int page_size_mask;
static void __init probe_page_size_mask(void)
{
init_gbpages();
#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
/*
* For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
if (direct_gbpages)
page_size_mask |= 1 << PG_LEVEL_1G;
if (cpu_has_pse)
page_size_mask |= 1 << PG_LEVEL_2M;
#endif
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
/* Enable PGE if available */
if (cpu_has_pge) {
set_in_cr4(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
}
}
#ifdef CONFIG_X86_32
#define NR_RANGE_MR 3
#else /* CONFIG_X86_64 */
#define NR_RANGE_MR 5
#endif
static int __meminit save_mr(struct map_range *mr, int nr_range,
unsigned long start_pfn, unsigned long end_pfn,
unsigned long page_size_mask)
{
if (start_pfn < end_pfn) {
if (nr_range >= NR_RANGE_MR)
panic("run out of range for init_memory_mapping\n");
mr[nr_range].start = start_pfn<<PAGE_SHIFT;
mr[nr_range].end = end_pfn<<PAGE_SHIFT;
mr[nr_range].page_size_mask = page_size_mask;
nr_range++;
}
return nr_range;
}
/*
* adjust the page_size_mask for small range to go with
* big page size instead small one if nearby are ram too.
*/
static void __init_refok adjust_range_page_size_mask(struct map_range *mr,
int nr_range)
{
int i;
for (i = 0; i < nr_range; i++) {
if ((page_size_mask & (1<<PG_LEVEL_2M)) &&
!(mr[i].page_size_mask & (1<<PG_LEVEL_2M))) {
unsigned long start = round_down(mr[i].start, PMD_SIZE);
unsigned long end = round_up(mr[i].end, PMD_SIZE);
#ifdef CONFIG_X86_32
if ((end >> PAGE_SHIFT) > max_low_pfn)
continue;
#endif
if (memblock_is_region_memory(start, end - start))
mr[i].page_size_mask |= 1<<PG_LEVEL_2M;
}
if ((page_size_mask & (1<<PG_LEVEL_1G)) &&
!(mr[i].page_size_mask & (1<<PG_LEVEL_1G))) {
unsigned long start = round_down(mr[i].start, PUD_SIZE);
unsigned long end = round_up(mr[i].end, PUD_SIZE);
if (memblock_is_region_memory(start, end - start))
mr[i].page_size_mask |= 1<<PG_LEVEL_1G;
}
}
}
static int __meminit split_mem_range(struct map_range *mr, int nr_range,
unsigned long start,
unsigned long end)
{
unsigned long start_pfn, end_pfn, limit_pfn;
unsigned long pfn;
int i;
limit_pfn = PFN_DOWN(end);
/* head if not big page alignment ? */
pfn = start_pfn = PFN_DOWN(start);
#ifdef CONFIG_X86_32
/*
* Don't use a large page for the first 2/4MB of memory
* because there are often fixed size MTRRs in there
* and overlapping MTRRs into large pages can cause
* slowdowns.
*/
if (pfn == 0)
end_pfn = PFN_DOWN(PMD_SIZE);
else
end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
end_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#endif
if (end_pfn > limit_pfn)
end_pfn = limit_pfn;
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
pfn = end_pfn;
}
/* big page (2M) range */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
#ifdef CONFIG_X86_32
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#else /* CONFIG_X86_64 */
end_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
if (end_pfn > round_down(limit_pfn, PFN_DOWN(PMD_SIZE)))
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
#endif
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pfn = end_pfn;
}
#ifdef CONFIG_X86_64
/* big page (1G) range */
start_pfn = round_up(pfn, PFN_DOWN(PUD_SIZE));
end_pfn = round_down(limit_pfn, PFN_DOWN(PUD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask &
((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
pfn = end_pfn;
}
/* tail is not big page (1G) alignment */
start_pfn = round_up(pfn, PFN_DOWN(PMD_SIZE));
end_pfn = round_down(limit_pfn, PFN_DOWN(PMD_SIZE));
if (start_pfn < end_pfn) {
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
page_size_mask & (1<<PG_LEVEL_2M));
pfn = end_pfn;
}
#endif
/* tail is not big page (2M) alignment */
start_pfn = pfn;
end_pfn = limit_pfn;
nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
if (!after_bootmem)
adjust_range_page_size_mask(mr, nr_range);
/* try to merge same page size and continuous */
for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
unsigned long old_start;
if (mr[i].end != mr[i+1].start ||
mr[i].page_size_mask != mr[i+1].page_size_mask)
continue;
/* move it */
old_start = mr[i].start;
memmove(&mr[i], &mr[i+1],
(nr_range - 1 - i) * sizeof(struct map_range));
mr[i--].start = old_start;
nr_range--;
}
for (i = 0; i < nr_range; i++)
printk(KERN_DEBUG " [mem %#010lx-%#010lx] page %s\n",
mr[i].start, mr[i].end - 1,
(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
(mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
return nr_range;
}
struct range pfn_mapped[E820_X_MAX];
int nr_pfn_mapped;
static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
nr_pfn_mapped, start_pfn, end_pfn);
nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
max_pfn_mapped = max(max_pfn_mapped, end_pfn);
if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
max_low_pfn_mapped = max(max_low_pfn_mapped,
min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
}
bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
{
int i;
for (i = 0; i < nr_pfn_mapped; i++)
if ((start_pfn >= pfn_mapped[i].start) &&
(end_pfn <= pfn_mapped[i].end))
return true;
return false;
}
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __init_refok init_memory_mapping(unsigned long start,
unsigned long end)
{
struct map_range mr[NR_RANGE_MR];
unsigned long ret = 0;
int nr_range, i;
pr_info("init_memory_mapping: [mem %#010lx-%#010lx]\n",
start, end - 1);
memset(mr, 0, sizeof(mr));
nr_range = split_mem_range(mr, 0, start, end);
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
mr[i].page_size_mask);
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
return ret >> PAGE_SHIFT;
}
/*
* We need to iterate through the E820 memory map and create direct mappings
* for only E820_RAM and E820_KERN_RESERVED regions. We cannot simply
* create direct mappings for all pfns from [0 to max_low_pfn) and
* [4GB to max_pfn) because of possible memory holes in high addresses
* that cannot be marked as UC by fixed/variable range MTRRs.
* Depending on the alignment of E820 ranges, this may possibly result
* in using smaller size (i.e. 4K instead of 2M or 1G) page tables.
*
* init_mem_mapping() calls init_range_memory_mapping() with big range.
* That range would have hole in the middle or ends, and only ram parts
* will be mapped in init_range_memory_mapping().
*/
static unsigned long __init init_range_memory_mapping(
unsigned long r_start,
unsigned long r_end)
{
unsigned long start_pfn, end_pfn;
unsigned long mapped_ram_size = 0;
int i;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
u64 start = clamp_val(PFN_PHYS(start_pfn), r_start, r_end);
u64 end = clamp_val(PFN_PHYS(end_pfn), r_start, r_end);
if (start >= end)
continue;
/*
* if it is overlapping with brk pgt, we need to
* alloc pgt buf from memblock instead.
*/
can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
init_memory_mapping(start, end);
mapped_ram_size += end - start;
can_use_brk_pgt = true;
}
return mapped_ram_size;
}
/* (PUD_SHIFT-PMD_SHIFT)/2 */
#define STEP_SIZE_SHIFT 5
void __init init_mem_mapping(void)
{
unsigned long end, real_end, start, last_start;
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
unsigned long new_mapped_ram_size;
probe_page_size_mask();
#ifdef CONFIG_X86_64
end = max_pfn << PAGE_SHIFT;
#else
end = max_low_pfn << PAGE_SHIFT;
#endif
/* the ISA range is always mapped regardless of memory holes */
init_memory_mapping(0, ISA_END_ADDRESS);
/* xen has big range in reserved near end of ram, skip it at first.*/
addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
max_pfn_mapped = 0; /* will get exact value next */
min_pfn_mapped = real_end >> PAGE_SHIFT;
last_start = start = real_end;
/*
* We start from the top (end of memory) and go to the bottom.
* The memblock_find_in_range() gets us a block of RAM from the
* end of RAM in [min_pfn_mapped, max_pfn_mapped) used as new pages
* for page table.
*/
while (last_start > ISA_END_ADDRESS) {
if (last_start > step_size) {
start = round_down(last_start - 1, step_size);
if (start < ISA_END_ADDRESS)
start = ISA_END_ADDRESS;
} else
start = ISA_END_ADDRESS;
new_mapped_ram_size = init_range_memory_mapping(start,
last_start);
last_start = start;
min_pfn_mapped = last_start >> PAGE_SHIFT;
/* only increase step_size after big range get mapped */
if (new_mapped_ram_size > mapped_ram_size)
step_size <<= STEP_SIZE_SHIFT;
mapped_ram_size += new_mapped_ram_size;
}
if (real_end < end)
init_range_memory_mapping(real_end, end);
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
/* can we preseve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
#else
early_ioremap_page_table_range_init();
#endif
load_cr3(swapper_pg_dir);
__flush_tlb_all();
early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
}
/*
* devmem_is_allowed() checks to see if /dev/mem access to a certain address
* is valid. The argument is a physical page number.
*
*
* On x86, access has to be given to the first megabyte of ram because that area
* contains bios code and data regions used by X and dosemu and similar apps.
* Access has to be given to non-kernel-ram areas as well, these contain the PCI
* mmio resources as well as potential bios/acpi data regions.
*/
int devmem_is_allowed(unsigned long pagenr)
{
if (pagenr < 256)
return 1;
if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
return 0;
if (!page_is_ram(pagenr))
return 1;
return 0;
}
void free_init_pages(char *what, unsigned long begin, unsigned long end)
{
unsigned long addr;
unsigned long begin_aligned, end_aligned;
/* Make sure boundaries are page aligned */
begin_aligned = PAGE_ALIGN(begin);
end_aligned = end & PAGE_MASK;
if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
begin = begin_aligned;
end = end_aligned;
}
if (begin >= end)
return;
addr = begin;
/*
* If debugging page accesses then do not free this memory but
* mark them not present - any buggy init-section access will
* create a kernel page fault:
*/
#ifdef CONFIG_DEBUG_PAGEALLOC
printk(KERN_INFO "debug: unmapping init [mem %#010lx-%#010lx]\n",
begin, end - 1);
set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
#else
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
* writeable and non-executable first.
*/
set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
for (; addr < end; addr += PAGE_SIZE) {
memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
free_reserved_page(virt_to_page(addr));
}
#endif
}
void free_initmem(void)
{
free_init_pages("unused kernel memory",
(unsigned long)(&__init_begin),
(unsigned long)(&__init_end));
}
#ifdef CONFIG_BLK_DEV_INITRD
void __init free_initrd_mem(unsigned long start, unsigned long end)
{
#ifdef CONFIG_MICROCODE_EARLY
/*
* Remember, initrd memory may contain microcode or other useful things.
* Before we lose initrd mem, we need to find a place to hold them
* now that normal virtual memory is enabled.
*/
save_microcode_in_initrd();
#endif
/*
* end could be not aligned, and We can not align that,
* decompresser could be confused by aligned initrd_end
* We already reserve the end partial page before in
* - i386_start_kernel()
* - x86_64_start_kernel()
* - relocate_initrd()
* So here We can do PAGE_ALIGN() safely to get partial page to be freed
*/
free_init_pages("initrd memory", start, PAGE_ALIGN(end));
}
#endif
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
#endif
#ifdef CONFIG_ZONE_DMA32
max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
#endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
free_area_init_nodes(max_zone_pfns);
}

976
arch/x86/mm/init_32.c Normal file
View File

@@ -0,0 +1,976 @@
/*
*
* Copyright (C) 1995 Linus Torvalds
*
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
*/
#include <linux/module.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/memory_hotplug.h>
#include <linux/initrd.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <asm/asm.h>
#include <asm/bios_ebda.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/olpc_ofw.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
#include <asm/setup.h>
#include <asm/cacheflush.h>
#include <asm/page_types.h>
#include <asm/init.h>
#include "mm_internal.h"
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
/*
* Creates a middle page table and puts a pointer to it in the
* given global directory entry. This only returns the gd entry
* in non-PAE compilation mode, since the middle layer is folded.
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
pud_t *pud;
pmd_t *pmd_table;
#ifdef CONFIG_X86_PAE
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
}
/*
* Create a page table and place a pointer to it in a middle page
* directory entry:
*/
static pte_t * __init one_page_table_init(pmd_t *pmd)
{
if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
pte_t *page_table = (pte_t *)alloc_low_page();
paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
return pte_offset_kernel(pmd, 0);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
int pgd_idx = pgd_index(vaddr);
int pmd_idx = pmd_index(vaddr);
return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
}
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
int pte_idx = pte_index(vaddr);
pmd_t *pmd;
pmd = populate_extra_pmd(vaddr);
return one_page_table_init(pmd) + pte_idx;
}
static unsigned long __init
page_table_range_init_count(unsigned long start, unsigned long end)
{
unsigned long count = 0;
#ifdef CONFIG_HIGHMEM
int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
int pgd_idx, pmd_idx;
unsigned long vaddr;
if (pmd_idx_kmap_begin == pmd_idx_kmap_end)
return 0;
vaddr = start;
pgd_idx = pgd_index(vaddr);
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd_idx++) {
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd_idx++) {
if ((vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin &&
(vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end)
count++;
vaddr += PMD_SIZE;
}
pmd_idx = 0;
}
#endif
return count;
}
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte,
void **adr)
{
#ifdef CONFIG_HIGHMEM
/*
* Something (early fixmap) may already have put a pte
* page here, which causes the page table allocation
* to become nonlinear. Attempt to fix it, and if it
* is still nonlinear then we have to bug.
*/
int pmd_idx_kmap_begin = fix_to_virt(FIX_KMAP_END) >> PMD_SHIFT;
int pmd_idx_kmap_end = fix_to_virt(FIX_KMAP_BEGIN) >> PMD_SHIFT;
if (pmd_idx_kmap_begin != pmd_idx_kmap_end
&& (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
&& (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end) {
pte_t *newpte;
int i;
BUG_ON(after_bootmem);
newpte = *adr;
for (i = 0; i < PTRS_PER_PTE; i++)
set_pte(newpte + i, pte[i]);
*adr = (void *)(((unsigned long)(*adr)) + PAGE_SIZE);
paravirt_alloc_pte(&init_mm, __pa(newpte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(newpte)|_PAGE_TABLE));
BUG_ON(newpte != pte_offset_kernel(pmd, 0));
__flush_tlb_all();
paravirt_release_pte(__pa(pte) >> PAGE_SHIFT);
pte = newpte;
}
BUG_ON(vaddr < fix_to_virt(FIX_KMAP_BEGIN - 1)
&& vaddr > fix_to_virt(FIX_KMAP_END)
&& lastpte && lastpte + PTRS_PER_PTE != pte);
#endif
return pte;
}
/*
* This function initializes a certain range of kernel virtual memory
* with new bootmem page tables, everywhere page tables are missing in
* the given range.
*
* NOTE: The pagetables are allocated contiguous on the physical space
* so we can cache the place of the first one and move around without
* checking the pgd every time.
*/
static void __init
page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
{
int pgd_idx, pmd_idx;
unsigned long vaddr;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte = NULL;
unsigned long count = page_table_range_init_count(start, end);
void *adr = NULL;
if (count)
adr = alloc_low_pages(count);
vaddr = start;
pgd_idx = pgd_index(vaddr);
pmd_idx = pmd_index(vaddr);
pgd = pgd_base + pgd_idx;
for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
pmd = pmd + pmd_index(vaddr);
for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end);
pmd++, pmd_idx++) {
pte = page_table_kmap_check(one_page_table_init(pmd),
pmd, vaddr, pte, &adr);
vaddr += PMD_SIZE;
}
pmd_idx = 0;
}
}
static inline int is_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
/*
* This maps the physical memory to kernel virtual address space, a total
* of max_low_pfn pages, by creating page tables starting from address
* PAGE_OFFSET:
*/
unsigned long __init
kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask)
{
int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
unsigned long last_map_addr = end;
unsigned long start_pfn, end_pfn;
pgd_t *pgd_base = swapper_pg_dir;
int pgd_idx, pmd_idx, pte_ofs;
unsigned long pfn;
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
unsigned pages_2m, pages_4k;
int mapping_iter;
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
/*
* First iteration will setup identity mapping using large/small pages
* based on use_pse, with other attributes same as set by
* the early code in head_32.S
*
* Second iteration will setup the appropriate attributes (NX, GLOBAL..)
* as desired for the kernel identity mapping.
*
* This two pass mechanism conforms to the TLB app note which says:
*
* "Software should not write to a paging-structure entry in a way
* that would change, for any linear address, both the page size
* and either the page frame or attributes."
*/
mapping_iter = 1;
if (!cpu_has_pse)
use_pse = 0;
repeat:
pages_2m = pages_4k = 0;
pfn = start_pfn;
pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
pmd = one_md_table_init(pgd);
if (pfn >= end_pfn)
continue;
#ifdef CONFIG_X86_PAE
pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pmd += pmd_idx;
#else
pmd_idx = 0;
#endif
for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
pmd++, pmd_idx++) {
unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;
/*
* Map with big pages if possible, otherwise
* create normal page tables:
*/
if (use_pse) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
/*
* first pass will use the same initial
* identity mapping attribute + _PAGE_PSE.
*/
pgprot_t init_prot =
__pgprot(PTE_IDENT_ATTR |
_PAGE_PSE);
pfn &= PMD_MASK >> PAGE_SHIFT;
addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
PAGE_OFFSET + PAGE_SIZE-1;
if (is_kernel_text(addr) ||
is_kernel_text(addr2))
prot = PAGE_KERNEL_LARGE_EXEC;
pages_2m++;
if (mapping_iter == 1)
set_pmd(pmd, pfn_pmd(pfn, init_prot));
else
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
continue;
}
pte = one_page_table_init(pmd);
pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
pte += pte_ofs;
for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
pgprot_t prot = PAGE_KERNEL;
/*
* first pass will use the same initial
* identity mapping attribute.
*/
pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
if (is_kernel_text(addr))
prot = PAGE_KERNEL_EXEC;
pages_4k++;
if (mapping_iter == 1) {
set_pte(pte, pfn_pte(pfn, init_prot));
last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
} else
set_pte(pte, pfn_pte(pfn, prot));
}
}
}
if (mapping_iter == 1) {
/*
* update direct mapping page count only in the first
* iteration.
*/
update_page_count(PG_LEVEL_2M, pages_2m);
update_page_count(PG_LEVEL_4K, pages_4k);
/*
* local global flush tlb, which will flush the previous
* mappings present in both small and large page TLB's.
*/
__flush_tlb_all();
/*
* Second iteration will set the actual desired PTE attributes.
*/
mapping_iter = 2;
goto repeat;
}
return last_map_addr;
}
pte_t *kmap_pte;
pgprot_t kmap_prot;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
}
static void __init kmap_init(void)
{
unsigned long kmap_vstart;
/*
* Cache the first kmap pte:
*/
kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
kmap_prot = PAGE_KERNEL;
}
#ifdef CONFIG_HIGHMEM
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
vaddr = PKMAP_BASE;
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;
}
void __init add_highpages_with_active_regions(int nid,
unsigned long start_pfn, unsigned long end_pfn)
{
phys_addr_t start, end;
u64 i;
for_each_free_mem_range(i, nid, &start, &end, NULL) {
unsigned long pfn = clamp_t(unsigned long, PFN_UP(start),
start_pfn, end_pfn);
unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end),
start_pfn, end_pfn);
for ( ; pfn < e_pfn; pfn++)
if (pfn_valid(pfn))
free_highmem_page(pfn_to_page(pfn));
}
}
#else
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
}
#endif /* CONFIG_HIGHMEM */
void __init native_pagetable_init(void)
{
unsigned long pfn, va;
pgd_t *pgd, *base = swapper_pg_dir;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
/*
* Remove any mappings which extend past the end of physical
* memory from the boot time page table.
* In virtual address space, we should have at least two pages
* from VMALLOC_END to pkmap or fixmap according to VMALLOC_END
* definition. And max_low_pfn is set to VMALLOC_END physical
* address. If initial memory mapping is doing right job, we
* should have pte used near max_low_pfn or one pmd is not present.
*/
for (pfn = max_low_pfn; pfn < 1<<(32-PAGE_SHIFT); pfn++) {
va = PAGE_OFFSET + (pfn<<PAGE_SHIFT);
pgd = base + pgd_index(va);
if (!pgd_present(*pgd))
break;
pud = pud_offset(pgd, va);
pmd = pmd_offset(pud, va);
if (!pmd_present(*pmd))
break;
/* should not be large page here */
if (pmd_large(*pmd)) {
pr_warn("try to clear pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx, but pmd is big page and is not using pte !\n",
pfn, pmd, __pa(pmd));
BUG_ON(1);
}
pte = pte_offset_kernel(pmd, va);
if (!pte_present(*pte))
break;
printk(KERN_DEBUG "clearing pte for ram above max_low_pfn: pfn: %lx pmd: %p pmd phys: %lx pte: %p pte phys: %lx\n",
pfn, pmd, __pa(pmd), pte, __pa(pte));
pte_clear(NULL, va, pte);
}
paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
paging_init();
}
/*
* Build a proper pagetable for the kernel mappings. Up until this
* point, we've been running on some set of pagetables constructed by
* the boot process.
*
* If we're booting on native hardware, this will be a pagetable
* constructed in arch/x86/kernel/head_32.S. The root of the
* pagetable will be swapper_pg_dir.
*
* If we're booting paravirtualized under a hypervisor, then there are
* more options: we may already be running PAE, and the pagetable may
* or may not be based in swapper_pg_dir. In any case,
* paravirt_pagetable_init() will set up swapper_pg_dir
* appropriately for the rest of the initialization to work.
*
* In general, pagetable_init() assumes that the pagetable may already
* be partially populated, and so it avoids stomping on any existing
* mappings.
*/
void __init early_ioremap_page_table_range_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
unsigned long vaddr, end;
/*
* Fixed mappings, only the page table structure has to be
* created - mappings will be set by set_fixmap():
*/
vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
page_table_range_init(vaddr, end, pgd_base);
early_ioremap_reset();
}
static void __init pagetable_init(void)
{
pgd_t *pgd_base = swapper_pg_dir;
permanent_kmaps_init(pgd_base);
}
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* user-defined highmem size */
static unsigned int highmem_pages = -1;
/*
* highmem=size forces highmem to be exactly 'size' bytes.
* This works even on boxes that have no highmem otherwise.
* This also works to reduce highmem size on bigger boxes.
*/
static int __init parse_highmem(char *arg)
{
if (!arg)
return -EINVAL;
highmem_pages = memparse(arg, &arg) >> PAGE_SHIFT;
return 0;
}
early_param("highmem", parse_highmem);
#define MSG_HIGHMEM_TOO_BIG \
"highmem size (%luMB) is bigger than pages available (%luMB)!\n"
#define MSG_LOWMEM_TOO_SMALL \
"highmem size (%luMB) results in <64MB lowmem, ignoring it!\n"
/*
* All of RAM fits into lowmem - but if user wants highmem
* artificially via the highmem=x boot parameter then create
* it:
*/
static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
if (highmem_pages == -1)
highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
if (highmem_pages >= max_pfn) {
printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
highmem_pages = 0;
}
if (highmem_pages) {
if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
max_low_pfn -= highmem_pages;
}
#else
if (highmem_pages)
printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
#define MSG_HIGHMEM_TOO_SMALL \
"only %luMB highmem pages available, ignoring highmem size of %luMB!\n"
#define MSG_HIGHMEM_TRIMMED \
"Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n"
/*
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
static void __init highmem_pfn_init(void)
{
max_low_pfn = MAXMEM_PFN;
if (highmem_pages == -1)
highmem_pages = max_pfn - MAXMEM_PFN;
if (highmem_pages + MAXMEM_PFN < max_pfn)
max_pfn = MAXMEM_PFN + highmem_pages;
if (highmem_pages + MAXMEM_PFN > max_pfn) {
printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
pages_to_mb(max_pfn - MAXMEM_PFN),
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
if (max_pfn > MAX_NONPAE_PFN)
printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
else
printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
}
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
/*
* Determine low and high memory ranges:
*/
void __init find_low_pfn_range(void)
{
/* it could update max_pfn */
if (max_pfn <= MAXMEM_PFN)
lowmem_pfn_init();
else
highmem_pfn_init();
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init initmem_init(void)
{
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0);
sparse_memory_present_with_active_regions(0);
#ifdef CONFIG_FLATMEM
max_mapnr = num_physpages;
#endif
__vmalloc_start_set = true;
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
setup_bootmem_allocator();
}
#endif /* !CONFIG_NEED_MULTIPLE_NODES */
void __init setup_bootmem_allocator(void)
{
printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
max_pfn_mapped<<PAGE_SHIFT);
printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
}
/*
* paging_init() sets up the page tables - note that the first 8MB are
* already mapped by head.S.
*
* This routines also unmaps the page at virtual kernel address 0, so
* that we can trap those pesky NULL-reference errors in the kernel.
*/
void __init paging_init(void)
{
pagetable_init();
__flush_tlb_all();
kmap_init();
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
olpc_dt_build_devicetree();
sparse_memory_present_with_active_regions(MAX_NUMNODES);
sparse_init();
zone_sizes_init();
}
/*
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
* and also on some strange 486's. All 586+'s are OK. This used to involve
* black magic jumps to work around some nasty CPU bugs, but fortunately the
* switch to using exceptions got rid of all that.
*/
static void __init test_wp_bit(void)
{
printk(KERN_INFO
"Checking if this processor honours the WP bit even in supervisor mode...");
/* Any page-aligned address will do, the test is non-destructive */
__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
boot_cpu_data.wp_works_ok = do_test_wp_bit();
clear_fixmap(FIX_WP_TEST);
if (!boot_cpu_data.wp_works_ok) {
printk(KERN_CONT "No.\n");
panic("Linux doesn't support CPUs with broken WP.");
} else {
printk(KERN_CONT "Ok.\n");
}
}
void __init mem_init(void)
{
int codesize, datasize, initsize;
pci_iommu_alloc();
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif
/*
* With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to
* be done before free_all_bootmem(). Memblock use free low memory for
* temporary data (see find_range_array()) and for this purpose can use
* pages that was already passed to the buddy allocator, hence marked as
* not accessible in the page tables when compiled with
* CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not
* important here.
*/
set_highmem_pages_init();
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
after_bootmem = 1;
codesize = (unsigned long) &_etext - (unsigned long) &_text;
datasize = (unsigned long) &_edata - (unsigned long) &_etext;
initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
"%dk data, %dk init, %ldk highmem)\n",
nr_free_pages() << (PAGE_SHIFT-10),
num_physpages << (PAGE_SHIFT-10),
codesize >> 10,
datasize >> 10,
initsize >> 10,
totalhigh_pages << (PAGE_SHIFT-10));
printk(KERN_INFO "virtual kernel memory layout:\n"
" fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#ifdef CONFIG_HIGHMEM
" pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
#endif
" vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
" lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
" .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
" .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
" .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
FIXADDR_START, FIXADDR_TOP,
(FIXADDR_TOP - FIXADDR_START) >> 10,
#ifdef CONFIG_HIGHMEM
PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
(LAST_PKMAP*PAGE_SIZE) >> 10,
#endif
VMALLOC_START, VMALLOC_END,
(VMALLOC_END - VMALLOC_START) >> 20,
(unsigned long)__va(0), (unsigned long)high_memory,
((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
(unsigned long)&__init_begin, (unsigned long)&__init_end,
((unsigned long)&__init_end -
(unsigned long)&__init_begin) >> 10,
(unsigned long)&_etext, (unsigned long)&_edata,
((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
(unsigned long)&_text, (unsigned long)&_etext,
((unsigned long)&_etext - (unsigned long)&_text) >> 10);
/*
* Check boundaries twice: Some fundamental inconsistencies can
* be detected at build time already.
*/
#define __FIXADDR_TOP (-PAGE_SIZE)
#ifdef CONFIG_HIGHMEM
BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
#define high_memory (-128UL << 20)
BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
#undef high_memory
#undef __FIXADDR_TOP
#ifdef CONFIG_RANDOMIZE_BASE
BUILD_BUG_ON(CONFIG_RANDOMIZE_BASE_MAX_OFFSET > KERNEL_IMAGE_SIZE);
#endif
#ifdef CONFIG_HIGHMEM
BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
BUG_ON(VMALLOC_END > PKMAP_BASE);
#endif
BUG_ON(VMALLOC_START >= VMALLOC_END);
BUG_ON((unsigned long)high_memory > VMALLOC_START);
if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
}
#ifdef CONFIG_MEMORY_HOTPLUG
int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdata = NODE_DATA(nid);
struct zone *zone = pgdata->node_zones + ZONE_HIGHMEM;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
return __add_pages(nid, zone, start_pfn, nr_pages);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
int arch_remove_memory(u64 start, u64 size)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct zone *zone;
zone = page_zone(pfn_to_page(start_pfn));
return __remove_pages(zone, start_pfn, nr_pages);
}
#endif
#endif
/*
* This function cannot be __init, since exceptions don't work in that
* section. Put this after the callers, so that it cannot be inlined.
*/
static noinline int do_test_wp_bit(void)
{
char tmp_reg;
int flag;
__asm__ __volatile__(
" movb %0, %1 \n"
"1: movb %1, %0 \n"
" xorl %2, %2 \n"
"2: \n"
_ASM_EXTABLE(1b,2b)
:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
"=q" (tmp_reg),
"=r" (flag)
:"2" (1)
:"memory");
return flag;
}
#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
if (!kernel_set_to_readonly)
return;
pr_debug("Set kernel text: %lx - %lx for read write\n",
start, start+size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
}
void set_kernel_text_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
if (!kernel_set_to_readonly)
return;
pr_debug("Set kernel text: %lx - %lx for read only\n",
start, start+size);
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
static void mark_nxdata_nx(void)
{
/*
* When this called, init has already been executed and released,
* so everything past _etext should be NX.
*/
unsigned long start = PFN_ALIGN(_etext);
/*
* This comes from is_kernel_text upper limit. Also HPAGE where used:
*/
unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
if (__supported_pte_mask & _PAGE_NX)
printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
}
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel text: %luk\n",
size >> 10);
kernel_set_to_readonly = 1;
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
start, start+size);
set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
size >> 10);
rodata_test();
#ifdef CONFIG_CPA_DEBUG
printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
mark_nxdata_nx();
}
#endif

1425
arch/x86/mm/init_64.c Normal file

File diff suppressed because it is too large Load Diff

119
arch/x86/mm/iomap_32.c Normal file
View File

@@ -0,0 +1,119 @@
/*
* Copyright © 2008 Ingo Molnar
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include <asm/iomap.h>
#include <asm/pat.h>
#include <linux/module.h>
#include <linux/highmem.h>
static int is_io_mapping_possible(resource_size_t base, unsigned long size)
{
#if !defined(CONFIG_X86_PAE) && defined(CONFIG_PHYS_ADDR_T_64BIT)
/* There is no way to map greater than 1 << 32 address without PAE */
if (base + size > 0x100000000ULL)
return 0;
#endif
return 1;
}
int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
{
unsigned long flag = _PAGE_CACHE_WC;
int ret;
if (!is_io_mapping_possible(base, size))
return -EINVAL;
ret = io_reserve_memtype(base, base + size, &flag);
if (ret)
return ret;
*prot = __pgprot(__PAGE_KERNEL | flag);
return 0;
}
EXPORT_SYMBOL_GPL(iomap_create_wc);
void iomap_free(resource_size_t base, unsigned long size)
{
io_free_memtype(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);
void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
unsigned long vaddr;
int idx, type;
pagefault_disable();
type = kmap_atomic_idx_push();
idx = type + KM_TYPE_NR * smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
arch_flush_lazy_mmu_mode();
return (void *)vaddr;
}
/*
* Map 'pfn' using protections 'prot'
*/
void __iomem *
iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
/*
* For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
* PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
* MTRR is UC or WC. UC_MINUS gets the real intention, of the
* user, which is "WC if the MTRR is WC, UC if you can't do that."
*/
if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
prot = PAGE_KERNEL_UC_MINUS;
return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
}
EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
void
iounmap_atomic(void __iomem *kvaddr)
{
unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
int idx, type;
type = kmap_atomic_idx();
idx = type + KM_TYPE_NR * smp_processor_id();
#ifdef CONFIG_DEBUG_HIGHMEM
WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
#endif
/*
* Force other mappings to Oops if they'll try to access this
* pte without first remap it. Keeping stale mappings around
* is a bad idea also, in case the page changes cacheability
* attributes or becomes a protected page in a hypervisor.
*/
kpte_clear_flush(kmap_pte-idx, vaddr);
kmap_atomic_idx_pop();
}
pagefault_enable();
}
EXPORT_SYMBOL_GPL(iounmap_atomic);

623
arch/x86/mm/ioremap.c Normal file
View File

@@ -0,0 +1,623 @@
/*
* Re-map IO memory to kernel address space so that we can access it.
* This is needed for high PCI addresses that aren't mapped in the
* 640k-1MB IO memory area on PC's
*
* (C) Copyright 1995 1996 Linus Torvalds
*/
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mmiotrace.h>
#include <asm/cacheflush.h>
#include <asm/e820.h>
#include <asm/fixmap.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/pat.h>
#include "physaddr.h"
/*
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
*/
int ioremap_change_attr(unsigned long vaddr, unsigned long size,
unsigned long prot_val)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
err = _set_memory_uc(vaddr, nrpages);
break;
case _PAGE_CACHE_WC:
err = _set_memory_wc(vaddr, nrpages);
break;
case _PAGE_CACHE_WB:
err = _set_memory_wb(vaddr, nrpages);
break;
}
return err;
}
/*
* Remap an arbitrary physical address space into the kernel virtual
* address space. Needed when the kernel wants to access high addresses
* directly.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
static void __iomem *__ioremap_caller(resource_size_t phys_addr,
unsigned long size, unsigned long prot_val, void *caller)
{
unsigned long offset, vaddr;
resource_size_t pfn, last_pfn, last_addr;
const resource_size_t unaligned_phys_addr = phys_addr;
const unsigned long unaligned_size = size;
struct vm_struct *area;
unsigned long new_prot_val;
pgprot_t prot;
int retval;
void __iomem *ret_addr;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
if (!phys_addr_valid(phys_addr)) {
printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
(unsigned long long)phys_addr);
WARN_ON_ONCE(1);
return NULL;
}
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
if (is_ISA_range(phys_addr, last_addr))
return (__force void __iomem *)phys_to_virt(phys_addr);
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
last_pfn = last_addr >> PAGE_SHIFT;
for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) {
int is_ram = page_is_ram(pfn);
if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
return NULL;
WARN_ON_ONCE(is_ram);
}
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PHYSICAL_PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
prot_val, &new_prot_val);
if (retval) {
printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
return NULL;
}
if (prot_val != new_prot_val) {
if (!is_new_memtype_allowed(phys_addr, size,
prot_val, new_prot_val)) {
printk(KERN_ERR
"ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
(unsigned long long)phys_addr,
(unsigned long long)(phys_addr + size),
prot_val, new_prot_val);
goto err_free_memtype;
}
prot_val = new_prot_val;
}
switch (prot_val) {
case _PAGE_CACHE_UC:
default:
prot = PAGE_KERNEL_IO_NOCACHE;
break;
case _PAGE_CACHE_UC_MINUS:
prot = PAGE_KERNEL_IO_UC_MINUS;
break;
case _PAGE_CACHE_WC:
prot = PAGE_KERNEL_IO_WC;
break;
case _PAGE_CACHE_WB:
prot = PAGE_KERNEL_IO;
break;
}
/*
* Ok, go for it..
*/
area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
goto err_free_memtype;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (kernel_map_sync_memtype(phys_addr, size, prot_val))
goto err_free_area;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
goto err_free_area;
ret_addr = (void __iomem *) (vaddr + offset);
mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
/*
* Check if the request spans more than any BAR in the iomem resource
* tree.
*/
WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
return ret_addr;
err_free_area:
free_vm_area(area);
err_free_memtype:
free_memtype(phys_addr, phys_addr + size);
return NULL;
}
/**
* ioremap_nocache - map bus memory into CPU space
* @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* ioremap_nocache performs a platform specific sequence of operations to
* make bus memory CPU accessible via the readb/readw/readl/writeb/
* writew/writel functions and the other mmio helpers. The returned
* address is not guaranteed to be usable directly as a virtual
* address.
*
* This version of ioremap ensures that the memory is marked uncachable
* on the CPU as well as honouring existing caching rules from things like
* the PCI bus. Note that there are other caches and buffers on many
* busses. In particular driver authors should read up on PCI writes
*
* It's useful if some control registers are in such an area and
* write combining or read caching is not desirable:
*
* Must be freed with iounmap.
*/
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
* pat_enabled ? _PAGE_CACHE_UC : _PAGE_CACHE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
* UC MINUS.
*/
unsigned long val = _PAGE_CACHE_UC_MINUS;
return __ioremap_caller(phys_addr, size, val,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
/**
* ioremap_wc - map memory into CPU space write combined
* @phys_addr: bus address of the memory
* @size: size of the resource to map
*
* This version of ioremap ensures that the memory is marked write combining.
* Write combining allows faster writes to some hardware devices.
*
* Must be freed with iounmap.
*/
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
if (pat_enabled)
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
__builtin_return_address(0));
else
return ioremap_nocache(phys_addr, size);
}
EXPORT_SYMBOL(ioremap_wc);
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
unsigned long prot_val)
{
return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
__builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_prot);
/**
* iounmap - Free a IO remapping
* @addr: virtual address from ioremap_*
*
* Caller must ensure there is only one unmapping for the same pointer.
*/
void iounmap(volatile void __iomem *addr)
{
struct vm_struct *p, *o;
if ((void __force *)addr <= high_memory)
return;
/*
* __ioremap special-cases the PCI/ISA range by not instantiating a
* vm_area and by simply returning an address into the kernel mapping
* of ISA space. So handle that here.
*/
if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) &&
(void __force *)addr < phys_to_virt(ISA_END_ADDRESS))
return;
addr = (volatile void __iomem *)
(PAGE_MASK & (unsigned long __force)addr);
mmiotrace_iounmap(addr);
/* Use the vm area unlocked, assuming the caller
ensures there isn't another iounmap for the same address
in parallel. Reuse of the virtual address is prevented by
leaving it in the global lists until we're done with it.
cpa takes care of the direct mappings. */
p = find_vm_area((void __force *)addr);
if (!p) {
printk(KERN_ERR "iounmap: bad address %p\n", addr);
dump_stack();
return;
}
free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
o = remove_vm_area((void __force *)addr);
BUG_ON(p != o || o == NULL);
kfree(p);
}
EXPORT_SYMBOL(iounmap);
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
*/
void *xlate_dev_mem_ptr(unsigned long phys)
{
void *addr;
unsigned long start = phys & PAGE_MASK;
/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
if (addr)
addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
return addr;
}
void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
{
if (page_is_ram(phys >> PAGE_SHIFT))
return;
iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
return;
}
static int __initdata early_ioremap_debug;
static int __init early_ioremap_debug_setup(char *str)
{
early_ioremap_debug = 1;
return 0;
}
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
/* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(addr)];
pud_t *pud = pud_offset(pgd, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;
}
static inline pte_t * __init early_ioremap_pte(unsigned long addr)
{
return &bm_pte[pte_index(addr)];
}
bool __init is_early_ioremap_ptep(pte_t *ptep)
{
return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
}
static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
void __init early_ioremap_init(void)
{
pmd_t *pmd;
int i;
if (early_ioremap_debug)
printk(KERN_INFO "early_ioremap_init()\n");
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
memset(bm_pte, 0, sizeof(bm_pte));
pmd_populate_kernel(&init_mm, pmd, bm_pte);
/*
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
#define __FIXADDR_TOP (-PAGE_SIZE)
BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
#undef __FIXADDR_TOP
if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
printk(KERN_WARNING "pmd %p != %p\n",
pmd, early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END)));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
fix_to_virt(FIX_BTMAP_BEGIN));
printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n",
fix_to_virt(FIX_BTMAP_END));
printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n",
FIX_BTMAP_BEGIN);
}
}
void __init early_ioremap_reset(void)
{
after_paging_init = 1;
}
static void __init __early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
unsigned long addr = __fix_to_virt(idx);
pte_t *pte;
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
pte = early_ioremap_pte(addr);
if (pgprot_val(flags))
set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags));
else
pte_clear(&init_mm, addr, pte);
__flush_tlb_one(addr);
}
static inline void __init early_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t prot)
{
if (after_paging_init)
__set_fixmap(idx, phys, prot);
else
__early_set_fixmap(idx, phys, prot);
}
static inline void __init early_clear_fixmap(enum fixed_addresses idx)
{
if (after_paging_init)
clear_fixmap(idx);
else
__early_set_fixmap(idx, 0, __pgprot(0));
}
static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
void __init fixup_early_ioremap(void)
{
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i]) {
WARN_ON(1);
break;
}
}
early_ioremap_init();
}
static int __init check_early_ioremap_leak(void)
{
int count = 0;
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (prev_map[i])
count++;
if (!count)
return 0;
WARN(1, KERN_WARNING
"Debug warning: early ioremap leak of %d areas detected.\n",
count);
printk(KERN_WARNING
"please boot with early_ioremap_debug and report the dmesg.\n");
return 1;
}
late_initcall(check_early_ioremap_leak);
static void __init __iomem *
__early_ioremap(resource_size_t phys_addr, unsigned long size, pgprot_t prot)
{
unsigned long offset;
resource_size_t last_addr;
unsigned int nrpages;
enum fixed_addresses idx0, idx;
int i, slot;
WARN_ON(system_state != SYSTEM_BOOTING);
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (!prev_map[i]) {
slot = i;
break;
}
}
if (slot < 0) {
printk(KERN_INFO "early_iomap(%08llx, %08lx) not found slot\n",
(u64)phys_addr, size);
WARN_ON(1);
return NULL;
}
if (early_ioremap_debug) {
printk(KERN_INFO "early_ioremap(%08llx, %08lx) [%d] => ",
(u64)phys_addr, size, slot);
dump_stack();
}
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr) {
WARN_ON(1);
return NULL;
}
prev_size[slot] = size;
/*
* Mappings have to be page-aligned
*/
offset = phys_addr & ~PAGE_MASK;
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr + 1) - phys_addr;
/*
* Mappings have to fit in the FIX_BTMAP area.
*/
nrpages = size >> PAGE_SHIFT;
if (nrpages > NR_FIX_BTMAPS) {
WARN_ON(1);
return NULL;
}
/*
* Ok, go for it..
*/
idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
idx = idx0;
while (nrpages > 0) {
early_set_fixmap(idx, phys_addr, prot);
phys_addr += PAGE_SIZE;
--idx;
--nrpages;
}
if (early_ioremap_debug)
printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
return prev_map[slot];
}
/* Remap an IO device */
void __init __iomem *
early_ioremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
}
/* Remap memory */
void __init __iomem *
early_memremap(resource_size_t phys_addr, unsigned long size)
{
return __early_ioremap(phys_addr, size, PAGE_KERNEL);
}
void __init early_iounmap(void __iomem *addr, unsigned long size)
{
unsigned long virt_addr;
unsigned long offset;
unsigned int nrpages;
enum fixed_addresses idx;
int i, slot;
slot = -1;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
if (prev_map[i] == addr) {
slot = i;
break;
}
}
if (slot < 0) {
printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
addr, size);
WARN_ON(1);
return;
}
if (prev_size[slot] != size) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
addr, size, slot, prev_size[slot]);
WARN_ON(1);
return;
}
if (early_ioremap_debug) {
printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
size, slot);
dump_stack();
}
virt_addr = (unsigned long)addr;
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) {
WARN_ON(1);
return;
}
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
while (nrpages > 0) {
early_clear_fixmap(idx);
--idx;
--nrpages;
}
prev_map[slot] = NULL;
}

View File

@@ -0,0 +1 @@
obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o

View File

@@ -0,0 +1,227 @@
#include <linux/interrupt.h>
#include <linux/kdebug.h>
#include <linux/kmemcheck.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
#include "error.h"
#include "shadow.h"
enum kmemcheck_error_type {
KMEMCHECK_ERROR_INVALID_ACCESS,
KMEMCHECK_ERROR_BUG,
};
#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
struct kmemcheck_error {
enum kmemcheck_error_type type;
union {
/* KMEMCHECK_ERROR_INVALID_ACCESS */
struct {
/* Kind of access that caused the error */
enum kmemcheck_shadow state;
/* Address and size of the erroneous read */
unsigned long address;
unsigned int size;
};
};
struct pt_regs regs;
struct stack_trace trace;
unsigned long trace_entries[32];
/* We compress it to a char. */
unsigned char shadow_copy[SHADOW_COPY_SIZE];
unsigned char memory_copy[SHADOW_COPY_SIZE];
};
/*
* Create a ring queue of errors to output. We can't call printk() directly
* from the kmemcheck traps, since this may call the console drivers and
* result in a recursive fault.
*/
static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
static unsigned int error_count;
static unsigned int error_rd;
static unsigned int error_wr;
static unsigned int error_missed_count;
static struct kmemcheck_error *error_next_wr(void)
{
struct kmemcheck_error *e;
if (error_count == ARRAY_SIZE(error_fifo)) {
++error_missed_count;
return NULL;
}
e = &error_fifo[error_wr];
if (++error_wr == ARRAY_SIZE(error_fifo))
error_wr = 0;
++error_count;
return e;
}
static struct kmemcheck_error *error_next_rd(void)
{
struct kmemcheck_error *e;
if (error_count == 0)
return NULL;
e = &error_fifo[error_rd];
if (++error_rd == ARRAY_SIZE(error_fifo))
error_rd = 0;
--error_count;
return e;
}
void kmemcheck_error_recall(void)
{
static const char *desc[] = {
[KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
[KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
[KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
[KMEMCHECK_SHADOW_FREED] = "freed",
};
static const char short_desc[] = {
[KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
[KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
[KMEMCHECK_SHADOW_INITIALIZED] = 'i',
[KMEMCHECK_SHADOW_FREED] = 'f',
};
struct kmemcheck_error *e;
unsigned int i;
e = error_next_rd();
if (!e)
return;
switch (e->type) {
case KMEMCHECK_ERROR_INVALID_ACCESS:
printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
8 * e->size, e->state < ARRAY_SIZE(desc) ?
desc[e->state] : "(invalid shadow state)",
(void *) e->address);
printk(KERN_WARNING);
for (i = 0; i < SHADOW_COPY_SIZE; ++i)
printk(KERN_CONT "%02x", e->memory_copy[i]);
printk(KERN_CONT "\n");
printk(KERN_WARNING);
for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
else
printk(KERN_CONT " ?");
}
printk(KERN_CONT "\n");
printk(KERN_WARNING "%*c\n", 2 + 2
* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
break;
case KMEMCHECK_ERROR_BUG:
printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
break;
}
__show_regs(&e->regs, 1);
print_stack_trace(&e->trace, 0);
}
static void do_wakeup(unsigned long data)
{
while (error_count > 0)
kmemcheck_error_recall();
if (error_missed_count > 0) {
printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
"the queue was too small\n", error_missed_count);
error_missed_count = 0;
}
}
static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
/*
* Save the context of an error report.
*/
void kmemcheck_error_save(enum kmemcheck_shadow state,
unsigned long address, unsigned int size, struct pt_regs *regs)
{
static unsigned long prev_ip;
struct kmemcheck_error *e;
void *shadow_copy;
void *memory_copy;
/* Don't report several adjacent errors from the same EIP. */
if (regs->ip == prev_ip)
return;
prev_ip = regs->ip;
e = error_next_wr();
if (!e)
return;
e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
e->state = state;
e->address = address;
e->size = size;
/* Save regs */
memcpy(&e->regs, regs, sizeof(*regs));
/* Save stack trace */
e->trace.nr_entries = 0;
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
save_stack_trace_regs(regs, &e->trace);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
& ~(SHADOW_COPY_SIZE - 1));
BUG_ON(!shadow_copy);
memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
kmemcheck_show_addr(address);
memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
kmemcheck_hide_addr(address);
tasklet_hi_schedule_first(&kmemcheck_tasklet);
}
/*
* Save the context of a kmemcheck bug.
*/
void kmemcheck_error_save_bug(struct pt_regs *regs)
{
struct kmemcheck_error *e;
e = error_next_wr();
if (!e)
return;
e->type = KMEMCHECK_ERROR_BUG;
memcpy(&e->regs, regs, sizeof(*regs));
e->trace.nr_entries = 0;
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 1;
save_stack_trace(&e->trace);
tasklet_hi_schedule_first(&kmemcheck_tasklet);
}

View File

@@ -0,0 +1,15 @@
#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
#define ARCH__X86__MM__KMEMCHECK__ERROR_H
#include <linux/ptrace.h>
#include "shadow.h"
void kmemcheck_error_save(enum kmemcheck_shadow state,
unsigned long address, unsigned int size, struct pt_regs *regs);
void kmemcheck_error_save_bug(struct pt_regs *regs);
void kmemcheck_error_recall(void);
#endif

View File

@@ -0,0 +1,653 @@
/**
* kmemcheck - a heavyweight memory checker for the linux kernel
* Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
* (With a lot of help from Ingo Molnar and Pekka Enberg.)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License (version 2) as
* published by the Free Software Foundation.
*/
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/page-flags.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/string.h>
#include <linux/types.h>
#include <asm/cacheflush.h>
#include <asm/kmemcheck.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include "error.h"
#include "opcode.h"
#include "pte.h"
#include "selftest.h"
#include "shadow.h"
#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
# define KMEMCHECK_ENABLED 0
#endif
#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
# define KMEMCHECK_ENABLED 1
#endif
#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
# define KMEMCHECK_ENABLED 2
#endif
int kmemcheck_enabled = KMEMCHECK_ENABLED;
int __init kmemcheck_init(void)
{
#ifdef CONFIG_SMP
/*
* Limit SMP to use a single CPU. We rely on the fact that this code
* runs before SMP is set up.
*/
if (setup_max_cpus > 1) {
printk(KERN_INFO
"kmemcheck: Limiting number of CPUs to 1.\n");
setup_max_cpus = 1;
}
#endif
if (!kmemcheck_selftest()) {
printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
kmemcheck_enabled = 0;
return -EINVAL;
}
printk(KERN_INFO "kmemcheck: Initialized\n");
return 0;
}
early_initcall(kmemcheck_init);
/*
* We need to parse the kmemcheck= option before any memory is allocated.
*/
static int __init param_kmemcheck(char *str)
{
if (!str)
return -EINVAL;
sscanf(str, "%d", &kmemcheck_enabled);
return 0;
}
early_param("kmemcheck", param_kmemcheck);
int kmemcheck_show_addr(unsigned long address)
{
pte_t *pte;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return 0;
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
__flush_tlb_one(address);
return 1;
}
int kmemcheck_hide_addr(unsigned long address)
{
pte_t *pte;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return 0;
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
__flush_tlb_one(address);
return 1;
}
struct kmemcheck_context {
bool busy;
int balance;
/*
* There can be at most two memory operands to an instruction, but
* each address can cross a page boundary -- so we may need up to
* four addresses that must be hidden/revealed for each fault.
*/
unsigned long addr[4];
unsigned long n_addrs;
unsigned long flags;
/* Data size of the instruction that caused a fault. */
unsigned int size;
};
static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
bool kmemcheck_active(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
return data->balance > 0;
}
/* Save an address that needs to be shown/hidden */
static void kmemcheck_save_addr(unsigned long addr)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
data->addr[data->n_addrs++] = addr;
}
static unsigned int kmemcheck_show_all(void)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
unsigned int i;
unsigned int n;
n = 0;
for (i = 0; i < data->n_addrs; ++i)
n += kmemcheck_show_addr(data->addr[i]);
return n;
}
static unsigned int kmemcheck_hide_all(void)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
unsigned int i;
unsigned int n;
n = 0;
for (i = 0; i < data->n_addrs; ++i)
n += kmemcheck_hide_addr(data->addr[i]);
return n;
}
/*
* Called from the #PF handler.
*/
void kmemcheck_show(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
BUG_ON(!irqs_disabled());
if (unlikely(data->balance != 0)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
data->balance = 0;
return;
}
/*
* None of the addresses actually belonged to kmemcheck. Note that
* this is not an error.
*/
if (kmemcheck_show_all() == 0)
return;
++data->balance;
/*
* The IF needs to be cleared as well, so that the faulting
* instruction can run "uninterrupted". Otherwise, we might take
* an interrupt and start executing that before we've had a chance
* to hide the page again.
*
* NOTE: In the rare case of multiple faults, we must not override
* the original flags:
*/
if (!(regs->flags & X86_EFLAGS_TF))
data->flags = regs->flags;
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
}
/*
* Called from the #DB handler.
*/
void kmemcheck_hide(struct pt_regs *regs)
{
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
int n;
BUG_ON(!irqs_disabled());
if (unlikely(data->balance != 1)) {
kmemcheck_show_all();
kmemcheck_error_save_bug(regs);
data->n_addrs = 0;
data->balance = 0;
if (!(data->flags & X86_EFLAGS_TF))
regs->flags &= ~X86_EFLAGS_TF;
if (data->flags & X86_EFLAGS_IF)
regs->flags |= X86_EFLAGS_IF;
return;
}
if (kmemcheck_enabled)
n = kmemcheck_hide_all();
else
n = kmemcheck_show_all();
if (n == 0)
return;
--data->balance;
data->n_addrs = 0;
if (!(data->flags & X86_EFLAGS_TF))
regs->flags &= ~X86_EFLAGS_TF;
if (data->flags & X86_EFLAGS_IF)
regs->flags |= X86_EFLAGS_IF;
}
void kmemcheck_show_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i) {
unsigned long address;
pte_t *pte;
unsigned int level;
address = (unsigned long) page_address(&p[i]);
pte = lookup_address(address, &level);
BUG_ON(!pte);
BUG_ON(level != PG_LEVEL_4K);
set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
__flush_tlb_one(address);
}
}
bool kmemcheck_page_is_tracked(struct page *p)
{
/* This will also check the "hidden" flag of the PTE. */
return kmemcheck_pte_lookup((unsigned long) page_address(p));
}
void kmemcheck_hide_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i) {
unsigned long address;
pte_t *pte;
unsigned int level;
address = (unsigned long) page_address(&p[i]);
pte = lookup_address(address, &level);
BUG_ON(!pte);
BUG_ON(level != PG_LEVEL_4K);
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
__flush_tlb_one(address);
}
}
/* Access may NOT cross page boundary */
static void kmemcheck_read_strict(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
void *shadow;
enum kmemcheck_shadow status;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return;
kmemcheck_save_addr(addr);
status = kmemcheck_shadow_test(shadow, size);
if (status == KMEMCHECK_SHADOW_INITIALIZED)
return;
if (kmemcheck_enabled)
kmemcheck_error_save(status, addr, size, regs);
if (kmemcheck_enabled == 2)
kmemcheck_enabled = 0;
/* Don't warn about it again. */
kmemcheck_shadow_set(shadow, size);
}
bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
{
enum kmemcheck_shadow status;
void *shadow;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return true;
status = kmemcheck_shadow_test_all(shadow, size);
return status == KMEMCHECK_SHADOW_INITIALIZED;
}
/* Access may cross page boundary */
static void kmemcheck_read(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
unsigned long page = addr & PAGE_MASK;
unsigned long next_addr = addr + size - 1;
unsigned long next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
kmemcheck_read_strict(regs, addr, size);
return;
}
/*
* What we do is basically to split the access across the
* two pages and handle each part separately. Yes, this means
* that we may now see reads that are 3 + 5 bytes, for
* example (and if both are uninitialized, there will be two
* reports), but it makes the code a lot simpler.
*/
kmemcheck_read_strict(regs, addr, next_page - addr);
kmemcheck_read_strict(regs, next_page, next_addr - next_page);
}
static void kmemcheck_write_strict(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
void *shadow;
shadow = kmemcheck_shadow_lookup(addr);
if (!shadow)
return;
kmemcheck_save_addr(addr);
kmemcheck_shadow_set(shadow, size);
}
static void kmemcheck_write(struct pt_regs *regs,
unsigned long addr, unsigned int size)
{
unsigned long page = addr & PAGE_MASK;
unsigned long next_addr = addr + size - 1;
unsigned long next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
kmemcheck_write_strict(regs, addr, size);
return;
}
/* See comment in kmemcheck_read(). */
kmemcheck_write_strict(regs, addr, next_page - addr);
kmemcheck_write_strict(regs, next_page, next_addr - next_page);
}
/*
* Copying is hard. We have two addresses, each of which may be split across
* a page (and each page will have different shadow addresses).
*/
static void kmemcheck_copy(struct pt_regs *regs,
unsigned long src_addr, unsigned long dst_addr, unsigned int size)
{
uint8_t shadow[8];
enum kmemcheck_shadow status;
unsigned long page;
unsigned long next_addr;
unsigned long next_page;
uint8_t *x;
unsigned int i;
unsigned int n;
BUG_ON(size > sizeof(shadow));
page = src_addr & PAGE_MASK;
next_addr = src_addr + size - 1;
next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
/* Same page */
x = kmemcheck_shadow_lookup(src_addr);
if (x) {
kmemcheck_save_addr(src_addr);
for (i = 0; i < size; ++i)
shadow[i] = x[i];
} else {
for (i = 0; i < size; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
} else {
n = next_page - src_addr;
BUG_ON(n > sizeof(shadow));
/* First page */
x = kmemcheck_shadow_lookup(src_addr);
if (x) {
kmemcheck_save_addr(src_addr);
for (i = 0; i < n; ++i)
shadow[i] = x[i];
} else {
/* Not tracked */
for (i = 0; i < n; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
/* Second page */
x = kmemcheck_shadow_lookup(next_page);
if (x) {
kmemcheck_save_addr(next_page);
for (i = n; i < size; ++i)
shadow[i] = x[i - n];
} else {
/* Not tracked */
for (i = n; i < size; ++i)
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
page = dst_addr & PAGE_MASK;
next_addr = dst_addr + size - 1;
next_page = next_addr & PAGE_MASK;
if (likely(page == next_page)) {
/* Same page */
x = kmemcheck_shadow_lookup(dst_addr);
if (x) {
kmemcheck_save_addr(dst_addr);
for (i = 0; i < size; ++i) {
x[i] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
} else {
n = next_page - dst_addr;
BUG_ON(n > sizeof(shadow));
/* First page */
x = kmemcheck_shadow_lookup(dst_addr);
if (x) {
kmemcheck_save_addr(dst_addr);
for (i = 0; i < n; ++i) {
x[i] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
/* Second page */
x = kmemcheck_shadow_lookup(next_page);
if (x) {
kmemcheck_save_addr(next_page);
for (i = n; i < size; ++i) {
x[i - n] = shadow[i];
shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
}
}
}
status = kmemcheck_shadow_test(shadow, size);
if (status == KMEMCHECK_SHADOW_INITIALIZED)
return;
if (kmemcheck_enabled)
kmemcheck_error_save(status, src_addr, size, regs);
if (kmemcheck_enabled == 2)
kmemcheck_enabled = 0;
}
enum kmemcheck_method {
KMEMCHECK_READ,
KMEMCHECK_WRITE,
};
static void kmemcheck_access(struct pt_regs *regs,
unsigned long fallback_address, enum kmemcheck_method fallback_method)
{
const uint8_t *insn;
const uint8_t *insn_primary;
unsigned int size;
struct kmemcheck_context *data = &__get_cpu_var(kmemcheck_context);
/* Recursive fault -- ouch. */
if (data->busy) {
kmemcheck_show_addr(fallback_address);
kmemcheck_error_save_bug(regs);
return;
}
data->busy = true;
insn = (const uint8_t *) regs->ip;
insn_primary = kmemcheck_opcode_get_primary(insn);
kmemcheck_opcode_decode(insn, &size);
switch (insn_primary[0]) {
#ifdef CONFIG_KMEMCHECK_BITOPS_OK
/* AND, OR, XOR */
/*
* Unfortunately, these instructions have to be excluded from
* our regular checking since they access only some (and not
* all) bits. This clears out "bogus" bitfield-access warnings.
*/
case 0x80:
case 0x81:
case 0x82:
case 0x83:
switch ((insn_primary[1] >> 3) & 7) {
/* OR */
case 1:
/* AND */
case 4:
/* XOR */
case 6:
kmemcheck_write(regs, fallback_address, size);
goto out;
/* ADD */
case 0:
/* ADC */
case 2:
/* SBB */
case 3:
/* SUB */
case 5:
/* CMP */
case 7:
break;
}
break;
#endif
/* MOVS, MOVSB, MOVSW, MOVSD */
case 0xa4:
case 0xa5:
/*
* These instructions are special because they take two
* addresses, but we only get one page fault.
*/
kmemcheck_copy(regs, regs->si, regs->di, size);
goto out;
/* CMPS, CMPSB, CMPSW, CMPSD */
case 0xa6:
case 0xa7:
kmemcheck_read(regs, regs->si, size);
kmemcheck_read(regs, regs->di, size);
goto out;
}
/*
* If the opcode isn't special in any way, we use the data from the
* page fault handler to determine the address and type of memory
* access.
*/
switch (fallback_method) {
case KMEMCHECK_READ:
kmemcheck_read(regs, fallback_address, size);
goto out;
case KMEMCHECK_WRITE:
kmemcheck_write(regs, fallback_address, size);
goto out;
}
out:
data->busy = false;
}
bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
unsigned long error_code)
{
pte_t *pte;
/*
* XXX: Is it safe to assume that memory accesses from virtual 86
* mode or non-kernel code segments will _never_ access kernel
* memory (e.g. tracked pages)? For now, we need this to avoid
* invoking kmemcheck for PnP BIOS calls.
*/
if (regs->flags & X86_VM_MASK)
return false;
if (regs->cs != __KERNEL_CS)
return false;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return false;
WARN_ON_ONCE(in_nmi());
if (error_code & 2)
kmemcheck_access(regs, address, KMEMCHECK_WRITE);
else
kmemcheck_access(regs, address, KMEMCHECK_READ);
kmemcheck_show(regs);
return true;
}
bool kmemcheck_trap(struct pt_regs *regs)
{
if (!kmemcheck_active(regs))
return false;
/* We're done. */
kmemcheck_hide(regs);
return true;
}

View File

@@ -0,0 +1,106 @@
#include <linux/types.h>
#include "opcode.h"
static bool opcode_is_prefix(uint8_t b)
{
return
/* Group 1 */
b == 0xf0 || b == 0xf2 || b == 0xf3
/* Group 2 */
|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
|| b == 0x64 || b == 0x65
/* Group 3 */
|| b == 0x66
/* Group 4 */
|| b == 0x67;
}
#ifdef CONFIG_X86_64
static bool opcode_is_rex_prefix(uint8_t b)
{
return (b & 0xf0) == 0x40;
}
#else
static bool opcode_is_rex_prefix(uint8_t b)
{
return false;
}
#endif
#define REX_W (1 << 3)
/*
* This is a VERY crude opcode decoder. We only need to find the size of the
* load/store that caused our #PF and this should work for all the opcodes
* that we care about. Moreover, the ones who invented this instruction set
* should be shot.
*/
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
{
/* Default operand size */
int operand_size_override = 4;
/* prefixes */
for (; opcode_is_prefix(*op); ++op) {
if (*op == 0x66)
operand_size_override = 2;
}
/* REX prefix */
if (opcode_is_rex_prefix(*op)) {
uint8_t rex = *op;
++op;
if (rex & REX_W) {
switch (*op) {
case 0x63:
*size = 4;
return;
case 0x0f:
++op;
switch (*op) {
case 0xb6:
case 0xbe:
*size = 1;
return;
case 0xb7:
case 0xbf:
*size = 2;
return;
}
break;
}
*size = 8;
return;
}
}
/* escape opcode */
if (*op == 0x0f) {
++op;
/*
* This is move with zero-extend and sign-extend, respectively;
* we don't have to think about 0xb6/0xbe, because this is
* already handled in the conditional below.
*/
if (*op == 0xb7 || *op == 0xbf)
operand_size_override = 2;
}
*size = (*op & 1) ? operand_size_override : 1;
}
const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
{
/* skip prefixes */
while (opcode_is_prefix(*op))
++op;
if (opcode_is_rex_prefix(*op))
++op;
return op;
}

View File

@@ -0,0 +1,9 @@
#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
#include <linux/types.h>
void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
#endif

View File

@@ -0,0 +1,22 @@
#include <linux/mm.h>
#include <asm/pgtable.h>
#include "pte.h"
pte_t *kmemcheck_pte_lookup(unsigned long address)
{
pte_t *pte;
unsigned int level;
pte = lookup_address(address, &level);
if (!pte)
return NULL;
if (level != PG_LEVEL_4K)
return NULL;
if (!pte_hidden(*pte))
return NULL;
return pte;
}

View File

@@ -0,0 +1,10 @@
#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
#define ARCH__X86__MM__KMEMCHECK__PTE_H
#include <linux/mm.h>
#include <asm/pgtable.h>
pte_t *kmemcheck_pte_lookup(unsigned long address);
#endif

View File

@@ -0,0 +1,70 @@
#include <linux/bug.h>
#include <linux/kernel.h>
#include "opcode.h"
#include "selftest.h"
struct selftest_opcode {
unsigned int expected_size;
const uint8_t *insn;
const char *desc;
};
static const struct selftest_opcode selftest_opcodes[] = {
/* REP MOVS */
{1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
{4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
/* MOVZX / MOVZXD */
{1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
{1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
/* MOVSX / MOVSXD */
{1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
{1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
#ifdef CONFIG_X86_64
/* MOVZX / MOVZXD */
{1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
{2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
/* MOVSX / MOVSXD */
{1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
{2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
{4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
#endif
};
static bool selftest_opcode_one(const struct selftest_opcode *op)
{
unsigned size;
kmemcheck_opcode_decode(op->insn, &size);
if (size == op->expected_size)
return true;
printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
op->desc, op->expected_size, size);
return false;
}
static bool selftest_opcodes_all(void)
{
bool pass = true;
unsigned int i;
for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
return pass;
}
bool kmemcheck_selftest(void)
{
bool pass = true;
pass = pass && selftest_opcodes_all();
return pass;
}

View File

@@ -0,0 +1,6 @@
#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
bool kmemcheck_selftest(void);
#endif

View File

@@ -0,0 +1,173 @@
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include "pte.h"
#include "shadow.h"
/*
* Return the shadow address for the given address. Returns NULL if the
* address is not tracked.
*
* We need to be extremely careful not to follow any invalid pointers,
* because this function can be called for *any* possible address.
*/
void *kmemcheck_shadow_lookup(unsigned long address)
{
pte_t *pte;
struct page *page;
if (!virt_addr_valid(address))
return NULL;
pte = kmemcheck_pte_lookup(address);
if (!pte)
return NULL;
page = virt_to_page(address);
if (!page->shadow)
return NULL;
return page->shadow + (address & (PAGE_SIZE - 1));
}
static void mark_shadow(void *address, unsigned int n,
enum kmemcheck_shadow status)
{
unsigned long addr = (unsigned long) address;
unsigned long last_addr = addr + n - 1;
unsigned long page = addr & PAGE_MASK;
unsigned long last_page = last_addr & PAGE_MASK;
unsigned int first_n;
void *shadow;
/* If the memory range crosses a page boundary, stop there. */
if (page == last_page)
first_n = n;
else
first_n = page + PAGE_SIZE - addr;
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, first_n);
addr += first_n;
n -= first_n;
/* Do full-page memset()s. */
while (n >= PAGE_SIZE) {
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, PAGE_SIZE);
addr += PAGE_SIZE;
n -= PAGE_SIZE;
}
/* Do the remaining page, if any. */
if (n > 0) {
shadow = kmemcheck_shadow_lookup(addr);
if (shadow)
memset(shadow, status, n);
}
}
void kmemcheck_mark_unallocated(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
}
void kmemcheck_mark_uninitialized(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
}
/*
* Fill the shadow memory of the given address such that the memory at that
* address is marked as being initialized.
*/
void kmemcheck_mark_initialized(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
}
EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
void kmemcheck_mark_freed(void *address, unsigned int n)
{
mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
}
void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
}
void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
}
void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
{
unsigned int i;
for (i = 0; i < n; ++i)
kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
}
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
{
#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
uint8_t *x;
unsigned int i;
x = shadow;
/*
* Make sure _some_ bytes are initialized. Gcc frequently generates
* code to access neighboring bytes.
*/
for (i = 0; i < size; ++i) {
if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
return x[0];
#else
return kmemcheck_shadow_test_all(shadow, size);
#endif
}
enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
{
uint8_t *x;
unsigned int i;
x = shadow;
/* All bytes must be initialized. */
for (i = 0; i < size; ++i) {
if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
return x[i];
}
return x[0];
}
void kmemcheck_shadow_set(void *shadow, unsigned int size)
{
uint8_t *x;
unsigned int i;
x = shadow;
for (i = 0; i < size; ++i)
x[i] = KMEMCHECK_SHADOW_INITIALIZED;
}

View File

@@ -0,0 +1,18 @@
#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
enum kmemcheck_shadow {
KMEMCHECK_SHADOW_UNALLOCATED,
KMEMCHECK_SHADOW_UNINITIALIZED,
KMEMCHECK_SHADOW_INITIALIZED,
KMEMCHECK_SHADOW_FREED,
};
void *kmemcheck_shadow_lookup(unsigned long address);
enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
unsigned int size);
void kmemcheck_shadow_set(void *shadow, unsigned int size);
#endif

590
arch/x86/mm/kmmio.c Normal file
View File

@@ -0,0 +1,590 @@
/* Support for MMIO probes.
* Benfit many code from kprobes
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
* 2007 Alexander Eichner
* 2008 Pekka Paalanen <pq@iki.fi>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>
#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)
struct kmmio_fault_page {
struct list_head list;
struct kmmio_fault_page *release_next;
unsigned long page; /* location of the fault page */
pteval_t old_presence; /* page presence prior to arming */
bool armed;
/*
* Number of times this page has been registered as a part
* of a probe. If zero, page is disarmed and this may be freed.
* Used only by writers (RCU) and post_kmmio_handler().
* Protected by kmmio_lock, when linked into kmmio_page_table.
*/
int count;
bool scheduled_for_release;
};
struct kmmio_delayed_release {
struct rcu_head rcu;
struct kmmio_fault_page *release_list;
};
struct kmmio_context {
struct kmmio_fault_page *fpage;
struct kmmio_probe *probe;
unsigned long saved_flags;
unsigned long addr;
int active;
};
static DEFINE_SPINLOCK(kmmio_lock);
/* Protected by kmmio_lock */
unsigned int kmmio_count;
/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);
static struct list_head *kmmio_page_list(unsigned long page)
{
return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
}
/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);
/*
* this is basically a dynamic stabbing problem:
* Could use the existing prio tree code or
* Possible better implementations:
* The Interval Skip List: A Data Structure for Finding All Intervals That
* Overlap a Point (might be simple)
* Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
*/
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
struct kmmio_probe *p;
list_for_each_entry_rcu(p, &kmmio_probes, list) {
if (addr >= p->addr && addr < (p->addr + p->len))
return p;
}
return NULL;
}
/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
{
struct list_head *head;
struct kmmio_fault_page *f;
page &= PAGE_MASK;
head = kmmio_page_list(page);
list_for_each_entry_rcu(f, head, list) {
if (f->page == page)
return f;
}
return NULL;
}
static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
pmdval_t v = pmd_val(*pmd);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pmd(pmd, __pmd(v));
}
static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
pteval_t v = pte_val(*pte);
if (clear) {
*old = v & _PAGE_PRESENT;
v &= ~_PAGE_PRESENT;
} else /* presume this has been called with clear==true previously */
v |= *old;
set_pte_atomic(pte, __pte(v));
}
static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
unsigned int level;
pte_t *pte = lookup_address(f->page, &level);
if (!pte) {
pr_err("no pte for page 0x%08lx\n", f->page);
return -1;
}
switch (level) {
case PG_LEVEL_2M:
clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
break;
case PG_LEVEL_4K:
clear_pte_presence(pte, clear, &f->old_presence);
break;
default:
pr_err("unexpected page level 0x%x.\n", level);
return -1;
}
__flush_tlb_one(f->page);
return 0;
}
/*
* Mark the given page as not present. Access to it will trigger a fault.
*
* Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
* protection is ignored here. RCU read lock is assumed held, so the struct
* will not disappear unexpectedly. Furthermore, the caller must guarantee,
* that double arming the same virtual address (page) cannot occur.
*
* Double disarming on the other hand is allowed, and may occur when a fault
* and mmiotrace shutdown happen simultaneously.
*/
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret;
WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
if (f->armed) {
pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
f->page, f->count, !!f->old_presence);
}
ret = clear_page_presence(f, true);
WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
f->page);
f->armed = true;
return ret;
}
/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
int ret = clear_page_presence(f, false);
WARN_ONCE(ret < 0,
KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
f->armed = false;
}
/*
* This is being called from do_page_fault().
*
* We may be in an interrupt or a critical section. Also prefecthing may
* trigger a page fault. We may be in the middle of process switch.
* We cannot take any locks, because we could be executing especially
* within a kmmio critical section.
*
* Local interrupts are disabled, so preemption cannot happen.
* Do not enable interrupts, do not sleep, and watch out for other CPUs.
*/
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate
* and they remain disabled throughout this function.
*/
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
struct kmmio_context *ctx;
struct kmmio_fault_page *faultpage;
int ret = 0; /* default to fault not handled */
/*
* Preemption is now disabled to prevent process switch during
* single stepping. We can only handle one active kmmio trace
* per cpu, so ensure that we finish it before something else
* gets to run. We also hold the RCU read lock over single
* stepping to avoid looking up the probe and kmmio_fault_page
* again.
*/
preempt_disable();
rcu_read_lock();
faultpage = get_kmmio_fault_page(addr);
if (!faultpage) {
/*
* Either this page fault is not caused by kmmio, or
* another CPU just pulled the kmmio probe from under
* our feet. The latter case should not be possible.
*/
goto no_kmmio;
}
ctx = &get_cpu_var(kmmio_ctx);
if (ctx->active) {
if (addr == ctx->addr) {
/*
* A second fault on the same page means some other
* condition needs handling by do_page_fault(), the
* page really not being present is the most common.
*/
pr_debug("secondary hit for 0x%08lx CPU %d.\n",
addr, smp_processor_id());
if (!faultpage->old_presence)
pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
addr, smp_processor_id());
} else {
/*
* Prevent overwriting already in-flight context.
* This should not happen, let's hope disarming at
* least prevents a panic.
*/
pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
smp_processor_id(), addr);
pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
disarm_kmmio_fault_page(faultpage);
}
goto no_kmmio_ctx;
}
ctx->active++;
ctx->fpage = faultpage;
ctx->probe = get_kmmio_probe(addr);
ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
ctx->addr = addr;
if (ctx->probe && ctx->probe->pre_handler)
ctx->probe->pre_handler(ctx->probe, regs, addr);
/*
* Enable single-stepping and disable interrupts for the faulting
* context. Local interrupts must not get enabled during stepping.
*/
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* Now we set present bit in PTE and single step. */
disarm_kmmio_fault_page(ctx->fpage);
/*
* If another cpu accesses the same page while we are stepping,
* the access will not be caught. It will simply succeed and the
* only downside is we lose the event. If this becomes a problem,
* the user should drop to single cpu before tracing.
*/
put_cpu_var(kmmio_ctx);
return 1; /* fault handled */
no_kmmio_ctx:
put_cpu_var(kmmio_ctx);
no_kmmio:
rcu_read_unlock();
preempt_enable_no_resched();
return ret;
}
/*
* Interrupts are disabled on entry as trap1 is an interrupt gate
* and they remain disabled throughout this function.
* This must always get called as the pair to kmmio_handler().
*/
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
int ret = 0;
struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
if (!ctx->active) {
/*
* debug traps without an active context are due to either
* something external causing them (f.e. using a debugger while
* mmio tracing enabled), or erroneous behaviour
*/
pr_warning("unexpected debug trap on CPU %d.\n",
smp_processor_id());
goto out;
}
if (ctx->probe && ctx->probe->post_handler)
ctx->probe->post_handler(ctx->probe, condition, regs);
/* Prevent racing against release_kmmio_fault_page(). */
spin_lock(&kmmio_lock);
if (ctx->fpage->count)
arm_kmmio_fault_page(ctx->fpage);
spin_unlock(&kmmio_lock);
regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= ctx->saved_flags;
/* These were acquired in kmmio_handler(). */
ctx->active--;
BUG_ON(ctx->active);
rcu_read_unlock();
preempt_enable_no_resched();
/*
* if somebody else is singlestepping across a probe point, flags
* will have TF set, in which case, continue the remaining processing
* of do_debug, as if this is not a probe hit.
*/
if (!(regs->flags & X86_EFLAGS_TF))
ret = 1;
out:
put_cpu_var(kmmio_ctx);
return ret;
}
/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long page)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
if (f) {
if (!f->count)
arm_kmmio_fault_page(f);
f->count++;
return 0;
}
f = kzalloc(sizeof(*f), GFP_ATOMIC);
if (!f)
return -1;
f->count = 1;
f->page = page;
if (arm_kmmio_fault_page(f)) {
kfree(f);
return -1;
}
list_add_rcu(&f->list, kmmio_page_list(f->page));
return 0;
}
/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long page,
struct kmmio_fault_page **release_list)
{
struct kmmio_fault_page *f;
page &= PAGE_MASK;
f = get_kmmio_fault_page(page);
if (!f)
return;
f->count--;
BUG_ON(f->count < 0);
if (!f->count) {
disarm_kmmio_fault_page(f);
if (!f->scheduled_for_release) {
f->release_next = *release_list;
*release_list = f;
f->scheduled_for_release = true;
}
}
}
/*
* With page-unaligned ioremaps, one or two armed pages may contain
* addresses from outside the intended mapping. Events for these addresses
* are currently silently dropped. The events may result only from programming
* mistakes by accessing addresses before the beginning or past the end of a
* mapping.
*/
int register_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
int ret = 0;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
spin_lock_irqsave(&kmmio_lock, flags);
if (get_kmmio_probe(p->addr)) {
ret = -EEXIST;
goto out;
}
kmmio_count++;
list_add_rcu(&p->list, &kmmio_probes);
while (size < size_lim) {
if (add_kmmio_fault_page(p->addr + size))
pr_err("Unable to set page fault.\n");
size += PAGE_SIZE;
}
out:
spin_unlock_irqrestore(&kmmio_lock, flags);
/*
* XXX: What should I do here?
* Here was a call to global_flush_tlb(), but it does not exist
* anymore. It seems it's not needed after all.
*/
return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr = container_of(
head,
struct kmmio_delayed_release,
rcu);
struct kmmio_fault_page *f = dr->release_list;
while (f) {
struct kmmio_fault_page *next = f->release_next;
BUG_ON(f->count);
kfree(f);
f = next;
}
kfree(dr);
}
static void remove_kmmio_fault_pages(struct rcu_head *head)
{
struct kmmio_delayed_release *dr =
container_of(head, struct kmmio_delayed_release, rcu);
struct kmmio_fault_page *f = dr->release_list;
struct kmmio_fault_page **prevp = &dr->release_list;
unsigned long flags;
spin_lock_irqsave(&kmmio_lock, flags);
while (f) {
if (!f->count) {
list_del_rcu(&f->list);
prevp = &f->release_next;
} else {
*prevp = f->release_next;
f->release_next = NULL;
f->scheduled_for_release = false;
}
f = *prevp;
}
spin_unlock_irqrestore(&kmmio_lock, flags);
/* This is the real RCU destroy call. */
call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}
/*
* Remove a kmmio probe. You have to synchronize_rcu() before you can be
* sure that the callbacks will not be called anymore. Only after that
* you may actually release your struct kmmio_probe.
*
* Unregistering a kmmio fault page has three steps:
* 1. release_kmmio_fault_page()
* Disarm the page, wait a grace period to let all faults finish.
* 2. remove_kmmio_fault_pages()
* Remove the pages from kmmio_page_table.
* 3. rcu_free_kmmio_fault_pages()
* Actually free the kmmio_fault_page structs as with RCU.
*/
void unregister_kmmio_probe(struct kmmio_probe *p)
{
unsigned long flags;
unsigned long size = 0;
const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
struct kmmio_fault_page *release_list = NULL;
struct kmmio_delayed_release *drelease;
spin_lock_irqsave(&kmmio_lock, flags);
while (size < size_lim) {
release_kmmio_fault_page(p->addr + size, &release_list);
size += PAGE_SIZE;
}
list_del_rcu(&p->list);
kmmio_count--;
spin_unlock_irqrestore(&kmmio_lock, flags);
if (!release_list)
return;
drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
if (!drelease) {
pr_crit("leaking kmmio_fault_page objects.\n");
return;
}
drelease->release_list = release_list;
/*
* This is not really RCU here. We have just disarmed a set of
* pages so that they cannot trigger page faults anymore. However,
* we cannot remove the pages from kmmio_page_table,
* because a probe hit might be in flight on another CPU. The
* pages are collected into a list, and they will be removed from
* kmmio_page_table when it is certain that no probe hit related to
* these pages can be in flight. RCU grace period sounds like a
* good choice.
*
* If we removed the pages too early, kmmio page fault handler might
* not find the respective kmmio_fault_page and determine it's not
* a kmmio fault, when it actually is. This would lead to madness.
*/
call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);
static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
struct die_args *arg = args;
unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
/*
* Reset the BS bit in dr6 (pointed by args->err) to
* denote completion of processing
*/
*dr6_p &= ~DR_STEP;
return NOTIFY_STOP;
}
return NOTIFY_DONE;
}
static struct notifier_block nb_die = {
.notifier_call = kmmio_die_notifier
};
int kmmio_init(void)
{
int i;
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
INIT_LIST_HEAD(&kmmio_page_table[i]);
return register_die_notifier(&nb_die);
}
void kmmio_cleanup(void)
{
int i;
unregister_die_notifier(&nb_die);
for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
WARN_ONCE(!list_empty(&kmmio_page_table[i]),
KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
}
}

118
arch/x86/mm/memtest.c Normal file
View File

@@ -0,0 +1,118 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pfn.h>
#include <linux/memblock.h>
static u64 patterns[] __initdata = {
/* The first entry has to be 0 to leave memtest with zeroed memory */
0,
0xffffffffffffffffULL,
0x5555555555555555ULL,
0xaaaaaaaaaaaaaaaaULL,
0x1111111111111111ULL,
0x2222222222222222ULL,
0x4444444444444444ULL,
0x8888888888888888ULL,
0x3333333333333333ULL,
0x6666666666666666ULL,
0x9999999999999999ULL,
0xccccccccccccccccULL,
0x7777777777777777ULL,
0xbbbbbbbbbbbbbbbbULL,
0xddddddddddddddddULL,
0xeeeeeeeeeeeeeeeeULL,
0x7a6c7258554e494cULL, /* yeah ;-) */
};
static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
{
printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
(unsigned long long) pattern,
(unsigned long long) start_bad,
(unsigned long long) end_bad);
memblock_reserve(start_bad, end_bad - start_bad);
}
static void __init memtest(u64 pattern, u64 start_phys, u64 size)
{
u64 *p, *start, *end;
u64 start_bad, last_bad;
u64 start_phys_aligned;
const size_t incr = sizeof(pattern);
start_phys_aligned = ALIGN(start_phys, incr);
start = __va(start_phys_aligned);
end = start + (size - (start_phys_aligned - start_phys)) / incr;
start_bad = 0;
last_bad = 0;
for (p = start; p < end; p++)
*p = pattern;
for (p = start; p < end; p++, start_phys_aligned += incr) {
if (*p == pattern)
continue;
if (start_phys_aligned == last_bad + incr) {
last_bad += incr;
continue;
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
start_bad = last_bad = start_phys_aligned;
}
if (start_bad)
reserve_bad_mem(pattern, start_bad, last_bad + incr);
}
static void __init do_one_pass(u64 pattern, u64 start, u64 end)
{
u64 i;
phys_addr_t this_start, this_end;
for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) {
this_start = clamp_t(phys_addr_t, this_start, start, end);
this_end = clamp_t(phys_addr_t, this_end, start, end);
if (this_start < this_end) {
printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
(unsigned long long)this_start,
(unsigned long long)this_end,
(unsigned long long)cpu_to_be64(pattern));
memtest(pattern, this_start, this_end - this_start);
}
}
}
/* default is disabled */
static int memtest_pattern __initdata;
static int __init parse_memtest(char *arg)
{
if (arg)
memtest_pattern = simple_strtoul(arg, NULL, 0);
else
memtest_pattern = ARRAY_SIZE(patterns);
return 0;
}
early_param("memtest", parse_memtest);
void __init early_memtest(unsigned long start, unsigned long end)
{
unsigned int i;
unsigned int idx = 0;
if (!memtest_pattern)
return;
printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
for (i = memtest_pattern-1; i < UINT_MAX; --i) {
idx = i % ARRAY_SIZE(patterns);
do_one_pass(patterns[idx], start, end);
}
}

19
arch/x86/mm/mm_internal.h Normal file
View File

@@ -0,0 +1,19 @@
#ifndef __X86_MM_INTERNAL_H
#define __X86_MM_INTERNAL_H
void *alloc_low_pages(unsigned int num);
static inline void *alloc_low_page(void)
{
return alloc_low_pages(1);
}
void early_ioremap_page_table_range_init(void);
unsigned long kernel_physical_mapping_init(unsigned long start,
unsigned long end,
unsigned long page_size_mask);
void zone_sizes_init(void);
extern int after_bootmem;
#endif /* __X86_MM_INTERNAL_H */

126
arch/x86/mm/mmap.c Normal file
View File

@@ -0,0 +1,126 @@
/*
* Flexible mmap layout support
*
* Based on code by Ingo Molnar and Andi Kleen, copyrighted
* as follows:
*
* Copyright 2003-2009 Red Hat Inc.
* All Rights Reserved.
* Copyright 2005 Andi Kleen, SUSE Labs.
* Copyright 2007 Jiri Kosina, SUSE Labs.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/personality.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/limits.h>
#include <linux/sched.h>
#include <asm/elf.h>
struct __read_mostly va_alignment va_align = {
.flags = -1,
};
static unsigned long stack_maxrandom_size(void)
{
unsigned long max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
}
return max;
}
/*
* Top of mmap area (just below the process stack).
*
* Leave an at least ~128 MB hole with possible stack randomization.
*/
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
static int mmap_is_legacy(void)
{
if (current->personality & ADDR_COMPAT_LAYOUT)
return 1;
if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
return 1;
return sysctl_legacy_va_layout;
}
static unsigned long mmap_rnd(void)
{
unsigned long rnd = 0;
/*
* 8 bits of randomness in 32bit mmaps, 20 address space bits
* 28 bits of randomness in 64bit mmaps, 40 address space bits
*/
if (current->flags & PF_RANDOMIZE) {
if (mmap_is_ia32())
rnd = get_random_int() % (1<<8);
else
rnd = get_random_int() % (1<<28);
}
return rnd << PAGE_SHIFT;
}
static unsigned long mmap_base(void)
{
unsigned long gap = rlimit(RLIMIT_STACK);
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
* does, but not when emulating X86_32
*/
static unsigned long mmap_legacy_base(void)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
return TASK_UNMAPPED_BASE + mmap_rnd();
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
mm->mmap_legacy_base = mmap_legacy_base();
mm->mmap_base = mmap_base();
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
mm->unmap_area = arch_unmap_area;
} else {
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
mm->unmap_area = arch_unmap_area_topdown;
}
}

480
arch/x86/mm/mmio-mod.c Normal file
View File

@@ -0,0 +1,480 @@
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) IBM Corporation, 2005
* Jeff Muizelaar, 2006, 2007
* Pekka Paalanen, 2008 <pq@iki.fi>
*
* Derived from the read-mod example from relay-examples by Tom Zanussi.
*/
#define pr_fmt(fmt) "mmiotrace: " fmt
#define DEBUG 1
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/kallsyms.h>
#include <asm/pgtable.h>
#include <linux/mmiotrace.h>
#include <asm/e820.h> /* for ISA_START_ADDRESS */
#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include "pf_in.h"
struct trap_reason {
unsigned long addr;
unsigned long ip;
enum reason_type type;
int active_traces;
};
struct remap_trace {
struct list_head list;
struct kmmio_probe probe;
resource_size_t phys;
unsigned long id;
};
/* Accessed per-cpu. */
static DEFINE_PER_CPU(struct trap_reason, pf_reason);
static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
static DEFINE_MUTEX(mmiotrace_mutex);
static DEFINE_SPINLOCK(trace_lock);
static atomic_t mmiotrace_enabled;
static LIST_HEAD(trace_list); /* struct remap_trace */
/*
* Locking in this file:
* - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections.
* - mmiotrace_enabled may be modified only when holding mmiotrace_mutex
* and trace_lock.
* - Routines depending on is_enabled() must take trace_lock.
* - trace_list users must hold trace_lock.
* - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
* - pre/post callbacks assume the effect of is_enabled() being true.
*/
/* module parameters */
static unsigned long filter_offset;
static bool nommiotrace;
static bool trace_pc;
module_param(filter_offset, ulong, 0);
module_param(nommiotrace, bool, 0);
module_param(trace_pc, bool, 0);
MODULE_PARM_DESC(filter_offset, "Start address of traced mappings.");
MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing.");
MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions.");
static bool is_enabled(void)
{
return atomic_read(&mmiotrace_enabled);
}
static void print_pte(unsigned long address)
{
unsigned int level;
pte_t *pte = lookup_address(address, &level);
if (!pte) {
pr_err("Error in %s: no pte for page 0x%08lx\n",
__func__, address);
return;
}
if (level == PG_LEVEL_2M) {
pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
address);
BUG();
}
pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
address,
(unsigned long long)pte_val(*pte),
(unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
}
/*
* For some reason the pre/post pairs have been called in an
* unmatched order. Report and die.
*/
static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
{
const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
addr, my_reason->addr);
print_pte(addr);
print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
#ifdef __i386__
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
regs->si, regs->di, regs->bp, regs->sp);
#else
pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
regs->ax, regs->cx, regs->dx);
pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
regs->si, regs->di, regs->bp, regs->sp);
#endif
put_cpu_var(pf_reason);
BUG();
}
static void pre(struct kmmio_probe *p, struct pt_regs *regs,
unsigned long addr)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
const unsigned long instptr = instruction_pointer(regs);
const enum reason_type type = get_ins_type(instptr);
struct remap_trace *trace = p->private;
/* it doesn't make sense to have more than one active trace per cpu */
if (my_reason->active_traces)
die_kmmio_nesting_error(regs, addr);
else
my_reason->active_traces++;
my_reason->type = type;
my_reason->addr = addr;
my_reason->ip = instptr;
my_trace->phys = addr - trace->probe.addr + trace->phys;
my_trace->map_id = trace->id;
/*
* Only record the program counter when requested.
* It may taint clean-room reverse engineering.
*/
if (trace_pc)
my_trace->pc = instptr;
else
my_trace->pc = 0;
/*
* XXX: the timestamp recorded will be *after* the tracing has been
* done, not at the time we hit the instruction. SMP implications
* on event ordering?
*/
switch (type) {
case REG_READ:
my_trace->opcode = MMIO_READ;
my_trace->width = get_ins_mem_width(instptr);
break;
case REG_WRITE:
my_trace->opcode = MMIO_WRITE;
my_trace->width = get_ins_mem_width(instptr);
my_trace->value = get_ins_reg_val(instptr, regs);
break;
case IMM_WRITE:
my_trace->opcode = MMIO_WRITE;
my_trace->width = get_ins_mem_width(instptr);
my_trace->value = get_ins_imm_val(instptr);
break;
default:
{
unsigned char *ip = (unsigned char *)instptr;
my_trace->opcode = MMIO_UNKNOWN_OP;
my_trace->width = 0;
my_trace->value = (*ip) << 16 | *(ip + 1) << 8 |
*(ip + 2);
}
}
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}
static void post(struct kmmio_probe *p, unsigned long condition,
struct pt_regs *regs)
{
struct trap_reason *my_reason = &get_cpu_var(pf_reason);
struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace);
/* this should always return the active_trace count to 0 */
my_reason->active_traces--;
if (my_reason->active_traces) {
pr_emerg("unexpected post handler");
BUG();
}
switch (my_reason->type) {
case REG_READ:
my_trace->value = get_ins_reg_val(my_reason->ip, regs);
break;
default:
break;
}
mmio_trace_rw(my_trace);
put_cpu_var(cpu_trace);
put_cpu_var(pf_reason);
}
static void ioremap_trace_core(resource_size_t offset, unsigned long size,
void __iomem *addr)
{
static atomic_t next_id;
struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL);
/* These are page-unaligned. */
struct mmiotrace_map map = {
.phys = offset,
.virt = (unsigned long)addr,
.len = size,
.opcode = MMIO_PROBE
};
if (!trace) {
pr_err("kmalloc failed in ioremap\n");
return;
}
*trace = (struct remap_trace) {
.probe = {
.addr = (unsigned long)addr,
.len = size,
.pre_handler = pre,
.post_handler = post,
.private = trace
},
.phys = offset,
.id = atomic_inc_return(&next_id)
};
map.map_id = trace->id;
spin_lock_irq(&trace_lock);
if (!is_enabled()) {
kfree(trace);
goto not_enabled;
}
mmio_trace_mapping(&map);
list_add_tail(&trace->list, &trace_list);
if (!nommiotrace)
register_kmmio_probe(&trace->probe);
not_enabled:
spin_unlock_irq(&trace_lock);
}
void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
void __iomem *addr)
{
if (!is_enabled()) /* recheck and proper locking in *_core() */
return;
pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
(unsigned long long)offset, size, addr);
if ((filter_offset) && (offset != filter_offset))
return;
ioremap_trace_core(offset, size, addr);
}
static void iounmap_trace_core(volatile void __iomem *addr)
{
struct mmiotrace_map map = {
.phys = 0,
.virt = (unsigned long)addr,
.len = 0,
.opcode = MMIO_UNPROBE
};
struct remap_trace *trace;
struct remap_trace *tmp;
struct remap_trace *found_trace = NULL;
pr_debug("Unmapping %p.\n", addr);
spin_lock_irq(&trace_lock);
if (!is_enabled())
goto not_enabled;
list_for_each_entry_safe(trace, tmp, &trace_list, list) {
if ((unsigned long)addr == trace->probe.addr) {
if (!nommiotrace)
unregister_kmmio_probe(&trace->probe);
list_del(&trace->list);
found_trace = trace;
break;
}
}
map.map_id = (found_trace) ? found_trace->id : -1;
mmio_trace_mapping(&map);
not_enabled:
spin_unlock_irq(&trace_lock);
if (found_trace) {
synchronize_rcu(); /* unregister_kmmio_probe() requirement */
kfree(found_trace);
}
}
void mmiotrace_iounmap(volatile void __iomem *addr)
{
might_sleep();
if (is_enabled()) /* recheck and proper locking in *_core() */
iounmap_trace_core(addr);
}
int mmiotrace_printk(const char *fmt, ...)
{
int ret = 0;
va_list args;
unsigned long flags;
va_start(args, fmt);
spin_lock_irqsave(&trace_lock, flags);
if (is_enabled())
ret = mmio_trace_printk(fmt, args);
spin_unlock_irqrestore(&trace_lock, flags);
va_end(args);
return ret;
}
EXPORT_SYMBOL(mmiotrace_printk);
static void clear_trace_list(void)
{
struct remap_trace *trace;
struct remap_trace *tmp;
/*
* No locking required, because the caller ensures we are in a
* critical section via mutex, and is_enabled() is false,
* i.e. nothing can traverse or modify this list.
* Caller also ensures is_enabled() cannot change.
*/
list_for_each_entry(trace, &trace_list, list) {
pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
trace->probe.addr, trace->probe.len);
if (!nommiotrace)
unregister_kmmio_probe(&trace->probe);
}
synchronize_rcu(); /* unregister_kmmio_probe() requirement */
list_for_each_entry_safe(trace, tmp, &trace_list, list) {
list_del(&trace->list);
kfree(trace);
}
}
#ifdef CONFIG_HOTPLUG_CPU
static cpumask_var_t downed_cpus;
static void enter_uniprocessor(void)
{
int cpu;
int err;
if (downed_cpus == NULL &&
!alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
pr_notice("Failed to allocate mask\n");
goto out;
}
get_online_cpus();
cpumask_copy(downed_cpus, cpu_online_mask);
cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
if (num_online_cpus() > 1)
pr_notice("Disabling non-boot CPUs...\n");
put_online_cpus();
for_each_cpu(cpu, downed_cpus) {
err = cpu_down(cpu);
if (!err)
pr_info("CPU%d is down.\n", cpu);
else
pr_err("Error taking CPU%d down: %d\n", cpu, err);
}
out:
if (num_online_cpus() > 1)
pr_warning("multiple CPUs still online, may miss events.\n");
}
/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
static void __ref leave_uniprocessor(void)
{
int cpu;
int err;
if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
return;
pr_notice("Re-enabling CPUs...\n");
for_each_cpu(cpu, downed_cpus) {
err = cpu_up(cpu);
if (!err)
pr_info("enabled CPU%d.\n", cpu);
else
pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
}
}
#else /* !CONFIG_HOTPLUG_CPU */
static void enter_uniprocessor(void)
{
if (num_online_cpus() > 1)
pr_warning("multiple CPUs are online, may miss events. "
"Suggest booting with maxcpus=1 kernel argument.\n");
}
static void leave_uniprocessor(void)
{
}
#endif
void enable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (is_enabled())
goto out;
if (nommiotrace)
pr_info("MMIO tracing disabled.\n");
kmmio_init();
enter_uniprocessor();
spin_lock_irq(&trace_lock);
atomic_inc(&mmiotrace_enabled);
spin_unlock_irq(&trace_lock);
pr_info("enabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}
void disable_mmiotrace(void)
{
mutex_lock(&mmiotrace_mutex);
if (!is_enabled())
goto out;
spin_lock_irq(&trace_lock);
atomic_dec(&mmiotrace_enabled);
BUG_ON(is_enabled());
spin_unlock_irq(&trace_lock);
clear_trace_list(); /* guarantees: no more kmmio callbacks */
leave_uniprocessor();
kmmio_cleanup();
pr_info("disabled.\n");
out:
mutex_unlock(&mmiotrace_mutex);
}

820
arch/x86/mm/numa.c Normal file
View File

@@ -0,0 +1,820 @@
/* Common code for 32 and 64-bit NUMA */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/acpi.h>
#include <asm/amd_nb.h>
#include "numa_internal.h"
int __initdata numa_off;
nodemask_t numa_nodes_parsed __initdata;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
static struct numa_meminfo numa_meminfo
#ifndef CONFIG_MEMORY_HOTPLUG
__initdata
#endif
;
static int numa_distance_cnt;
static u8 *numa_distance;
static __init int numa_setup(char *opt)
{
if (!opt)
return -EINVAL;
if (!strncmp(opt, "off", 3))
numa_off = 1;
#ifdef CONFIG_NUMA_EMU
if (!strncmp(opt, "fake=", 5))
numa_emu_cmdline(opt + 5);
#endif
#ifdef CONFIG_ACPI_NUMA
if (!strncmp(opt, "noacpi", 6))
acpi_numa = -1;
#endif
return 0;
}
early_param("numa", numa_setup);
/*
* apicid, cpu, node mappings
*/
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
int __cpuinit numa_cpu_node(int cpu)
{
int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
if (apicid != BAD_APICID)
return __apicid_to_node[apicid];
return NUMA_NO_NODE;
}
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
/*
* Map cpu index to node index
*/
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
void numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
/* early setting, no percpu area yet */
if (cpu_to_node_map) {
cpu_to_node_map[cpu] = node;
return;
}
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
dump_stack();
return;
}
#endif
per_cpu(x86_cpu_to_node_map, cpu) = node;
set_cpu_numa_node(cpu, node);
}
void numa_clear_node(int cpu)
{
numa_set_node(cpu, NUMA_NO_NODE);
}
/*
* Allocate node_to_cpumask_map based on number of available nodes
* Requires node_possible_map to be valid.
*
* Note: cpumask_of_node() is not valid until after this is done.
* (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
*/
void __init setup_node_to_cpumask_map(void)
{
unsigned int node;
/* setup nr_node_ids if not done yet */
if (nr_node_ids == MAX_NUMNODES)
setup_nr_node_ids();
/* allocate the map */
for (node = 0; node < nr_node_ids; node++)
alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
/* cpumask_of_node() will now work */
pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
}
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
struct numa_meminfo *mi)
{
/* ignore zero length blks */
if (start == end)
return 0;
/* whine about and ignore invalid blks */
if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
pr_warning("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
nid, start, end - 1);
return 0;
}
if (mi->nr_blks >= NR_NODE_MEMBLKS) {
pr_err("NUMA: too many memblk ranges\n");
return -EINVAL;
}
mi->blk[mi->nr_blks].start = start;
mi->blk[mi->nr_blks].end = end;
mi->blk[mi->nr_blks].nid = nid;
mi->nr_blks++;
return 0;
}
/**
* numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
* @idx: Index of memblk to remove
* @mi: numa_meminfo to remove memblk from
*
* Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
* decrementing @mi->nr_blks.
*/
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
mi->nr_blks--;
memmove(&mi->blk[idx], &mi->blk[idx + 1],
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
/**
* numa_add_memblk - Add one numa_memblk to numa_meminfo
* @nid: NUMA node ID of the new memblk
* @start: Start address of the new memblk
* @end: End address of the new memblk
*
* Add a new memblk to the default numa_meminfo.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
/* Initialize NODE_DATA for a node on the local memory */
static void __init setup_node_data(int nid, u64 start, u64 end)
{
const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
u64 nd_pa;
void *nd;
int tnid;
/*
* Don't confuse VM with a node that doesn't have the
* minimum amount of memory:
*/
if (end && (end - start) < NODE_MIN_SIZE)
return;
start = roundup(start, ZONE_ALIGN);
printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
nid, start, end - 1);
/*
* Allocate node data. Try node-local memory and then any node.
* Never allocate in DMA zone.
*/
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
if (!nd_pa) {
pr_err("Cannot find %zu bytes in node %d\n",
nd_size, nid);
return;
}
nd = __va(nd_pa);
/* report and initialize */
printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n",
nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
node_set_online(nid);
}
/**
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
* Sanitize @mi by merging and removing unncessary memblks. Also check for
* conflicts and clear unused memblks.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
const u64 low = 0;
const u64 high = PFN_PHYS(max_pfn);
int i, j, k;
/* first, trim all entries */
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
/* make sure all blocks are inside the limits */
bi->start = max(bi->start, low);
bi->end = min(bi->end, high);
/* and there's no empty block */
if (bi->start >= bi->end)
numa_remove_memblk_from(i--, mi);
}
/* merge neighboring / overlapping entries */
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *bi = &mi->blk[i];
for (j = i + 1; j < mi->nr_blks; j++) {
struct numa_memblk *bj = &mi->blk[j];
u64 start, end;
/*
* See whether there are overlapping blocks. Whine
* about but allow overlaps of the same nid. They
* will be merged below.
*/
if (bi->end > bj->start && bi->start < bj->end) {
if (bi->nid != bj->nid) {
pr_err("NUMA: node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
bi->nid, bi->start, bi->end - 1,
bj->nid, bj->start, bj->end - 1);
return -EINVAL;
}
pr_warning("NUMA: Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
bi->nid, bi->start, bi->end - 1,
bj->start, bj->end - 1);
}
/*
* Join together blocks on the same node, holes
* between which don't overlap with memory on other
* nodes.
*/
if (bi->nid != bj->nid)
continue;
start = min(bi->start, bj->start);
end = max(bi->end, bj->end);
for (k = 0; k < mi->nr_blks; k++) {
struct numa_memblk *bk = &mi->blk[k];
if (bi->nid == bk->nid)
continue;
if (start < bk->end && end > bk->start)
break;
}
if (k < mi->nr_blks)
continue;
printk(KERN_INFO "NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
bi->nid, bi->start, bi->end - 1, bj->start,
bj->end - 1, start, end - 1);
bi->start = start;
bi->end = end;
numa_remove_memblk_from(j--, mi);
}
}
/* clear unused ones */
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
mi->blk[i].start = mi->blk[i].end = 0;
mi->blk[i].nid = NUMA_NO_NODE;
}
return 0;
}
/*
* Set nodes, which have memory in @mi, in *@nodemask.
*/
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
const struct numa_meminfo *mi)
{
int i;
for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
if (mi->blk[i].start != mi->blk[i].end &&
mi->blk[i].nid != NUMA_NO_NODE)
node_set(mi->blk[i].nid, *nodemask);
}
/**
* numa_reset_distance - Reset NUMA distance table
*
* The current table is freed. The next numa_set_distance() call will
* create a new one.
*/
void __init numa_reset_distance(void)
{
size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
/* numa_distance could be 1LU marking allocation failure, test cnt */
if (numa_distance_cnt)
memblock_free(__pa(numa_distance), size);
numa_distance_cnt = 0;
numa_distance = NULL; /* enable table creation */
}
static int __init numa_alloc_distance(void)
{
nodemask_t nodes_parsed;
size_t size;
int i, j, cnt = 0;
u64 phys;
/* size the new table and allocate it */
nodes_parsed = numa_nodes_parsed;
numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
for_each_node_mask(i, nodes_parsed)
cnt = i;
cnt++;
size = cnt * cnt * sizeof(numa_distance[0]);
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
size, PAGE_SIZE);
if (!phys) {
pr_warning("NUMA: Warning: can't allocate distance table!\n");
/* don't retry until explicitly reset */
numa_distance = (void *)1LU;
return -ENOMEM;
}
memblock_reserve(phys, size);
numa_distance = __va(phys);
numa_distance_cnt = cnt;
/* fill with the default distances */
for (i = 0; i < cnt; i++)
for (j = 0; j < cnt; j++)
numa_distance[i * cnt + j] = i == j ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
return 0;
}
/**
* numa_set_distance - Set NUMA distance from one NUMA to another
* @from: the 'from' node to set distance
* @to: the 'to' node to set distance
* @distance: NUMA distance
*
* Set the distance from node @from to @to to @distance. If distance table
* doesn't exist, one which is large enough to accommodate all the currently
* known nodes will be created.
*
* If such table cannot be allocated, a warning is printed and further
* calls are ignored until the distance table is reset with
* numa_reset_distance().
*
* If @from or @to is higher than the highest known node or lower than zero
* at the time of table creation or @distance doesn't make sense, the call
* is ignored.
* This is to allow simplification of specific NUMA config implementations.
*/
void __init numa_set_distance(int from, int to, int distance)
{
if (!numa_distance && numa_alloc_distance() < 0)
return;
if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
from < 0 || to < 0) {
pr_warn_once("NUMA: Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
from, to, distance);
return;
}
if ((u8)distance != distance ||
(from == to && distance != LOCAL_DISTANCE)) {
pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
from, to, distance);
return;
}
numa_distance[from * numa_distance_cnt + to] = distance;
}
int __node_distance(int from, int to)
{
if (from >= numa_distance_cnt || to >= numa_distance_cnt)
return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);
/*
* Sanity check to catch more bad NUMA configurations (they are amazingly
* common). Make sure the nodes cover all memory.
*/
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
u64 numaram, e820ram;
int i;
numaram = 0;
for (i = 0; i < mi->nr_blks; i++) {
u64 s = mi->blk[i].start >> PAGE_SHIFT;
u64 e = mi->blk[i].end >> PAGE_SHIFT;
numaram += e - s;
numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
if ((s64)numaram < 0)
numaram = 0;
}
e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
(numaram << PAGE_SHIFT) >> 20,
(e820ram << PAGE_SHIFT) >> 20);
return false;
}
return true;
}
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
int i, nid;
/* Account for nodes with cpus and no memory */
node_possible_map = numa_nodes_parsed;
numa_nodemask_from_meminfo(&node_possible_map, mi);
if (WARN_ON(nodes_empty(node_possible_map)))
return -EINVAL;
for (i = 0; i < mi->nr_blks; i++) {
struct numa_memblk *mb = &mi->blk[i];
memblock_set_node(mb->start, mb->end - mb->start, mb->nid);
}
/*
* If sections array is gonna be used for pfn -> nid mapping, check
* whether its granularity is fine enough.
*/
#ifdef NODE_NOT_IN_PAGE_FLAGS
pfn_align = node_map_pfn_alignment();
if (pfn_align && pfn_align < PAGES_PER_SECTION) {
printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
PFN_PHYS(pfn_align) >> 20,
PFN_PHYS(PAGES_PER_SECTION) >> 20);
return -EINVAL;
}
#endif
if (!numa_meminfo_cover_memory(mi))
return -EINVAL;
/* Finally register nodes. */
for_each_node_mask(nid, node_possible_map) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;
for (i = 0; i < mi->nr_blks; i++) {
if (nid != mi->blk[i].nid)
continue;
start = min(mi->blk[i].start, start);
end = max(mi->blk[i].end, end);
}
if (start < end)
setup_node_data(nid, start, end);
}
/* Dump memblock with node info and return. */
memblock_dump_all();
return 0;
}
/*
* There are unfortunately some poorly designed mainboards around that
* only connect memory to a single CPU. This breaks the 1:1 cpu->node
* mapping. To avoid this fill in the mapping for all possible CPUs,
* as the number of CPUs is not known yet. We round robin the existing
* nodes.
*/
static void __init numa_init_array(void)
{
int rr, i;
rr = first_node(node_online_map);
for (i = 0; i < nr_cpu_ids; i++) {
if (early_cpu_to_node(i) != NUMA_NO_NODE)
continue;
numa_set_node(i, rr);
rr = next_node(rr, node_online_map);
if (rr == MAX_NUMNODES)
rr = first_node(node_online_map);
}
}
static int __init numa_init(int (*init_func)(void))
{
int i;
int ret;
for (i = 0; i < MAX_LOCAL_APIC; i++)
set_apicid_to_node(i, NUMA_NO_NODE);
nodes_clear(numa_nodes_parsed);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
memset(&numa_meminfo, 0, sizeof(numa_meminfo));
WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES));
numa_reset_distance();
ret = init_func();
if (ret < 0)
return ret;
ret = numa_cleanup_meminfo(&numa_meminfo);
if (ret < 0)
return ret;
numa_emulation(&numa_meminfo, numa_distance_cnt);
ret = numa_register_memblks(&numa_meminfo);
if (ret < 0)
return ret;
for (i = 0; i < nr_cpu_ids; i++) {
int nid = early_cpu_to_node(i);
if (nid == NUMA_NO_NODE)
continue;
if (!node_online(nid))
numa_clear_node(i);
}
numa_init_array();
return 0;
}
/**
* dummy_numa_init - Fallback dummy NUMA init
*
* Used if there's no underlying NUMA architecture, NUMA initialization
* fails, or NUMA is disabled on the command line.
*
* Must online at least one node and add memory blocks that cover all
* allowed memory. This function must not fail.
*/
static int __init dummy_numa_init(void)
{
printk(KERN_INFO "%s\n",
numa_off ? "NUMA turned off" : "No NUMA configuration found");
printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
0LLU, PFN_PHYS(max_pfn) - 1);
node_set(0, numa_nodes_parsed);
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
return 0;
}
/**
* x86_numa_init - Initialize NUMA
*
* Try each configured NUMA initialization method until one succeeds. The
* last fallback is dummy single node config encomapssing whole memory and
* never fails.
*/
void __init x86_numa_init(void)
{
if (!numa_off) {
#ifdef CONFIG_X86_NUMAQ
if (!numa_init(numaq_numa_init))
return;
#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_init(x86_acpi_numa_init))
return;
#endif
#ifdef CONFIG_AMD_NUMA
if (!numa_init(amd_numa_init))
return;
#endif
}
numa_init(dummy_numa_init);
}
static __init int find_near_online_node(int node)
{
int n, val;
int min_val = INT_MAX;
int best_node = -1;
for_each_online_node(n) {
val = node_distance(node, n);
if (val < min_val) {
min_val = val;
best_node = n;
}
}
return best_node;
}
/*
* Setup early cpu_to_node.
*
* Populate cpu_to_node[] only if x86_cpu_to_apicid[],
* and apicid_to_node[] tables have valid entries for a CPU.
* This means we skip cpu_to_node[] initialisation for NUMA
* emulation and faking node case (when running a kernel compiled
* for NUMA on a non NUMA box), which is OK as cpu_to_node[]
* is already initialized in a round robin manner at numa_init_array,
* prior to this call, and this initialization is good enough
* for the fake NUMA cases.
*
* Called before the per_cpu areas are setup.
*/
void __init init_cpu_to_node(void)
{
int cpu;
u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
BUG_ON(cpu_to_apicid == NULL);
for_each_possible_cpu(cpu) {
int node = numa_cpu_node(cpu);
if (node == NUMA_NO_NODE)
continue;
if (!node_online(node))
node = find_near_online_node(node);
numa_set_node(cpu, node);
}
}
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
# ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
int __cpu_to_node(int cpu)
{
if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
printk(KERN_WARNING
"cpu_to_node(%d): usage too early!\n", cpu);
dump_stack();
return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
}
return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);
/*
* Same function as cpu_to_node() but used if called before the
* per_cpu areas are setup.
*/
int early_cpu_to_node(int cpu)
{
if (early_per_cpu_ptr(x86_cpu_to_node_map))
return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
if (!cpu_possible(cpu)) {
printk(KERN_WARNING
"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
dump_stack();
return NUMA_NO_NODE;
}
return per_cpu(x86_cpu_to_node_map, cpu);
}
void debug_cpumask_set_cpu(int cpu, int node, bool enable)
{
struct cpumask *mask;
char buf[64];
if (node == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
return;
}
mask = node_to_cpumask_map[node];
if (!mask) {
pr_err("node_to_cpumask_map[%i] NULL\n", node);
dump_stack();
return;
}
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable ? "numa_add_cpu" : "numa_remove_cpu",
cpu, node, buf);
return;
}
# ifndef CONFIG_NUMA_EMU
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}
void __cpuinit numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */
/*
* Returns a pointer to the bitmask of CPUs on Node 'node'.
*/
const struct cpumask *cpumask_of_node(int node)
{
if (node >= nr_node_ids) {
printk(KERN_WARNING
"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;
}
if (node_to_cpumask_map[node] == NULL) {
printk(KERN_WARNING
"cpumask_of_node(%d): no node_to_cpumask_map!\n",
node);
dump_stack();
return cpu_online_mask;
}
return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
#ifdef CONFIG_MEMORY_HOTPLUG
int memory_add_physaddr_to_nid(u64 start)
{
struct numa_meminfo *mi = &numa_meminfo;
int nid = mi->blk[0].nid;
int i;
for (i = 0; i < mi->nr_blks; i++)
if (mi->blk[i].start <= start && mi->blk[i].end > start)
nid = mi->blk[i].nid;
return nid;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

104
arch/x86/mm/numa_32.c Normal file
View File

@@ -0,0 +1,104 @@
/*
* Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
* August 2002: added remote node KVA remap - Martin J. Bligh
*
* Copyright (C) 2002, IBM Corp.
*
* All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/module.h>
#include "numa_internal.h"
#ifdef CONFIG_DISCONTIGMEM
/*
* 4) physnode_map - the mapping between a pfn and owning node
* physnode_map keeps track of the physical memory layout of a generic
* numa node on a 64Mb break (each element of the array will
* represent 64Mb of memory and will be marked by the node id. so,
* if the first gig is on node 0, and the second gig is on node 1
* physnode_map will contain:
*
* physnode_map[0-15] = 0;
* physnode_map[16-31] = 1;
* physnode_map[32- ] = -1;
*/
s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
EXPORT_SYMBOL(physnode_map);
void memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n",
nid, start, end);
printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
printk(KERN_DEBUG " ");
for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
physnode_map[pfn / PAGES_PER_SECTION] = nid;
printk(KERN_CONT "%lx ", pfn);
}
printk(KERN_CONT "\n");
}
unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
unsigned long end_pfn)
{
unsigned long nr_pages = end_pfn - start_pfn;
if (!nr_pages)
return 0;
return (nr_pages + 1) * sizeof(struct page);
}
#endif
extern unsigned long highend_pfn, highstart_pfn;
void __init initmem_init(void)
{
x86_numa_init();
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
if (max_pfn > max_low_pfn)
highstart_pfn = max_low_pfn;
printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
pages_to_mb(highend_pfn - highstart_pfn));
num_physpages = highend_pfn;
high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
#else
num_physpages = max_low_pfn;
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
pages_to_mb(max_low_pfn));
printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n",
max_low_pfn, highstart_pfn);
printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
(ulong) pfn_to_kaddr(max_low_pfn));
printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
setup_bootmem_allocator();
}

12
arch/x86/mm/numa_64.c Normal file
View File

@@ -0,0 +1,12 @@
/*
* Generic VM initialization for x86-64 NUMA setups.
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
#include <linux/bootmem.h>
#include "numa_internal.h"
void __init initmem_init(void)
{
x86_numa_init();
}

View File

@@ -0,0 +1,502 @@
/*
* NUMA emulation
*/
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>
#include "numa_internal.h"
static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;
void __init numa_emu_cmdline(char *str)
{
emu_cmdline = str;
}
static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
int i;
for (i = 0; i < mi->nr_blks; i++)
if (mi->blk[i].nid == nid)
return i;
return -ENOENT;
}
static u64 __init mem_hole_size(u64 start, u64 end)
{
unsigned long start_pfn = PFN_UP(start);
unsigned long end_pfn = PFN_DOWN(end);
if (start_pfn < end_pfn)
return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
return 0;
}
/*
* Sets up nid to range from @start to @end. The return value is -errno if
* something went wrong, 0 otherwise.
*/
static int __init emu_setup_memblk(struct numa_meminfo *ei,
struct numa_meminfo *pi,
int nid, int phys_blk, u64 size)
{
struct numa_memblk *eb = &ei->blk[ei->nr_blks];
struct numa_memblk *pb = &pi->blk[phys_blk];
if (ei->nr_blks >= NR_NODE_MEMBLKS) {
pr_err("NUMA: Too many emulated memblks, failing emulation\n");
return -EINVAL;
}
ei->nr_blks++;
eb->start = pb->start;
eb->end = pb->start + size;
eb->nid = nid;
if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
emu_nid_to_phys[nid] = nid;
pb->start += size;
if (pb->start >= pb->end) {
WARN_ON_ONCE(pb->start > pb->end);
numa_remove_memblk_from(phys_blk, pi);
}
printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
return 0;
}
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
static int __init split_nodes_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
int big;
int nid = 0;
int i, ret;
if (nr_nodes <= 0)
return -1;
if (nr_nodes > MAX_NUMNODES) {
pr_info("numa=fake=%d too large, reducing to %d\n",
nr_nodes, MAX_NUMNODES);
nr_nodes = MAX_NUMNODES;
}
/*
* Calculate target node size. x86_32 freaks on __udivdi3() so do
* the division in ulong number of pages and convert back.
*/
size = max_addr - addr - mem_hole_size(addr, max_addr);
size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
*/
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
FAKE_NODE_MIN_SIZE;
size &= FAKE_NODE_MIN_HASH_MASK;
if (!size) {
pr_err("Not enough memory for each node. "
"NUMA emulation disabled.\n");
return -1;
}
for (i = 0; i < pi->nr_blks; i++)
node_set(pi->blk[i].nid, physnode_mask);
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
int phys_blk;
phys_blk = emu_find_memblk_by_nid(i, pi);
if (phys_blk < 0) {
node_clear(i, physnode_mask);
continue;
}
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
end = start + size;
if (nid < big)
end += FAKE_NODE_MIN_SIZE;
/*
* Continue to add memory to this fake node if its
* non-reserved memory is less than the per-node size.
*/
while (end - start - mem_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > limit) {
end = limit;
break;
}
}
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
* If there won't be enough non-reserved memory for the
* next node, this one must extend to the end of the
* physical node.
*/
if (limit - end - mem_hole_size(end, limit) < size)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
phys_blk,
min(end, limit) - start);
if (ret < 0)
return ret;
}
}
return 0;
}
/*
* Returns the end address of a node so that there is at least `size' amount of
* non-reserved memory or `max_addr' is reached.
*/
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
u64 end = start + size;
while (end - start - mem_hole_size(start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
end = max_addr;
break;
}
}
return end;
}
/*
* Sets up fake nodes of `size' interleaved over physical nodes ranging from
* `addr' to `max_addr'. The return value is the number of nodes allocated.
*/
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
struct numa_meminfo *pi,
u64 addr, u64 max_addr, u64 size)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 min_size;
int nid = 0;
int i, ret;
if (!size)
return -1;
/*
* The limit on emulated nodes is MAX_NUMNODES, so the size per node is
* increased accordingly if the requested size is too small. This
* creates a uniform distribution of node sizes across the entire
* machine (but not necessarily over physical nodes).
*/
min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES;
min_size = max(min_size, FAKE_NODE_MIN_SIZE);
if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
min_size = (min_size + FAKE_NODE_MIN_SIZE) &
FAKE_NODE_MIN_HASH_MASK;
if (size < min_size) {
pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
size >> 20, min_size >> 20);
size = min_size;
}
size &= FAKE_NODE_MIN_HASH_MASK;
for (i = 0; i < pi->nr_blks; i++)
node_set(pi->blk[i].nid, physnode_mask);
/*
* Fill physical nodes with fake nodes of size until there is no memory
* left on any of them.
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
u64 start, limit, end;
int phys_blk;
phys_blk = emu_find_memblk_by_nid(i, pi);
if (phys_blk < 0) {
node_clear(i, physnode_mask);
continue;
}
start = pi->blk[phys_blk].start;
limit = pi->blk[phys_blk].end;
end = find_end_of_node(start, limit, size);
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
* If there won't be enough non-reserved memory for the
* next node, this one must extend to the end of the
* physical node.
*/
if (limit - end - mem_hole_size(end, limit) < size)
end = limit;
ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
phys_blk,
min(end, limit) - start);
if (ret < 0)
return ret;
}
}
return 0;
}
/**
* numa_emulation - Emulate NUMA nodes
* @numa_meminfo: NUMA configuration to massage
* @numa_dist_cnt: The size of the physical NUMA distance table
*
* Emulate NUMA nodes according to the numa=fake kernel parameter.
* @numa_meminfo contains the physical memory configuration and is modified
* to reflect the emulated configuration on success. @numa_dist_cnt is
* used to determine the size of the physical distance table.
*
* On success, the following modifications are made.
*
* - @numa_meminfo is updated to reflect the emulated nodes.
*
* - __apicid_to_node[] is updated such that APIC IDs are mapped to the
* emulated nodes.
*
* - NUMA distance table is rebuilt to represent distances between emulated
* nodes. The distances are determined considering how emulated nodes
* are mapped to physical nodes and match the actual distances.
*
* - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
* nodes. This is used by numa_add_cpu() and numa_remove_cpu().
*
* If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
* identity mapping and no other modification is made.
*/
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
static struct numa_meminfo ei __initdata;
static struct numa_meminfo pi __initdata;
const u64 max_addr = PFN_PHYS(max_pfn);
u8 *phys_dist = NULL;
size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
int max_emu_nid, dfl_phys_nid;
int i, j, ret;
if (!emu_cmdline)
goto no_emu;
memset(&ei, 0, sizeof(ei));
pi = *numa_meminfo;
for (i = 0; i < MAX_NUMNODES; i++)
emu_nid_to_phys[i] = NUMA_NO_NODE;
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
* split the system RAM into N fake nodes.
*/
if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
u64 size;
size = memparse(emu_cmdline, &emu_cmdline);
ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
} else {
unsigned long n;
n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
}
if (*emu_cmdline == ':')
emu_cmdline++;
if (ret < 0)
goto no_emu;
if (numa_cleanup_meminfo(&ei) < 0) {
pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
goto no_emu;
}
/* copy the physical distance table */
if (numa_dist_cnt) {
u64 phys;
phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
phys_size, PAGE_SIZE);
if (!phys) {
pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
goto no_emu;
}
memblock_reserve(phys, phys_size);
phys_dist = __va(phys);
for (i = 0; i < numa_dist_cnt; i++)
for (j = 0; j < numa_dist_cnt; j++)
phys_dist[i * numa_dist_cnt + j] =
node_distance(i, j);
}
/*
* Determine the max emulated nid and the default phys nid to use
* for unmapped nodes.
*/
max_emu_nid = 0;
dfl_phys_nid = NUMA_NO_NODE;
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
max_emu_nid = i;
if (dfl_phys_nid == NUMA_NO_NODE)
dfl_phys_nid = emu_nid_to_phys[i];
}
}
if (dfl_phys_nid == NUMA_NO_NODE) {
pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
goto no_emu;
}
/* commit */
*numa_meminfo = ei;
/*
* Transform __apicid_to_node table to use emulated nids by
* reverse-mapping phys_nid. The maps should always exist but fall
* back to zero just in case.
*/
for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
if (__apicid_to_node[i] == NUMA_NO_NODE)
continue;
for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
if (__apicid_to_node[i] == emu_nid_to_phys[j])
break;
__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
}
/* make sure all emulated nodes are mapped to a physical node */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
if (emu_nid_to_phys[i] == NUMA_NO_NODE)
emu_nid_to_phys[i] = dfl_phys_nid;
/* transform distance table */
numa_reset_distance();
for (i = 0; i < max_emu_nid + 1; i++) {
for (j = 0; j < max_emu_nid + 1; j++) {
int physi = emu_nid_to_phys[i];
int physj = emu_nid_to_phys[j];
int dist;
if (get_option(&emu_cmdline, &dist) == 2)
;
else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
dist = physi == physj ?
LOCAL_DISTANCE : REMOTE_DISTANCE;
else
dist = phys_dist[physi * numa_dist_cnt + physj];
numa_set_distance(i, j, dist);
}
}
/* free the copied physical distance table */
if (phys_dist)
memblock_free(__pa(phys_dist), phys_size);
return;
no_emu:
/* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
emu_nid_to_phys[i] = i;
}
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
int physnid, nid;
nid = early_cpu_to_node(cpu);
BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
physnid = emu_nid_to_phys[nid];
/*
* Map the cpu to each emulated node that is allocated on the physical
* node of the cpu's apic id.
*/
for_each_online_node(nid)
if (emu_nid_to_phys[nid] == physnid)
cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
void __cpuinit numa_remove_cpu(int cpu)
{
int i;
for_each_online_node(i)
cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
int nid, physnid;
nid = early_cpu_to_node(cpu);
if (nid == NUMA_NO_NODE) {
/* early_cpu_to_node() already emits a warning and trace */
return;
}
physnid = emu_nid_to_phys[nid];
for_each_online_node(nid) {
if (emu_nid_to_phys[nid] != physnid)
continue;
debug_cpumask_set_cpu(cpu, nid, enable);
}
}
void __cpuinit numa_add_cpu(int cpu)
{
numa_set_cpumask(cpu, true);
}
void __cpuinit numa_remove_cpu(int cpu)
{
numa_set_cpumask(cpu, false);
}
#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

View File

@@ -0,0 +1,33 @@
#ifndef __X86_MM_NUMA_INTERNAL_H
#define __X86_MM_NUMA_INTERNAL_H
#include <linux/types.h>
#include <asm/numa.h>
struct numa_memblk {
u64 start;
u64 end;
int nid;
};
struct numa_meminfo {
int nr_blks;
struct numa_memblk blk[NR_NODE_MEMBLKS];
};
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
void __init numa_reset_distance(void);
void __init x86_numa_init(void);
#ifdef CONFIG_NUMA_EMU
void __init numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt);
#else
static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
int numa_dist_cnt)
{ }
#endif
#endif /* __X86_MM_NUMA_INTERNAL_H */

260
arch/x86/mm/pageattr-test.c Normal file
View File

@@ -0,0 +1,260 @@
/*
* self test for change_page_attr.
*
* Clears the a test pte bit on random pages in the direct mapping,
* then reverts and compares page tables forwards and afterwards.
*/
#include <linux/bootmem.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
#include <asm/kdebug.h>
/*
* Only print the results of the first pass:
*/
static __read_mostly int print = 1;
enum {
NTEST = 400,
#ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
#elif defined(CONFIG_X86_PAE)
LPS = (1 << PMD_SHIFT),
#else
LPS = (1 << 22),
#endif
GPS = (1<<30)
};
#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
static int pte_testbit(pte_t pte)
{
return pte_flags(pte) & _PAGE_UNUSED1;
}
struct split_state {
long lpg, gpg, spg, exec;
long min_exec, max_exec;
};
static int print_split(struct split_state *s)
{
long i, expected, missed = 0;
int err = 0;
s->lpg = s->gpg = s->spg = s->exec = 0;
s->min_exec = ~0UL;
s->max_exec = 0;
for (i = 0; i < max_pfn_mapped; ) {
unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT);
unsigned int level;
pte_t *pte;
pte = lookup_address(addr, &level);
if (!pte) {
missed++;
i++;
continue;
}
if (level == PG_LEVEL_1G && sizeof(long) == 8) {
s->gpg++;
i += GPS/PAGE_SIZE;
} else if (level == PG_LEVEL_2M) {
if ((pte_val(*pte) & _PAGE_PRESENT) && !(pte_val(*pte) & _PAGE_PSE)) {
printk(KERN_ERR
"%lx level %d but not PSE %Lx\n",
addr, level, (u64)pte_val(*pte));
err = 1;
}
s->lpg++;
i += LPS/PAGE_SIZE;
} else {
s->spg++;
i++;
}
if (!(pte_val(*pte) & _PAGE_NX)) {
s->exec++;
if (addr < s->min_exec)
s->min_exec = addr;
if (addr > s->max_exec)
s->max_exec = addr;
}
}
if (print) {
printk(KERN_INFO
" 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n",
s->spg, s->lpg, s->gpg, s->exec,
s->min_exec != ~0UL ? s->min_exec : 0,
s->max_exec, missed);
}
expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed;
if (expected != i) {
printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n",
max_pfn_mapped, expected);
return 1;
}
return err;
}
static unsigned long addr[NTEST];
static unsigned int len[NTEST];
/* Change the global bit on random pages in the direct mapping */
static int pageattr_test(void)
{
struct split_state sa, sb, sc;
unsigned long *bm;
pte_t *pte, pte0;
int failed = 0;
unsigned int level;
int i, k;
int err;
unsigned long test_addr;
if (print)
printk(KERN_INFO "CPA self-test:\n");
bm = vzalloc((max_pfn_mapped + 7) / 8);
if (!bm) {
printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
return -ENOMEM;
}
failed += print_split(&sa);
for (i = 0; i < NTEST; i++) {
unsigned long pfn = prandom_u32() % max_pfn_mapped;
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
len[i] = prandom_u32() % 100;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
if (len[i] == 0)
len[i] = 1;
pte = NULL;
pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */
for (k = 0; k < len[i]; k++) {
pte = lookup_address(addr[i] + k*PAGE_SIZE, &level);
if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 ||
!(pte_val(*pte) & _PAGE_PRESENT)) {
addr[i] = 0;
break;
}
if (k == 0) {
pte0 = *pte;
} else {
if (pgprot_val(pte_pgprot(*pte)) !=
pgprot_val(pte_pgprot(pte0))) {
len[i] = k;
break;
}
}
if (test_bit(pfn + k, bm)) {
len[i] = k;
break;
}
__set_bit(pfn + k, bm);
}
if (!addr[i] || !pte || !k) {
addr[i] = 0;
continue;
}
test_addr = addr[i];
err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;
}
pte = lookup_address(addr[i], &level);
if (!pte || !pte_testbit(*pte) || pte_huge(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i],
pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
}
if (level != PG_LEVEL_4K) {
printk(KERN_ERR "CPA %lx: unexpected level %d\n",
addr[i], level);
failed++;
}
}
vfree(bm);
failed += print_split(&sb);
for (i = 0; i < NTEST; i++) {
if (!addr[i])
continue;
pte = lookup_address(addr[i], &level);
if (!pte) {
printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]);
failed++;
continue;
}
test_addr = addr[i];
err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
}
pte = lookup_address(addr[i], &level);
if (!pte || pte_testbit(*pte)) {
printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n",
addr[i], pte ? (u64)pte_val(*pte) : 0ULL);
failed++;
}
}
failed += print_split(&sc);
if (failed) {
WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
return -EINVAL;
} else {
if (print)
printk(KERN_INFO "ok.\n");
}
return 0;
}
static int do_pageattr_test(void *__unused)
{
while (!kthread_should_stop()) {
schedule_timeout_interruptible(HZ*30);
if (pageattr_test() < 0)
break;
if (print)
print--;
}
return 0;
}
static int start_pageattr_test(void)
{
struct task_struct *p;
p = kthread_create(do_pageattr_test, NULL, "pageattr-test");
if (!IS_ERR(p))
wake_up_process(p);
else
WARN_ON(1);
return 0;
}
module_init(start_pageattr_test);

1443
arch/x86/mm/pageattr.c Normal file

File diff suppressed because it is too large Load Diff

884
arch/x86/mm/pat.c Normal file
View File

@@ -0,0 +1,884 @@
/*
* Handle caching attributes in page tables (PAT)
*
* Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* Suresh B Siddha <suresh.b.siddha@intel.com>
*
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
*/
#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rbtree.h>
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/pat.h>
#include <asm/io.h>
#include "pat_internal.h"
#ifdef CONFIG_X86_PAT
int __read_mostly pat_enabled = 1;
static inline void pat_disable(const char *reason)
{
pat_enabled = 0;
printk(KERN_INFO "%s\n", reason);
}
static int __init nopat(char *str)
{
pat_disable("PAT support disabled.");
return 0;
}
early_param("nopat", nopat);
#else
static inline void pat_disable(const char *reason)
{
(void)reason;
}
#endif
int pat_debug_enable;
static int __init pat_debug_setup(char *str)
{
pat_debug_enable = 1;
return 0;
}
__setup("debugpat", pat_debug_setup);
static u64 __read_mostly boot_pat_state;
enum {
PAT_UC = 0, /* uncached */
PAT_WC = 1, /* Write combining */
PAT_WT = 4, /* Write Through */
PAT_WP = 5, /* Write Protected */
PAT_WB = 6, /* Write Back (default) */
PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
};
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
void pat_init(void)
{
u64 pat;
bool boot_cpu = !boot_pat_state;
if (!pat_enabled)
return;
if (!cpu_has_pat) {
if (!boot_pat_state) {
pat_disable("PAT not supported by CPU.");
return;
} else {
/*
* If this happens we are on a secondary CPU, but
* switched to PAT on the boot CPU. We have no way to
* undo PAT.
*/
printk(KERN_ERR "PAT enabled, "
"but not supported by secondary CPU\n");
BUG();
}
}
/* Set PWT to Write-Combining. All other bits stay the same */
/*
* PTE encoding used in Linux:
* PAT
* |PCD
* ||PWT
* |||
* 000 WB _PAGE_CACHE_WB
* 001 WC _PAGE_CACHE_WC
* 010 UC- _PAGE_CACHE_UC_MINUS
* 011 UC _PAGE_CACHE_UC
* PAT bit unused
*/
pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
/* Boot CPU check */
if (!boot_pat_state)
rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
wrmsrl(MSR_IA32_CR_PAT, pat);
if (boot_cpu)
printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
smp_processor_id(), boot_pat_state, pat);
}
#undef PAT
static DEFINE_SPINLOCK(memtype_lock); /* protects memtype accesses */
/*
* Does intersection of PAT memory type and MTRR memory type and returns
* the resulting memory type as PAT understands it.
* (Type in pat and mtrr will not have same value)
* The intersection is based on "Effective Memory Type" tables in IA-32
* SDM vol 3a
*/
static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
{
/*
* Look for MTRR hint to get the effective type in case where PAT
* request is for WB.
*/
if (req_type == _PAGE_CACHE_WB) {
u8 mtrr_type;
mtrr_type = mtrr_type_lookup(start, end);
if (mtrr_type != MTRR_TYPE_WRBACK)
return _PAGE_CACHE_UC_MINUS;
return _PAGE_CACHE_WB;
}
return req_type;
}
struct pagerange_state {
unsigned long cur_pfn;
int ram;
int not_ram;
};
static int
pagerange_is_ram_callback(unsigned long initial_pfn, unsigned long total_nr_pages, void *arg)
{
struct pagerange_state *state = arg;
state->not_ram |= initial_pfn > state->cur_pfn;
state->ram |= total_nr_pages > 0;
state->cur_pfn = initial_pfn + total_nr_pages;
return state->ram && state->not_ram;
}
static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
{
int ret = 0;
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long end_pfn = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
struct pagerange_state state = {start_pfn, 0, 0};
/*
* For legacy reasons, physical address range in the legacy ISA
* region is tracked as non-RAM. This will allow users of
* /dev/mem to map portions of legacy ISA region, even when
* some of those portions are listed(or not even listed) with
* different e820 types(RAM/reserved/..)
*/
if (start_pfn < ISA_END_ADDRESS >> PAGE_SHIFT)
start_pfn = ISA_END_ADDRESS >> PAGE_SHIFT;
if (start_pfn < end_pfn) {
ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
&state, pagerange_is_ram_callback);
}
return (ret > 0) ? -1 : (state.ram ? 1 : 0);
}
/*
* For RAM pages, we use page flags to mark the pages with appropriate type.
* Here we do two pass:
* - Find the memtype of all the pages in the range, look for any conflicts
* - In case of no conflicts, set the new memtype for pages in the range
*/
static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct page *page;
u64 pfn;
if (req_type == _PAGE_CACHE_UC) {
/* We do not support strong UC */
WARN_ON_ONCE(1);
req_type = _PAGE_CACHE_UC_MINUS;
}
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
unsigned long type;
page = pfn_to_page(pfn);
type = get_page_memtype(page);
if (type != -1) {
printk(KERN_INFO "reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%lx, req 0x%lx\n",
start, end - 1, type, req_type);
if (new_type)
*new_type = type;
return -EBUSY;
}
}
if (new_type)
*new_type = req_type;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
set_page_memtype(page, req_type);
}
return 0;
}
static int free_ram_pages_type(u64 start, u64 end)
{
struct page *page;
u64 pfn;
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
set_page_memtype(page, -1);
}
return 0;
}
/*
* req_type typically has one of the:
* - _PAGE_CACHE_WB
* - _PAGE_CACHE_WC
* - _PAGE_CACHE_UC_MINUS
* - _PAGE_CACHE_UC
*
* If new_type is NULL, function will return an error if it cannot reserve the
* region with req_type. If new_type is non-NULL, function will return
* available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
int reserve_memtype(u64 start, u64 end, unsigned long req_type,
unsigned long *new_type)
{
struct memtype *new;
unsigned long actual_type;
int is_range_ram;
int err = 0;
BUG_ON(start >= end); /* end is exclusive */
if (!pat_enabled) {
/* This is identical to page table setting without PAT */
if (new_type) {
if (req_type == _PAGE_CACHE_WC)
*new_type = _PAGE_CACHE_UC_MINUS;
else
*new_type = req_type & _PAGE_CACHE_MASK;
}
return 0;
}
/* Low ISA region is always mapped WB in page table. No need to track */
if (x86_platform.is_untracked_pat_range(start, end)) {
if (new_type)
*new_type = _PAGE_CACHE_WB;
return 0;
}
/*
* Call mtrr_lookup to get the type hint. This is an
* optimization for /dev/mem mmap'ers into WB memory (BIOS
* tools and ACPI tools). Use WB request for WB memory and use
* UC_MINUS otherwise.
*/
actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK);
if (new_type)
*new_type = actual_type;
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1) {
err = reserve_ram_pages_type(start, end, req_type, new_type);
return err;
} else if (is_range_ram < 0) {
return -EINVAL;
}
new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
if (!new)
return -ENOMEM;
new->start = start;
new->end = end;
new->type = actual_type;
spin_lock(&memtype_lock);
err = rbt_memtype_check_insert(new, new_type);
if (err) {
printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
start, end - 1,
cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
return err;
}
spin_unlock(&memtype_lock);
dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
start, end - 1, cattr_name(new->type), cattr_name(req_type),
new_type ? cattr_name(*new_type) : "-");
return err;
}
int free_memtype(u64 start, u64 end)
{
int err = -EINVAL;
int is_range_ram;
struct memtype *entry;
if (!pat_enabled)
return 0;
/* Low ISA region is always mapped WB. No need to track */
if (x86_platform.is_untracked_pat_range(start, end))
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
if (is_range_ram == 1) {
err = free_ram_pages_type(start, end);
return err;
} else if (is_range_ram < 0) {
return -EINVAL;
}
spin_lock(&memtype_lock);
entry = rbt_memtype_erase(start, end);
spin_unlock(&memtype_lock);
if (!entry) {
printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid, start, end - 1);
return -EINVAL;
}
kfree(entry);
dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
return 0;
}
/**
* lookup_memtype - Looksup the memory type for a physical address
* @paddr: physical address of which memory type needs to be looked up
*
* Only to be called when PAT is enabled
*
* Returns _PAGE_CACHE_WB, _PAGE_CACHE_WC, _PAGE_CACHE_UC_MINUS or
* _PAGE_CACHE_UC
*/
static unsigned long lookup_memtype(u64 paddr)
{
int rettype = _PAGE_CACHE_WB;
struct memtype *entry;
if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
return rettype;
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
struct page *page;
page = pfn_to_page(paddr >> PAGE_SHIFT);
rettype = get_page_memtype(page);
/*
* -1 from get_page_memtype() implies RAM page is in its
* default state and not reserved, and hence of type WB
*/
if (rettype == -1)
rettype = _PAGE_CACHE_WB;
return rettype;
}
spin_lock(&memtype_lock);
entry = rbt_memtype_lookup(paddr);
if (entry != NULL)
rettype = entry->type;
else
rettype = _PAGE_CACHE_UC_MINUS;
spin_unlock(&memtype_lock);
return rettype;
}
/**
* io_reserve_memtype - Request a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
* @type: A pointer to memtype, with requested type. On success, requested
* or any other compatible type that was available for the region is returned
*
* On success, returns 0
* On failure, returns non-zero
*/
int io_reserve_memtype(resource_size_t start, resource_size_t end,
unsigned long *type)
{
resource_size_t size = end - start;
unsigned long req_type = *type;
unsigned long new_type;
int ret;
WARN_ON_ONCE(iomem_map_sanity_check(start, size));
ret = reserve_memtype(start, end, req_type, &new_type);
if (ret)
goto out_err;
if (!is_new_memtype_allowed(start, size, req_type, new_type))
goto out_free;
if (kernel_map_sync_memtype(start, size, new_type) < 0)
goto out_free;
*type = new_type;
return 0;
out_free:
free_memtype(start, end);
ret = -EBUSY;
out_err:
return ret;
}
/**
* io_free_memtype - Release a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
*/
void io_free_memtype(resource_size_t start, resource_size_t end)
{
free_memtype(start, end);
}
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
{
return vma_prot;
}
#ifdef CONFIG_STRICT_DEVMEM
/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
return 1;
}
#else
/* This check is needed to avoid cache aliasing when PAT is enabled */
static inline int range_is_allowed(unsigned long pfn, unsigned long size)
{
u64 from = ((u64)pfn) << PAGE_SHIFT;
u64 to = from + size;
u64 cursor = from;
if (!pat_enabled)
return 1;
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
pfn++;
}
return 1;
}
#endif /* CONFIG_STRICT_DEVMEM */
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t *vma_prot)
{
unsigned long flags = _PAGE_CACHE_WB;
if (!range_is_allowed(pfn, size))
return 0;
if (file->f_flags & O_DSYNC)
flags = _PAGE_CACHE_UC_MINUS;
#ifdef CONFIG_X86_32
/*
* On the PPro and successors, the MTRRs are used to set
* memory types for physical addresses outside main memory,
* so blindly setting UC or PWT on those pages is wrong.
* For Pentiums and earlier, the surround logic should disable
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
if (!pat_enabled &&
!(boot_cpu_has(X86_FEATURE_MTRR) ||
boot_cpu_has(X86_FEATURE_K6_MTRR) ||
boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
boot_cpu_has(X86_FEATURE_CENTAUR_MCR)) &&
(pfn << PAGE_SHIFT) >= __pa(high_memory)) {
flags = _PAGE_CACHE_UC;
}
#endif
*vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
flags);
return 1;
}
/*
* Change the memory type for the physial address range in kernel identity
* mapping space if that range is a part of identity map.
*/
int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags)
{
unsigned long id_sz;
if (base > __pa(high_memory-1))
return 0;
/*
* some areas in the middle of the kernel identity range
* are not mapped, like the PCI space.
*/
if (!page_is_ram(base >> PAGE_SHIFT))
return 0;
id_sz = (__pa(high_memory-1) <= base + size) ?
__pa(high_memory) - base :
size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, flags) < 0) {
printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
"for [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid,
cattr_name(flags),
base, (unsigned long long)(base + size-1));
return -EINVAL;
}
return 0;
}
/*
* Internal interface to reserve a range of physical memory with prot.
* Reserved non RAM regions only and after successful reserve_memtype,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
int strict_prot)
{
int is_ram = 0;
int ret;
unsigned long want_flags = (pgprot_val(*vma_prot) & _PAGE_CACHE_MASK);
unsigned long flags = want_flags;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
/*
* reserve_pfn_range() for RAM pages. We do not refcount to keep
* track of number of mappings of RAM pages. We can assert that
* the type requested matches the type of first page in the range.
*/
if (is_ram) {
if (!pat_enabled)
return 0;
flags = lookup_memtype(paddr);
if (want_flags != flags) {
printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
cattr_name(flags));
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
flags);
}
return 0;
}
ret = reserve_memtype(paddr, paddr + size, want_flags, &flags);
if (ret)
return ret;
if (flags != want_flags) {
if (strict_prot ||
!is_new_memtype_allowed(paddr, size, want_flags, flags)) {
free_memtype(paddr, paddr + size);
printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
" for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_flags),
(unsigned long long)paddr,
(unsigned long long)(paddr + size - 1),
cattr_name(flags));
return -EINVAL;
}
/*
* We allow returning different type than the one requested in
* non strict case.
*/
*vma_prot = __pgprot((pgprot_val(*vma_prot) &
(~_PAGE_CACHE_MASK)) |
flags);
}
if (kernel_map_sync_memtype(paddr, size, flags) < 0) {
free_memtype(paddr, paddr + size);
return -EINVAL;
}
return 0;
}
/*
* Internal interface to free a range of physical memory.
* Frees non RAM regions only.
*/
static void free_pfn_range(u64 paddr, unsigned long size)
{
int is_ram;
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
if (is_ram == 0)
free_memtype(paddr, paddr + size);
}
/*
* track_pfn_copy is called when vma that is covering the pfnmap gets
* copied through copy_page_range().
*
* If the vma has a linear pfn mapping for the entire range, we get the prot
* from pte and reserve the entire vma range with single reserve_pfn_range call.
*/
int track_pfn_copy(struct vm_area_struct *vma)
{
resource_size_t paddr;
unsigned long prot;
unsigned long vma_size = vma->vm_end - vma->vm_start;
pgprot_t pgprot;
if (vma->vm_flags & VM_PAT) {
/*
* reserve the whole chunk covered by vma. We need the
* starting address and protection from pte.
*/
if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return -EINVAL;
}
pgprot = __pgprot(prot);
return reserve_pfn_range(paddr, vma_size, &pgprot, 1);
}
return 0;
}
/*
* prot is passed in as a parameter for the new mapping. If the vma has a
* linear pfn mapping for the entire range reserve the entire vma range with
* single reserve_pfn_range call.
*/
int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn, unsigned long addr, unsigned long size)
{
resource_size_t paddr = (resource_size_t)pfn << PAGE_SHIFT;
unsigned long flags;
/* reserve the whole chunk starting from paddr */
if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) {
int ret;
ret = reserve_pfn_range(paddr, size, prot, 0);
if (!ret)
vma->vm_flags |= VM_PAT;
return ret;
}
if (!pat_enabled)
return 0;
/*
* For anything smaller than the vma size we set prot based on the
* lookup.
*/
flags = lookup_memtype(paddr);
/* Check memtype for the remaining pages */
while (size > PAGE_SIZE) {
size -= PAGE_SIZE;
paddr += PAGE_SIZE;
if (flags != lookup_memtype(paddr))
return -EINVAL;
}
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
flags);
return 0;
}
int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
unsigned long pfn)
{
unsigned long flags;
if (!pat_enabled)
return 0;
/* Set prot based on lookup */
flags = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT);
*prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) |
flags);
return 0;
}
/*
* untrack_pfn is called while unmapping a pfnmap for a region.
* untrack can be called for a specific region indicated by pfn and size or
* can be for the entire vma (in which case pfn, size are zero).
*/
void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
unsigned long size)
{
resource_size_t paddr;
unsigned long prot;
if (!(vma->vm_flags & VM_PAT))
return;
/* free the chunk starting from pfn or the whole chunk */
paddr = (resource_size_t)pfn << PAGE_SHIFT;
if (!paddr && !size) {
if (follow_phys(vma, vma->vm_start, 0, &prot, &paddr)) {
WARN_ON_ONCE(1);
return;
}
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
vma->vm_flags &= ~VM_PAT;
}
pgprot_t pgprot_writecombine(pgprot_t prot)
{
if (pat_enabled)
return __pgprot(pgprot_val(prot) | _PAGE_CACHE_WC);
else
return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
static struct memtype *memtype_get_idx(loff_t pos)
{
struct memtype *print_entry;
int ret;
print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
if (!print_entry)
return NULL;
spin_lock(&memtype_lock);
ret = rbt_memtype_copy_nth_element(print_entry, pos);
spin_unlock(&memtype_lock);
if (!ret) {
return print_entry;
} else {
kfree(print_entry);
return NULL;
}
}
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
{
if (*pos == 0) {
++*pos;
seq_printf(seq, "PAT memtype list:\n");
}
return memtype_get_idx(*pos);
}
static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return memtype_get_idx(*pos);
}
static void memtype_seq_stop(struct seq_file *seq, void *v)
{
}
static int memtype_seq_show(struct seq_file *seq, void *v)
{
struct memtype *print_entry = (struct memtype *)v;
seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
print_entry->start, print_entry->end);
kfree(print_entry);
return 0;
}
static const struct seq_operations memtype_seq_ops = {
.start = memtype_seq_start,
.next = memtype_seq_next,
.stop = memtype_seq_stop,
.show = memtype_seq_show,
};
static int memtype_seq_open(struct inode *inode, struct file *file)
{
return seq_open(file, &memtype_seq_ops);
}
static const struct file_operations memtype_fops = {
.open = memtype_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static int __init pat_memtype_list_init(void)
{
if (pat_enabled) {
debugfs_create_file("pat_memtype_list", S_IRUSR,
arch_debugfs_dir, NULL, &memtype_fops);
}
return 0;
}
late_initcall(pat_memtype_list_init);
#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */

View File

@@ -0,0 +1,46 @@
#ifndef __PAT_INTERNAL_H_
#define __PAT_INTERNAL_H_
extern int pat_debug_enable;
#define dprintk(fmt, arg...) \
do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
struct memtype {
u64 start;
u64 end;
u64 subtree_max_end;
unsigned long type;
struct rb_node rb;
};
static inline char *cattr_name(unsigned long flags)
{
switch (flags & _PAGE_CACHE_MASK) {
case _PAGE_CACHE_UC: return "uncached";
case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
case _PAGE_CACHE_WB: return "write-back";
case _PAGE_CACHE_WC: return "write-combining";
default: return "broken";
}
}
#ifdef CONFIG_X86_PAT
extern int rbt_memtype_check_insert(struct memtype *new,
unsigned long *new_type);
extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
extern struct memtype *rbt_memtype_lookup(u64 addr);
extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
#else
static inline int rbt_memtype_check_insert(struct memtype *new,
unsigned long *new_type)
{ return 0; }
static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
{ return NULL; }
static inline struct memtype *rbt_memtype_lookup(u64 addr)
{ return NULL; }
static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
{ return 0; }
#endif
#endif /* __PAT_INTERNAL_H_ */

247
arch/x86/mm/pat_rbtree.c Normal file
View File

@@ -0,0 +1,247 @@
/*
* Handle caching attributes in page tables (PAT)
*
* Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* Suresh B Siddha <suresh.b.siddha@intel.com>
*
* Interval tree (augmented rbtree) used to store the PAT memory type
* reservations.
*/
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree_augmented.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <asm/pgtable.h>
#include <asm/pat.h>
#include "pat_internal.h"
/*
* The memtype tree keeps track of memory type for specific
* physical memory areas. Without proper tracking, conflicting memory
* types in different mappings can cause CPU cache corruption.
*
* The tree is an interval tree (augmented rbtree) with tree ordered
* on starting address. Tree can contain multiple entries for
* different regions which overlap. All the aliases have the same
* cache attributes of course.
*
* memtype_lock protects the rbtree.
*/
static struct rb_root memtype_rbroot = RB_ROOT;
static int is_node_overlap(struct memtype *node, u64 start, u64 end)
{
if (node->start >= end || node->end <= start)
return 0;
return 1;
}
static u64 get_subtree_max_end(struct rb_node *node)
{
u64 ret = 0;
if (node) {
struct memtype *data = container_of(node, struct memtype, rb);
ret = data->subtree_max_end;
}
return ret;
}
static u64 compute_subtree_max_end(struct memtype *data)
{
u64 max_end = data->end, child_max_end;
child_max_end = get_subtree_max_end(data->rb.rb_right);
if (child_max_end > max_end)
max_end = child_max_end;
child_max_end = get_subtree_max_end(data->rb.rb_left);
if (child_max_end > max_end)
max_end = child_max_end;
return max_end;
}
RB_DECLARE_CALLBACKS(static, memtype_rb_augment_cb, struct memtype, rb,
u64, subtree_max_end, compute_subtree_max_end)
/* Find the first (lowest start addr) overlapping range from rb tree */
static struct memtype *memtype_rb_lowest_match(struct rb_root *root,
u64 start, u64 end)
{
struct rb_node *node = root->rb_node;
struct memtype *last_lower = NULL;
while (node) {
struct memtype *data = container_of(node, struct memtype, rb);
if (get_subtree_max_end(node->rb_left) > start) {
/* Lowest overlap if any must be on left side */
node = node->rb_left;
} else if (is_node_overlap(data, start, end)) {
last_lower = data;
break;
} else if (start >= data->start) {
/* Lowest overlap if any must be on right side */
node = node->rb_right;
} else {
break;
}
}
return last_lower; /* Returns NULL if there is no overlap */
}
static struct memtype *memtype_rb_exact_match(struct rb_root *root,
u64 start, u64 end)
{
struct memtype *match;
match = memtype_rb_lowest_match(root, start, end);
while (match != NULL && match->start < end) {
struct rb_node *node;
if (match->start == start && match->end == end)
return match;
node = rb_next(&match->rb);
if (node)
match = container_of(node, struct memtype, rb);
else
match = NULL;
}
return NULL; /* Returns NULL if there is no exact match */
}
static int memtype_rb_check_conflict(struct rb_root *root,
u64 start, u64 end,
unsigned long reqtype, unsigned long *newtype)
{
struct rb_node *node;
struct memtype *match;
int found_type = reqtype;
match = memtype_rb_lowest_match(&memtype_rbroot, start, end);
if (match == NULL)
goto success;
if (match->type != found_type && newtype == NULL)
goto failure;
dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
found_type = match->type;
node = rb_next(&match->rb);
while (node) {
match = container_of(node, struct memtype, rb);
if (match->start >= end) /* Checked all possible matches */
goto success;
if (is_node_overlap(match, start, end) &&
match->type != found_type) {
goto failure;
}
node = rb_next(&match->rb);
}
success:
if (newtype)
*newtype = found_type;
return 0;
failure:
printk(KERN_INFO "%s:%d conflicting memory types "
"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
end, cattr_name(found_type), cattr_name(match->type));
return -EBUSY;
}
static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata)
{
struct rb_node **node = &(root->rb_node);
struct rb_node *parent = NULL;
while (*node) {
struct memtype *data = container_of(*node, struct memtype, rb);
parent = *node;
if (data->subtree_max_end < newdata->end)
data->subtree_max_end = newdata->end;
if (newdata->start <= data->start)
node = &((*node)->rb_left);
else if (newdata->start > data->start)
node = &((*node)->rb_right);
}
newdata->subtree_max_end = newdata->end;
rb_link_node(&newdata->rb, parent, node);
rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb);
}
int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
{
int err = 0;
err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end,
new->type, ret_type);
if (!err) {
if (ret_type)
new->type = *ret_type;
new->subtree_max_end = new->end;
memtype_rb_insert(&memtype_rbroot, new);
}
return err;
}
struct memtype *rbt_memtype_erase(u64 start, u64 end)
{
struct memtype *data;
data = memtype_rb_exact_match(&memtype_rbroot, start, end);
if (!data)
goto out;
rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb);
out:
return data;
}
struct memtype *rbt_memtype_lookup(u64 addr)
{
struct memtype *data;
data = memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE);
return data;
}
#if defined(CONFIG_DEBUG_FS)
int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
{
struct rb_node *node;
int i = 1;
node = rb_first(&memtype_rbroot);
while (node && pos != i) {
node = rb_next(node);
i++;
}
if (node) { /* pos == i */
struct memtype *this = container_of(node, struct memtype, rb);
*out = *this;
return 0;
} else {
return 1;
}
}
#endif

532
arch/x86/mm/pf_in.c Normal file
View File

@@ -0,0 +1,532 @@
/*
* Fault Injection Test harness (FI)
* Copyright (C) Intel Crop.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*
*/
/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp
* Copyright by Intel Crop., 2002
* Louis Zhuang (louis.zhuang@intel.com)
*
* Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007
*/
#include <linux/module.h>
#include <linux/ptrace.h> /* struct pt_regs */
#include "pf_in.h"
#ifdef __i386__
/* IA32 Manual 3, 2-1 */
static unsigned char prefix_codes[] = {
0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
0x65, 0x66, 0x67
};
/* IA32 Manual 3, 3-432*/
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
/* IA32 Manual 3, 3-432*/
static unsigned int rw8[] = { 0x88, 0x8A, 0xC6, 0xAA };
static unsigned int rw32[] = {
0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F, 0xAA };
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
static unsigned int mw32[] = { 0x89, 0x8B, 0xC7, 0xAB };
static unsigned int mw64[] = {};
#else /* not __i386__ */
static unsigned char prefix_codes[] = {
0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36,
0xF0, 0xF3, 0xF2,
/* REX Prefixes */
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
};
/* AMD64 Manual 3, Appendix A*/
static unsigned int reg_rop[] = {
0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F
};
static unsigned int reg_wop[] = { 0x88, 0x89, 0xAA, 0xAB };
static unsigned int imm_wop[] = { 0xC6, 0xC7 };
static unsigned int rw8[] = { 0xC6, 0x88, 0x8A, 0xAA };
static unsigned int rw32[] = {
0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F, 0xAB
};
/* 8 bit only */
static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F, 0xAA };
/* 16 bit only */
static unsigned int mw16[] = { 0xB70F, 0xBF0F };
/* 16 or 32 bit */
static unsigned int mw32[] = { 0xC7 };
/* 16, 32 or 64 bit */
static unsigned int mw64[] = { 0x89, 0x8B, 0xAB };
#endif /* not __i386__ */
struct prefix_bits {
unsigned shorted:1;
unsigned enlarged:1;
unsigned rexr:1;
unsigned rex:1;
};
static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
{
int i;
unsigned char *p = addr;
prf->shorted = 0;
prf->enlarged = 0;
prf->rexr = 0;
prf->rex = 0;
restart:
for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
if (*p == prefix_codes[i]) {
if (*p == 0x66)
prf->shorted = 1;
#ifdef __amd64__
if ((*p & 0xf8) == 0x48)
prf->enlarged = 1;
if ((*p & 0xf4) == 0x44)
prf->rexr = 1;
if ((*p & 0xf0) == 0x40)
prf->rex = 1;
#endif
p++;
goto restart;
}
}
return (p - addr);
}
static int get_opcode(unsigned char *addr, unsigned int *opcode)
{
int len;
if (*addr == 0x0F) {
/* 0x0F is extension instruction */
*opcode = *(unsigned short *)addr;
len = 2;
} else {
*opcode = *addr;
len = 1;
}
return len;
}
#define CHECK_OP_TYPE(opcode, array, type) \
for (i = 0; i < ARRAY_SIZE(array); i++) { \
if (array[i] == opcode) { \
rv = type; \
goto exit; \
} \
}
enum reason_type get_ins_type(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
enum reason_type rv = OTHERS;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE);
CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE);
exit:
return rv;
}
#undef CHECK_OP_TYPE
static unsigned int get_ins_reg_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(rw8); i++)
if (rw8[i] == opcode)
return 1;
for (i = 0; i < ARRAY_SIZE(rw32); i++)
if (rw32[i] == opcode)
return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
}
unsigned int get_ins_mem_width(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(mw8); i++)
if (mw8[i] == opcode)
return 1;
for (i = 0; i < ARRAY_SIZE(mw16); i++)
if (mw16[i] == opcode)
return 2;
for (i = 0; i < ARRAY_SIZE(mw32); i++)
if (mw32[i] == opcode)
return prf.shorted ? 2 : 4;
for (i = 0; i < ARRAY_SIZE(mw64); i++)
if (mw64[i] == opcode)
return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
return 0;
}
/*
* Define register ident in mod/rm byte.
* Note: these are NOT the same as in ptrace-abi.h.
*/
enum {
arg_AL = 0,
arg_CL = 1,
arg_DL = 2,
arg_BL = 3,
arg_AH = 4,
arg_CH = 5,
arg_DH = 6,
arg_BH = 7,
arg_AX = 0,
arg_CX = 1,
arg_DX = 2,
arg_BX = 3,
arg_SP = 4,
arg_BP = 5,
arg_SI = 6,
arg_DI = 7,
#ifdef __amd64__
arg_R8 = 8,
arg_R9 = 9,
arg_R10 = 10,
arg_R11 = 11,
arg_R12 = 12,
arg_R13 = 13,
arg_R14 = 14,
arg_R15 = 15
#endif
};
static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
{
unsigned char *rv = NULL;
switch (no) {
case arg_AL:
rv = (unsigned char *)&regs->ax;
break;
case arg_BL:
rv = (unsigned char *)&regs->bx;
break;
case arg_CL:
rv = (unsigned char *)&regs->cx;
break;
case arg_DL:
rv = (unsigned char *)&regs->dx;
break;
#ifdef __amd64__
case arg_R8:
rv = (unsigned char *)&regs->r8;
break;
case arg_R9:
rv = (unsigned char *)&regs->r9;
break;
case arg_R10:
rv = (unsigned char *)&regs->r10;
break;
case arg_R11:
rv = (unsigned char *)&regs->r11;
break;
case arg_R12:
rv = (unsigned char *)&regs->r12;
break;
case arg_R13:
rv = (unsigned char *)&regs->r13;
break;
case arg_R14:
rv = (unsigned char *)&regs->r14;
break;
case arg_R15:
rv = (unsigned char *)&regs->r15;
break;
#endif
default:
break;
}
if (rv)
return rv;
if (rex) {
/*
* If REX prefix exists, access low bytes of SI etc.
* instead of AH etc.
*/
switch (no) {
case arg_SI:
rv = (unsigned char *)&regs->si;
break;
case arg_DI:
rv = (unsigned char *)&regs->di;
break;
case arg_BP:
rv = (unsigned char *)&regs->bp;
break;
case arg_SP:
rv = (unsigned char *)&regs->sp;
break;
default:
break;
}
} else {
switch (no) {
case arg_AH:
rv = 1 + (unsigned char *)&regs->ax;
break;
case arg_BH:
rv = 1 + (unsigned char *)&regs->bx;
break;
case arg_CH:
rv = 1 + (unsigned char *)&regs->cx;
break;
case arg_DH:
rv = 1 + (unsigned char *)&regs->dx;
break;
default:
break;
}
}
if (!rv)
printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
return rv;
}
static unsigned long *get_reg_w32(int no, struct pt_regs *regs)
{
unsigned long *rv = NULL;
switch (no) {
case arg_AX:
rv = &regs->ax;
break;
case arg_BX:
rv = &regs->bx;
break;
case arg_CX:
rv = &regs->cx;
break;
case arg_DX:
rv = &regs->dx;
break;
case arg_SP:
rv = &regs->sp;
break;
case arg_BP:
rv = &regs->bp;
break;
case arg_SI:
rv = &regs->si;
break;
case arg_DI:
rv = &regs->di;
break;
#ifdef __amd64__
case arg_R8:
rv = &regs->r8;
break;
case arg_R9:
rv = &regs->r9;
break;
case arg_R10:
rv = &regs->r10;
break;
case arg_R11:
rv = &regs->r11;
break;
case arg_R12:
rv = &regs->r12;
break;
case arg_R13:
rv = &regs->r13;
break;
case arg_R14:
rv = &regs->r14;
break;
case arg_R15:
rv = &regs->r15;
break;
#endif
default:
printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
}
return rv;
}
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
{
unsigned int opcode;
int reg;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
if (reg_rop[i] == opcode)
goto do_work;
for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
if (reg_wop[i] == opcode)
goto do_work;
printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
"0x%02x\n", opcode);
goto err;
do_work:
/* for STOS, source register is fixed */
if (opcode == 0xAA || opcode == 0xAB) {
reg = arg_AX;
} else {
unsigned char mod_rm = *p;
reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
}
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *get_reg_w8(reg, prf.rex, regs);
case 2:
return *(unsigned short *)get_reg_w32(reg, regs);
case 4:
return *(unsigned int *)get_reg_w32(reg, regs);
#ifdef __amd64__
case 8:
return *(unsigned long *)get_reg_w32(reg, regs);
#endif
default:
printk(KERN_ERR "mmiotrace: Error width# %d\n", reg);
}
err:
return 0;
}
unsigned long get_ins_imm_val(unsigned long ins_addr)
{
unsigned int opcode;
unsigned char mod_rm;
unsigned char mod;
unsigned char *p;
struct prefix_bits prf;
int i;
p = (unsigned char *)ins_addr;
p += skip_prefix(p, &prf);
p += get_opcode(p, &opcode);
for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
if (imm_wop[i] == opcode)
goto do_work;
printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
"0x%02x\n", opcode);
goto err;
do_work:
mod_rm = *p;
mod = mod_rm >> 6;
p++;
switch (mod) {
case 0:
/* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */
/* AMD64: XXX Check for address size prefix? */
if ((mod_rm & 0x7) == 0x5)
p += 4;
break;
case 1:
p += 1;
break;
case 2:
p += 4;
break;
case 3:
default:
printk(KERN_ERR "mmiotrace: not a memory access instruction "
"at 0x%lx, rm_mod=0x%02x\n",
ins_addr, mod_rm);
}
switch (get_ins_reg_width(ins_addr)) {
case 1:
return *(unsigned char *)p;
case 2:
return *(unsigned short *)p;
case 4:
return *(unsigned int *)p;
#ifdef __amd64__
case 8:
return *(unsigned long *)p;
#endif
default:
printk(KERN_ERR "mmiotrace: Error: width.\n");
}
err:
return 0;
}

39
arch/x86/mm/pf_in.h Normal file
View File

@@ -0,0 +1,39 @@
/*
* Fault Injection Test harness (FI)
* Copyright (C) Intel Crop.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
* USA.
*
*/
#ifndef __PF_H_
#define __PF_H_
enum reason_type {
NOT_ME, /* page fault is not in regions */
NOTHING, /* access others point in regions */
REG_READ, /* read from addr to reg */
REG_WRITE, /* write from reg to addr */
IMM_WRITE, /* write from imm to addr */
OTHERS /* Other instructions can not intercept */
};
enum reason_type get_ins_type(unsigned long ins_addr);
unsigned int get_ins_mem_width(unsigned long ins_addr);
unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs);
unsigned long get_ins_imm_val(unsigned long ins_addr);
#endif /* __PF_H_ */

465
arch/x86/mm/pgtable.c Normal file
View File

@@ -0,0 +1,465 @@
#include <linux/mm.h>
#include <linux/gfp.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
#else
#define PGALLOC_USER_GFP 0
#endif
gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
return (pte_t *)__get_free_page(PGALLOC_GFP);
}
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
struct page *pte;
pte = alloc_pages(__userpte_alloc_gfp, 0);
if (pte)
pgtable_page_ctor(pte);
return pte;
}
static int __init setup_userpte(char *arg)
{
if (!arg)
return -EINVAL;
/*
* "userpte=nohigh" disables allocation of user pagetables in
* high memory.
*/
if (strcmp(arg, "nohigh") == 0)
__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
else
return -EINVAL;
return 0;
}
early_param("userpte", setup_userpte);
void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
#if PAGETABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
/*
* NOTE! For PAE, any changes to the top page-directory-pointer-table
* entries need a full cr3 reload to flush.
*/
#ifdef CONFIG_X86_PAE
tlb->need_flush_all = 1;
#endif
tlb_remove_page(tlb, virt_to_page(pmd));
}
#if PAGETABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_add(&page->lru, &pgd_list);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
list_del(&page->lru);
}
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
{
BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
virt_to_page(pgd)->index = (pgoff_t)mm;
}
struct mm_struct *pgd_page_get_mm(struct page *page)
{
return (struct mm_struct *)page->index;
}
static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
{
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (PAGETABLE_LEVELS == 2 ||
(PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
PAGETABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
if (!SHARED_KERNEL_PMD) {
pgd_set_mm(pgd, mm);
pgd_list_add(pgd);
}
}
static void pgd_dtor(pgd_t *pgd)
{
if (SHARED_KERNEL_PMD)
return;
spin_lock(&pgd_lock);
pgd_list_del(pgd);
spin_unlock(&pgd_lock);
}
/*
* List of all pgd's needed for non-PAE so it can invalidate entries
* in both cached and uncached pgd's; not needed for PAE since the
* kernel pmd is shared. If PAE were not to share the pmd a similar
* tactic would be needed. This is essentially codepath-based locking
* against pageattr.c; it is the unique case in which a valid change
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
* vmalloc faults work because attached pagetables are never freed.
* -- nyc
*/
#ifdef CONFIG_X86_PAE
/*
* In PAE mode, we need to do a cr3 reload (=tlb flush) when
* updating the top-level pagetable entries to guarantee the
* processor notices the update. Since this is expensive, and
* all 4 top-level entries are used almost immediately in a
* new process's life, we just pre-populate them here.
*
* Also, if we're in a paravirt environment where the kernel pmd is
* not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
* and initialize the kernel pmds here.
*/
#define PREALLOCATED_PMDS UNSHARED_PTRS_PER_PGD
void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
/* Note: almost everything apart from _PAGE_PRESENT is
reserved at the pmd (PDPT) level. */
set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
/*
* According to Intel App note "TLBs, Paging-Structure Caches,
* and Their Invalidation", April 2007, document 317080-001,
* section 8.1: in PAE mode we explicitly have to flush the
* TLB via cr3 if the top-level pgd is changed...
*/
flush_tlb_mm(mm);
}
#else /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
#define PREALLOCATED_PMDS 0
#endif /* CONFIG_X86_PAE */
static void free_pmds(pmd_t *pmds[])
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++)
if (pmds[i])
free_page((unsigned long)pmds[i]);
}
static int preallocate_pmds(pmd_t *pmds[])
{
int i;
bool failed = false;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
if (pmd == NULL)
failed = true;
pmds[i] = pmd;
}
if (failed) {
free_pmds(pmds);
return -ENOMEM;
}
return 0;
}
/*
* Mop up any pmd pages which may still be attached to the pgd.
* Normally they will be freed by munmap/exit_mmap, but any pmd we
* preallocate which never got a corresponding vma will need to be
* freed manually.
*/
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
int i;
for(i = 0; i < PREALLOCATED_PMDS; i++) {
pgd_t pgd = pgdp[i];
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
pgdp[i] = native_make_pgd(0);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
}
}
}
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
pud_t *pud;
unsigned long addr;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
pud = pud_offset(pgd, 0);
for (addr = i = 0; i < PREALLOCATED_PMDS;
i++, pud++, addr += PUD_SIZE) {
pmd_t *pmd = pmds[i];
if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
pud_populate(mm, pud, pmd);
}
}
pgd_t *pgd_alloc(struct mm_struct *mm)
{
pgd_t *pgd;
pmd_t *pmds[PREALLOCATED_PMDS];
pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
if (pgd == NULL)
goto out;
mm->pgd = pgd;
if (preallocate_pmds(pmds) != 0)
goto out_free_pgd;
if (paravirt_pgd_alloc(mm) != 0)
goto out_free_pmds;
/*
* Make sure that pre-populating the pmds is atomic with
* respect to anything walking the pgd_list, so that they
* never see a partially populated pgd.
*/
spin_lock(&pgd_lock);
pgd_ctor(mm, pgd);
pgd_prepopulate_pmd(mm, pgd, pmds);
spin_unlock(&pgd_lock);
return pgd;
out_free_pmds:
free_pmds(pmds);
out_free_pgd:
free_page((unsigned long)pgd);
out:
return NULL;
}
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
pgd_mop_up_pmds(mm, pgd);
pgd_dtor(pgd);
paravirt_pgd_free(mm, pgd);
free_page((unsigned long)pgd);
}
/*
* Used to set accessed or dirty bits in the page table entries
* on other architectures. On x86, the accessed and dirty bits
* are tracked by hardware. However, do_wp_page calls this function
* to also make the pte writeable at the same time the dirty bit is
* set. In that case we do actually need to write the PTE.
*/
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
}
return changed;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
int changed = !pmd_same(*pmdp, entry);
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
*pmdp = entry;
pmd_update_defer(vma->vm_mm, address, pmdp);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
* #PF is architecturally guaranteed to do that and in the
* worst-case we'll generate a spurious fault.
*/
}
return changed;
}
#endif
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
int ret = 0;
if (pte_young(*ptep))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *) &ptep->pte);
if (ret)
pte_update(vma->vm_mm, addr, ptep);
return ret;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
int ret = 0;
if (pmd_young(*pmdp))
ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
(unsigned long *)pmdp);
if (ret)
pmd_update(vma->vm_mm, addr, pmdp);
return ret;
}
#endif
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
int young;
young = ptep_test_and_clear_young(vma, address, ptep);
if (young)
flush_tlb_page(vma, address);
return young;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
int young;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
young = pmdp_test_and_clear_young(vma, address, pmdp);
if (young)
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return young;
}
void pmdp_splitting_flush(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
int set;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
(unsigned long *)pmdp);
if (set) {
pmd_update(vma->vm_mm, address, pmdp);
/* need tlb flush only to serialize against gup-fast */
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
}
#endif
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
*
* Can be used to relocate the fixmap area and poke a hole in the top
* of kernel address space to make room for a hypervisor.
*/
void __init reserve_top_address(unsigned long reserve)
{
#ifdef CONFIG_X86_32
BUG_ON(fixmaps_set > 0);
printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
(int)-reserve);
__FIXADDR_TOP = -reserve - PAGE_SIZE;
#endif
}
int fixmaps_set;
void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
}
set_pte_vaddr(address, pte);
fixmaps_set++;
}
void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
pgprot_t flags)
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}

133
arch/x86/mm/pgtable_32.c Normal file
View File

@@ -0,0 +1,133 @@
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
/*
* Associate a virtual page frame with a given physical page frame
* and protection flags for that frame.
*/
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
if (pud_none(*pud)) {
BUG();
return;
}
pmd = pmd_offset(pud, vaddr);
if (pmd_none(*pmd)) {
BUG();
return;
}
pte = pte_offset_kernel(pmd, vaddr);
if (pte_val(pteval))
set_pte_at(&init_mm, vaddr, pte, pteval);
else
pte_clear(&init_mm, vaddr, pte);
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
/*
* Associate a large virtual page frame with a given physical page frame
* and protection flags for that frame. pfn is for the base of the page,
* vaddr is what the page gets mapped to - both must be properly aligned.
* The pmd must already be instantiated. Assumes PAE mode.
*/
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
if (vaddr & (PMD_SIZE-1)) { /* vaddr is misaligned */
printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
return; /* BUG(); */
}
if (pfn & (PTRS_PER_PTE-1)) { /* pfn is misaligned */
printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
return; /* BUG(); */
}
pgd = swapper_pg_dir + pgd_index(vaddr);
if (pgd_none(*pgd)) {
printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
return; /* BUG(); */
}
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
set_pmd(pmd, pfn_pmd(pfn, flags));
/*
* It's enough to flush this one mapping.
* (PGE mappings get flushed as well)
*/
__flush_tlb_one(vaddr);
}
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
/*
* vmalloc=size forces the vmalloc area to be exactly 'size'
* bytes. This can be used to increase (or decrease) the
* vmalloc area - the default is 128m.
*/
static int __init parse_vmalloc(char *arg)
{
if (!arg)
return -EINVAL;
/* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
__VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
return 0;
}
early_param("vmalloc", parse_vmalloc);
/*
* reservetop=size reserves a hole at the top of the kernel address space which
* a hypervisor can load into later. Needed for dynamically loaded hypervisors,
* so relocating the fixmap can be done before paging initialization.
*/
static int __init parse_reservetop(char *arg)
{
unsigned long address;
if (!arg)
return -EINVAL;
address = memparse(arg, &arg);
reserve_top_address(address);
fixup_early_ioremap();
return 0;
}
early_param("reservetop", parse_reservetop);

98
arch/x86/mm/physaddr.c Normal file
View File

@@ -0,0 +1,98 @@
#include <linux/bootmem.h>
#include <linux/mmdebug.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <asm/page.h>
#include "physaddr.h"
#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
x = y + phys_base;
VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
} else {
x = y + (__START_KERNEL_map - PAGE_OFFSET);
/* carry flag will be set if starting x was >= PAGE_OFFSET */
VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x));
}
return x;
}
EXPORT_SYMBOL(__phys_addr);
unsigned long __phys_addr_symbol(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
/* only check upper bounds since lower bounds will trigger carry */
VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE);
return y + phys_base;
}
EXPORT_SYMBOL(__phys_addr_symbol);
#endif
bool __virt_addr_valid(unsigned long x)
{
unsigned long y = x - __START_KERNEL_map;
/* use the carry flag to determine if x was < __START_KERNEL_map */
if (unlikely(x > y)) {
x = y + phys_base;
if (y >= KERNEL_IMAGE_SIZE)
return false;
} else {
x = y + (__START_KERNEL_map - PAGE_OFFSET);
/* carry flag will be set if starting x was >= PAGE_OFFSET */
if ((x > y) || !phys_addr_valid(x))
return false;
}
return pfn_valid(x >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);
#else
#ifdef CONFIG_DEBUG_VIRTUAL
unsigned long __phys_addr(unsigned long x)
{
unsigned long phys_addr = x - PAGE_OFFSET;
/* VMALLOC_* aren't constants */
VIRTUAL_BUG_ON(x < PAGE_OFFSET);
VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
/* max_low_pfn is set early, but not _that_ early */
if (max_low_pfn) {
VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn);
BUG_ON(slow_virt_to_phys((void *)x) != phys_addr);
}
return phys_addr;
}
EXPORT_SYMBOL(__phys_addr);
#endif
bool __virt_addr_valid(unsigned long x)
{
if (x < PAGE_OFFSET)
return false;
if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
return false;
if (x >= FIXADDR_START)
return false;
return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__virt_addr_valid);
#endif /* CONFIG_X86_64 */

10
arch/x86/mm/physaddr.h Normal file
View File

@@ -0,0 +1,10 @@
#include <asm/processor.h>
static inline int phys_addr_valid(resource_size_t addr)
{
#ifdef CONFIG_PHYS_ADDR_T_64BIT
return !(addr >> boot_cpu_data.x86_phys_bits);
#else
return 1;
#endif
}

60
arch/x86/mm/setup_nx.c Normal file
View File

@@ -0,0 +1,60 @@
#include <linux/spinlock.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
static int disable_nx __cpuinitdata;
/*
* noexec = on|off
*
* Control non-executable mappings for processes.
*
* on Enable
* off Disable
*/
static int __init noexec_setup(char *str)
{
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
disable_nx = 1;
}
x86_configure_nx();
return 0;
}
early_param("noexec", noexec_setup);
void __cpuinit x86_configure_nx(void)
{
if (cpu_has_nx && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
else
__supported_pte_mask &= ~_PAGE_NX;
}
void __init x86_report_nx(void)
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"missing in CPU!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
printk(KERN_INFO "NX (Execute Disable) protection: "
"disabled by kernel command line option\n");
} else {
printk(KERN_INFO "NX (Execute Disable) protection: "
"active\n");
}
#else
/* 32bit non-PAE kernel, NX cannot be used */
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"cannot be enabled: non-PAE kernel!\n");
#endif
}
}

198
arch/x86/mm/srat.c Normal file
View File

@@ -0,0 +1,198 @@
/*
* ACPI 3.0 based NUMA setup
* Copyright 2004 Andi Kleen, SuSE Labs.
*
* Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
*
* Called from acpi_numa_init while reading the SRAT and SLIT tables.
* Assumes all memory regions belonging to a single proximity domain
* are in one chunk. Holes between them will be included in the node.
*/
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
int acpi_numa __initdata;
static __init int setup_node(int pxm)
{
return acpi_map_pxm_to_node(pxm);
}
static __init void bad_srat(void)
{
printk(KERN_ERR "SRAT: SRAT not used.\n");
acpi_numa = -1;
}
static __init inline int srat_disabled(void)
{
return acpi_numa < 0;
}
/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
int i, j;
for (i = 0; i < slit->locality_count; i++)
for (j = 0; j < slit->locality_count; j++)
numa_set_distance(pxm_to_node(i), pxm_to_node(j),
slit->entry[slit->locality_count * i + j]);
}
/* Callback for Proximity Domain -> x2APIC mapping */
void __init
acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
{
int pxm, node;
int apic_id;
if (srat_disabled())
return;
if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
bad_srat();
return;
}
if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return;
pxm = pa->proximity_domain;
apic_id = pa->apic_id;
if (!apic->apic_id_valid(apic_id)) {
printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n",
pxm, apic_id);
return;
}
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
if (apic_id >= MAX_LOCAL_APIC) {
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
return;
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
pxm, apic_id, node);
}
/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
int pxm, node;
int apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
bad_srat();
return;
}
if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
return;
pxm = pa->proximity_domain_lo;
if (acpi_srat_revision >= 2)
pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
bad_srat();
return;
}
if (get_uv_system_type() >= UV_X2APIC)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
if (apic_id >= MAX_LOCAL_APIC) {
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
return;
}
set_apicid_to_node(apic_id, node);
node_set(node, numa_nodes_parsed);
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
pxm, apic_id, node);
}
#ifdef CONFIG_MEMORY_HOTPLUG
static inline int save_add_info(void) {return 1;}
#else
static inline int save_add_info(void) {return 0;}
#endif
/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
int __init
acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
{
u64 start, end;
int node, pxm;
if (srat_disabled())
goto out_err;
if (ma->header.length != sizeof(struct acpi_srat_mem_affinity))
goto out_err_bad_srat;
if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
goto out_err;
if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
goto out_err;
start = ma->base_address;
end = start + ma->length;
pxm = ma->proximity_domain;
if (acpi_srat_revision <= 1)
pxm &= 0xff;
node = setup_node(pxm);
if (node < 0) {
printk(KERN_ERR "SRAT: Too many proximity domains.\n");
goto out_err_bad_srat;
}
if (numa_add_memblk(node, start, end) < 0)
goto out_err_bad_srat;
node_set(node, numa_nodes_parsed);
printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n",
node, pxm,
(unsigned long long) start, (unsigned long long) end - 1);
return 0;
out_err_bad_srat:
bad_srat();
out_err:
return -1;
}
void __init acpi_numa_arch_fixup(void) {}
int __init x86_acpi_numa_init(void)
{
int ret;
ret = acpi_numa_init();
if (ret < 0)
return ret;
return srat_disabled() ? -EINVAL : 0;
}

140
arch/x86/mm/testmmiotrace.c Normal file
View File

@@ -0,0 +1,140 @@
/*
* Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/io.h>
#include <linux/mmiotrace.h>
static unsigned long mmio_address;
module_param(mmio_address, ulong, 0);
MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
"(or 8 MB if read_far is non-zero).");
static unsigned long read_far = 0x400100;
module_param(read_far, ulong, 0);
MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
"(default: 0x400100).");
static unsigned v16(unsigned i)
{
return i * 12 + 7;
}
static unsigned v32(unsigned i)
{
return i * 212371 + 13;
}
static void do_write_test(void __iomem *p)
{
unsigned int i;
pr_info("write test.\n");
mmiotrace_printk("Write test.\n");
for (i = 0; i < 256; i++)
iowrite8(i, p + i);
for (i = 1024; i < (5 * 1024); i += 2)
iowrite16(v16(i), p + i);
for (i = (5 * 1024); i < (16 * 1024); i += 4)
iowrite32(v32(i), p + i);
}
static void do_read_test(void __iomem *p)
{
unsigned int i;
unsigned errs[3] = { 0 };
pr_info("read test.\n");
mmiotrace_printk("Read test.\n");
for (i = 0; i < 256; i++)
if (ioread8(p + i) != i)
++errs[0];
for (i = 1024; i < (5 * 1024); i += 2)
if (ioread16(p + i) != v16(i))
++errs[1];
for (i = (5 * 1024); i < (16 * 1024); i += 4)
if (ioread32(p + i) != v32(i))
++errs[2];
mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
errs[0], errs[1], errs[2]);
}
static void do_read_far_test(void __iomem *p)
{
pr_info("read far test.\n");
mmiotrace_printk("Read far test.\n");
ioread32(p + read_far);
}
static void do_test(unsigned long size)
{
void __iomem *p = ioremap_nocache(mmio_address, size);
if (!p) {
pr_err("could not ioremap, aborting.\n");
return;
}
mmiotrace_printk("ioremap returned %p.\n", p);
do_write_test(p);
do_read_test(p);
if (read_far && read_far < size - 4)
do_read_far_test(p);
iounmap(p);
}
/*
* Tests how mmiotrace behaves in face of multiple ioremap / iounmaps in
* a short time. We had a bug in deferred freeing procedure which tried
* to free this region multiple times (ioremap can reuse the same address
* for many mappings).
*/
static void do_test_bulk_ioremapping(void)
{
void __iomem *p;
int i;
for (i = 0; i < 10; ++i) {
p = ioremap_nocache(mmio_address, PAGE_SIZE);
if (p)
iounmap(p);
}
/* Force freeing. If it will crash we will know why. */
synchronize_rcu();
}
static int __init init(void)
{
unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
if (mmio_address == 0) {
pr_err("you have to use the module argument mmio_address.\n");
pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
return -ENXIO;
}
pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
"and writing 16 kB of rubbish in there.\n",
size >> 10, mmio_address);
do_test(size);
do_test_bulk_ioremapping();
pr_info("All done.\n");
return 0;
}
static void __exit cleanup(void)
{
pr_debug("unloaded.\n");
}
module_init(init);
module_exit(cleanup);
MODULE_LICENSE("GPL");

345
arch/x86/mm/tlb.c Normal file
View File

@@ -0,0 +1,345 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
= { &init_mm, 0, };
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
* writing to user space from interrupts. (Its not allowed anyway).
*
* Optimizations Manfred Spraul <manfred@colorfullife.com>
*
* More scalable flush, from Andi Kleen
*
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
struct flush_tlb_info {
struct mm_struct *flush_mm;
unsigned long flush_start;
unsigned long flush_end;
};
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
*/
void leave_mm(int cpu)
{
struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
load_cr3(swapper_pg_dir);
}
}
EXPORT_SYMBOL_GPL(leave_mm);
/*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
* 1) switch_mm() either 1a) or 1b)
* 1a) thread switch to a different mm
* 1a1) set cpu_tlbstate to TLBSTATE_OK
* Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
* if cpu0 was in lazy tlb mode.
* 1a2) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
* 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a4) change cr3.
* 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
* Stop ipi delivery for the old mm. This is not synchronized with
* the other cpus, but flush_tlb_func ignore flush ipis for the wrong
* mm, and in the worst case we perform a superfluous tlb flush.
* 1b) thread switch without mm change
* cpu active_mm is correct, cpu0 already handles flush ipis.
* 1b1) set cpu_tlbstate to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
* 1b3) if the bit was 0: leave_mm was called, flush the tlb.
* 2) switch %%esp, ie current
*
* The interrupt must handle 2 special cases:
* - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
* - the cpu performs speculative tlb reads, i.e. even if the cpu only
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
* The good news is that cpu_tlbstate is local to each cpu, no
* write/read ordering problems.
*/
/*
* TLB flush funcation:
* 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
* 2) Leave the mm if we are in the lazy tlb mode.
*/
static void flush_tlb_func(void *info)
{
struct flush_tlb_info *f = info;
inc_irq_stat(irq_tlb_count);
if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
return;
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
if (f->flush_end == TLB_FLUSH_ALL)
local_flush_tlb();
else if (!f->flush_end)
__flush_tlb_single(f->flush_start);
else {
unsigned long addr;
addr = f->flush_start;
while (addr < f->flush_end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
}
} else
leave_mm(smp_processor_id());
}
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm, unsigned long start,
unsigned long end)
{
struct flush_tlb_info info;
info.flush_mm = mm;
info.flush_start = start;
info.flush_end = end;
if (is_uv_system()) {
unsigned int cpu;
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
if (cpumask)
smp_call_function_many(cpumask, flush_tlb_func,
&info, 1);
return;
}
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
preempt_disable();
local_flush_tlb();
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
/*
* It can find out the THP large page, or
* HUGETLB page in tlb_flush when THP disabled
*/
static inline unsigned long has_large_page(struct mm_struct *mm,
unsigned long start, unsigned long end)
{
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
unsigned long addr = ALIGN(start, HPAGE_SIZE);
for (; addr < end; addr += HPAGE_SIZE) {
pgd = pgd_offset(mm, addr);
if (likely(!pgd_none(*pgd))) {
pud = pud_offset(pgd, addr);
if (likely(!pud_none(*pud))) {
pmd = pmd_offset(pud, addr);
if (likely(!pmd_none(*pmd)))
if (pmd_large(*pmd))
return addr;
}
}
}
return 0;
}
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag)
{
unsigned long addr;
unsigned act_entries, tlb_entries = 0;
preempt_disable();
if (current->active_mm != mm)
goto flush_all;
if (!current->mm) {
leave_mm(smp_processor_id());
goto flush_all;
}
if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
|| vmflag & VM_HUGETLB) {
local_flush_tlb();
goto flush_all;
}
/* In modern CPU, last level tlb used for both data/ins */
if (vmflag & VM_EXEC)
tlb_entries = tlb_lli_4k[ENTRIES];
else
tlb_entries = tlb_lld_4k[ENTRIES];
/* Assume all of TLB entries was occupied by this task */
act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
/* tlb_flushall_shift is on balance point, details in commit log */
if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
local_flush_tlb();
else {
if (has_large_page(mm, start, end)) {
local_flush_tlb();
goto flush_all;
}
/* flush range by one by one 'invlpg' */
for (addr = start; addr < end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
if (cpumask_any_but(mm_cpumask(mm),
smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, start, end);
preempt_enable();
return;
}
flush_all:
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
{
struct mm_struct *mm = vma->vm_mm;
preempt_disable();
if (current->active_mm == mm) {
if (current->mm)
__flush_tlb_one(start);
else
leave_mm(smp_processor_id());
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
preempt_enable();
}
static void do_flush_tlb_all(void *info)
{
__flush_tlb_all();
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(smp_processor_id());
}
void flush_tlb_all(void)
{
on_each_cpu(do_flush_tlb_all, NULL, 1);
}
static void do_kernel_range_flush(void *info)
{
struct flush_tlb_info *f = info;
unsigned long addr;
/* flush range by one by one 'invlpg' */
for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
__flush_tlb_single(addr);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
unsigned act_entries;
struct flush_tlb_info info;
/* In modern CPU, last level tlb used for both data/ins */
act_entries = tlb_lld_4k[ENTRIES];
/* Balance as user space task's flush, a bit conservative */
if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
on_each_cpu(do_flush_tlb_all, NULL, 1);
else {
info.flush_start = start;
info.flush_end = end;
on_each_cpu(do_kernel_range_flush, &info, 1);
}
}
#ifdef CONFIG_DEBUG_TLBFLUSH
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[32];
unsigned int len;
len = sprintf(buf, "%hd\n", tlb_flushall_shift);
return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}
static ssize_t tlbflush_write_file(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
ssize_t len;
s8 shift;
len = min(count, sizeof(buf) - 1);
if (copy_from_user(buf, user_buf, len))
return -EFAULT;
buf[len] = '\0';
if (kstrtos8(buf, 0, &shift))
return -EINVAL;
if (shift < -1 || shift >= BITS_PER_LONG)
return -EINVAL;
tlb_flushall_shift = shift;
return count;
}
static const struct file_operations fops_tlbflush = {
.read = tlbflush_read_file,
.write = tlbflush_write_file,
.llseek = default_llseek,
};
static int __init create_tlb_flushall_shift(void)
{
debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
arch_debugfs_dir, NULL, &fops_tlbflush);
return 0;
}
late_initcall(create_tlb_flushall_shift);
#endif