Initial commit; kernel source import

This commit is contained in:
Nathan
2025-04-06 23:50:55 -05:00
commit 25c6d769f4
45093 changed files with 18199410 additions and 0 deletions

53
arch/x86/xen/Kconfig Normal file
View File

@@ -0,0 +1,53 @@
#
# This Kconfig describes xen options
#
config XEN
bool "Xen guest support"
depends on PARAVIRT
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
depends on X86_64 || (X86_32 && X86_PAE && !X86_VISWS)
depends on X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
Xen hypervisor.
config XEN_DOM0
def_bool y
depends on XEN && PCI_XEN && SWIOTLB_XEN
depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
# name in tools.
config XEN_PRIVILEGED_GUEST
def_bool XEN_DOM0
config XEN_PVHVM
def_bool y
depends on XEN && PCI && X86_LOCAL_APIC
config XEN_MAX_DOMAIN_MEMORY
int
default 500 if X86_64
default 64 if X86_32
depends on XEN
help
This only affects the sizing of some bss arrays, the unused
portions of which are freed.
config XEN_SAVE_RESTORE
bool
depends on XEN
select HIBERNATE_CALLBACKS
default y
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
depends on XEN && DEBUG_FS
default n
help
Enable statistics output and various tuning options in debugfs.
Enabling this option may incur a significant performance overhead.

24
arch/x86/xen/Makefile Normal file
View File

@@ -0,0 +1,24 @@
ifdef CONFIG_FUNCTION_TRACER
# Do not profile debug and lowlevel utilities
CFLAGS_REMOVE_spinlock.o = -pg
CFLAGS_REMOVE_time.o = -pg
CFLAGS_REMOVE_irq.o = -pg
endif
# Make sure early boot has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_enlighten.o := $(nostackp)
CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
grant-table.o suspend.o platform-pci-unplug.o \
p2m.o
obj-$(CONFIG_EVENT_TRACING) += trace.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += apic.o vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o

34
arch/x86/xen/apic.c Normal file
View File

@@ -0,0 +1,34 @@
#include <linux/init.h>
#include <asm/x86_init.h>
#include <asm/apic.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
{
struct physdev_apic apic_op;
int ret;
apic_op.apic_physbase = mpc_ioapic_addr(apic);
apic_op.reg = reg;
ret = HYPERVISOR_physdev_op(PHYSDEVOP_apic_read, &apic_op);
if (!ret)
return apic_op.value;
/* fallback to return an emulated IO_APIC values */
if (reg == 0x1)
return 0x00170020;
else if (reg == 0x0)
return apic << 24;
return 0xfd;
}
void __init xen_init_apic(void)
{
x86_io_apic_ops.read = xen_io_apic_read;
}

21
arch/x86/xen/debugfs.c Normal file
View File

@@ -0,0 +1,21 @@
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/module.h>
#include "debugfs.h"
static struct dentry *d_xen_debug;
struct dentry * __init xen_init_debugfs(void)
{
if (!d_xen_debug) {
d_xen_debug = debugfs_create_dir("xen", NULL);
if (!d_xen_debug)
pr_warning("Could not create 'xen' debugfs directory\n");
}
return d_xen_debug;
}

6
arch/x86/xen/debugfs.h Normal file
View File

@@ -0,0 +1,6 @@
#ifndef _XEN_DEBUGFS_H
#define _XEN_DEBUGFS_H
struct dentry * __init xen_init_debugfs(void);
#endif /* _XEN_DEBUGFS_H */

1753
arch/x86/xen/enlighten.c Normal file

File diff suppressed because it is too large Load Diff

127
arch/x86/xen/grant-table.c Normal file
View File

@@ -0,0 +1,127 @@
/******************************************************************************
* grant_table.c
* x86 specific part
*
* Granting foreign access to our memory reservation.
*
* Copyright (c) 2005-2006, Christopher Clark
* Copyright (c) 2004-2005, K A Fraser
* Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
* VA Linux Systems Japan. Split out x86 specific part.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation; or, when distributed
* separately from the Linux kernel or incorporated into other
* software packages, subject to the following license:
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this source file (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use, copy, modify,
* merge, publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <xen/interface/xen.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <asm/pgtable.h>
static int map_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
unsigned long **frames = (unsigned long **)data;
set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
(*frames)++;
return 0;
}
/*
* This function is used to map shared frames to store grant status. It is
* different from map_pte_fn above, the frames type here is uint64_t.
*/
static int map_pte_fn_status(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
uint64_t **frames = (uint64_t **)data;
set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
(*frames)++;
return 0;
}
static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
unsigned long addr, void *data)
{
set_pte_at(&init_mm, addr, pte, __pte(0));
return 0;
}
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
void **__shared)
{
int rc;
void *shared = *__shared;
if (shared == NULL) {
struct vm_struct *area =
alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
BUG_ON(area == NULL);
shared = area->addr;
*__shared = shared;
}
rc = apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes,
map_pte_fn, &frames);
return rc;
}
int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
grant_status_t **__shared)
{
int rc;
grant_status_t *shared = *__shared;
if (shared == NULL) {
/* No need to pass in PTE as we are going to do it
* in apply_to_page_range anyhow. */
struct vm_struct *area =
alloc_vm_area(PAGE_SIZE * max_nr_gframes, NULL);
BUG_ON(area == NULL);
shared = area->addr;
*__shared = shared;
}
rc = apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes,
map_pte_fn_status, &frames);
return rc;
}
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
apply_to_page_range(&init_mm, (unsigned long)shared,
PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
}

134
arch/x86/xen/irq.c Normal file
View File

@@ -0,0 +1,134 @@
#include <linux/hardirq.h>
#include <asm/x86_init.h>
#include <xen/interface/xen.h>
#include <xen/interface/sched.h>
#include <xen/interface/vcpu.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include "xen-ops.h"
/*
* Force a proper event-channel callback from Xen after clearing the
* callback mask. We do this in a very simple manner, by making a call
* down into Xen. The pending flag will be checked by Xen on return.
*/
void xen_force_evtchn_callback(void)
{
(void)HYPERVISOR_xen_version(0, NULL);
}
static unsigned long xen_save_fl(void)
{
struct vcpu_info *vcpu;
unsigned long flags;
vcpu = this_cpu_read(xen_vcpu);
/* flag has opposite sense of mask */
flags = !vcpu->evtchn_upcall_mask;
/* convert to IF type flag
-0 -> 0x00000000
-1 -> 0xffffffff
*/
return (-flags) & X86_EFLAGS_IF;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
static void xen_restore_fl(unsigned long flags)
{
struct vcpu_info *vcpu;
/* convert from IF type flag */
flags = !(flags & X86_EFLAGS_IF);
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = flags;
preempt_enable_no_resched();
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
if (flags == 0) {
preempt_check_resched();
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
}
PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
static void xen_irq_disable(void)
{
/* There's a one instruction preempt window here. We need to
make sure we're don't switch CPUs between getting the vcpu
pointer and updating the mask. */
preempt_disable();
this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
preempt_enable_no_resched();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
static void xen_irq_enable(void)
{
struct vcpu_info *vcpu;
/* We don't need to worry about being preempted here, since
either a) interrupts are disabled, so no preemption, or b)
the caller is confused and is trying to re-enable interrupts
on an indeterminate processor. */
vcpu = this_cpu_read(xen_vcpu);
vcpu->evtchn_upcall_mask = 0;
/* Doesn't matter if we get preempted here, because any
pending event will get dealt with anyway. */
barrier(); /* unmask then check (avoid races) */
if (unlikely(vcpu->evtchn_upcall_pending))
xen_force_evtchn_callback();
}
PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
static void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
BUG();
}
static void xen_halt(void)
{
if (irqs_disabled())
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
else
xen_safe_halt();
}
static const struct pv_irq_ops xen_irq_ops __initconst = {
.save_fl = PV_CALLEE_SAVE(xen_save_fl),
.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
.adjust_exception_frame = xen_adjust_exception_frame,
#endif
};
void __init xen_init_irq_ops(void)
{
pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
}

2565
arch/x86/xen/mmu.c Normal file

File diff suppressed because it is too large Load Diff

26
arch/x86/xen/mmu.h Normal file
View File

@@ -0,0 +1,26 @@
#ifndef _XEN_MMU_H
#include <linux/linkage.h>
#include <asm/page.h>
enum pt_level {
PT_PGD,
PT_PUD,
PT_PMD,
PT_PTE
};
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
unsigned long xen_read_cr2_direct(void);
extern void xen_init_mmu_ops(void);
extern void xen_hvm_init_mmu_ops(void);
#endif /* _XEN_MMU_H */

208
arch/x86/xen/multicalls.c Normal file
View File

@@ -0,0 +1,208 @@
/*
* Xen hypercall batching.
*
* Xen allows multiple hypercalls to be issued at once, using the
* multicall interface. This allows the cost of trapping into the
* hypervisor to be amortized over several calls.
*
* This file implements a simple interface for multicalls. There's a
* per-cpu buffer of outstanding multicalls. When you want to queue a
* multicall for issuing, you can allocate a multicall slot for the
* call and its arguments, along with storage for space which is
* pointed to by the arguments (for passing pointers to structures,
* etc). When the multicall is actually issued, all the space for the
* commands and allocated memory is freed for reuse.
*
* Multicalls are flushed whenever any of the buffers get full, or
* when explicitly requested. There's no way to get per-multicall
* return results back. It will BUG if any of the multicalls fail.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/debugfs.h>
#include <asm/xen/hypercall.h>
#include "multicalls.h"
#include "debugfs.h"
#define MC_BATCH 32
#define MC_DEBUG 0
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
#if MC_DEBUG
struct multicall_entry debug[MC_BATCH];
void *caller[MC_BATCH];
#endif
unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
void *data;
} callbacks[MC_BATCH];
};
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
void xen_mc_flush(void)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_entry *mc;
int ret = 0;
unsigned long flags;
int i;
BUG_ON(preemptible());
/* Disable interrupts in case someone comes in and queues
something in the middle */
local_irq_save(flags);
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
switch (b->mcidx) {
case 0:
/* no-op */
BUG_ON(b->argidx != 0);
break;
case 1:
/* Singleton multicall - bypass multicall machinery
and just do the call directly. */
mc = &b->entries[0];
mc->result = privcmd_call(mc->op,
mc->args[0], mc->args[1], mc->args[2],
mc->args[3], mc->args[4]);
ret = mc->result < 0;
break;
default:
#if MC_DEBUG
memcpy(b->debug, b->entries,
b->mcidx * sizeof(struct multicall_entry));
#endif
if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0)
BUG();
for (i = 0; i < b->mcidx; i++)
if (b->entries[i].result < 0)
ret++;
#if MC_DEBUG
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(KERN_DEBUG " call %2d/%d: op=%lu arg=[%lx] result=%ld\t%pF\n",
i+1, b->mcidx,
b->debug[i].op,
b->debug[i].args[0],
b->entries[i].result,
b->caller[i]);
}
}
#endif
}
b->mcidx = 0;
b->argidx = 0;
for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
}
b->cbidx = 0;
local_irq_restore(flags);
WARN_ON(ret);
}
struct multicall_space __xen_mc_entry(size_t args)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret;
unsigned argidx = roundup(b->argidx, sizeof(u64));
trace_xen_mc_entry_alloc(args);
BUG_ON(preemptible());
BUG_ON(b->argidx >= MC_ARGS);
if (unlikely(b->mcidx == MC_BATCH ||
(argidx + args) >= MC_ARGS)) {
trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
xen_mc_flush();
argidx = roundup(b->argidx, sizeof(u64));
}
ret.mc = &b->entries[b->mcidx];
#if MC_DEBUG
b->caller[b->mcidx] = __builtin_return_address(0);
#endif
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
BUG_ON(b->argidx >= MC_ARGS);
return ret;
}
struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct multicall_space ret = { NULL, NULL };
BUG_ON(preemptible());
BUG_ON(b->argidx >= MC_ARGS);
if (unlikely(b->mcidx == 0 ||
b->entries[b->mcidx - 1].op != op)) {
trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
goto out;
}
if (unlikely((b->argidx + size) >= MC_ARGS)) {
trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
goto out;
}
ret.mc = &b->entries[b->mcidx - 1];
ret.args = &b->args[b->argidx];
b->argidx += size;
BUG_ON(b->argidx >= MC_ARGS);
trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
out:
return ret;
}
void xen_mc_callback(void (*fn)(void *), void *data)
{
struct mc_buffer *b = &__get_cpu_var(mc_buffer);
struct callback *cb;
if (b->cbidx == MC_BATCH) {
trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
xen_mc_flush();
}
trace_xen_mc_callback(fn, data);
cb = &b->callbacks[b->cbidx++];
cb->fn = fn;
cb->data = data;
}

68
arch/x86/xen/multicalls.h Normal file
View File

@@ -0,0 +1,68 @@
#ifndef _XEN_MULTICALLS_H
#define _XEN_MULTICALLS_H
#include <trace/events/xen.h>
#include "xen-ops.h"
/* Multicalls */
struct multicall_space
{
struct multicall_entry *mc;
void *args;
};
/* Allocate room for a multicall and its args */
struct multicall_space __xen_mc_entry(size_t args);
DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
/* Call to start a batch of multiple __xen_mc_entry()s. Must be
paired with xen_mc_issue() */
static inline void xen_mc_batch(void)
{
unsigned long flags;
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
trace_xen_mc_batch(paravirt_get_lazy_mode());
__this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
{
xen_mc_batch();
return __xen_mc_entry(args);
}
/* Flush all pending multicalls */
void xen_mc_flush(void);
/* Issue a multicall if we're not in a lazy mode */
static inline void xen_mc_issue(unsigned mode)
{
trace_xen_mc_issue(mode);
if ((paravirt_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
local_irq_restore(this_cpu_read(xen_mc_irq_flags));
}
/* Set up a callback to be called when the current batch is flushed */
void xen_mc_callback(void (*fn)(void *), void *data);
/*
* Try to extend the arguments of the previous multicall command. The
* previous command's op must match. If it does, then it attempts to
* extend the argument space allocated to the multicall entry by
* arg_size bytes.
*
* The returned multicall_space will return with mc pointing to the
* command on success, or NULL on failure, and args pointing to the
* newly allocated space.
*/
struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
#endif /* _XEN_MULTICALLS_H */

1172
arch/x86/xen/p2m.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,105 @@
/* Glue code to lib/swiotlb-xen.c */
#include <linux/dma-mapping.h>
#include <linux/pci.h>
#include <xen/swiotlb-xen.h>
#include <asm/xen/hypervisor.h>
#include <xen/xen.h>
#include <asm/iommu_table.h>
#include <asm/xen/swiotlb-xen.h>
#ifdef CONFIG_X86_64
#include <asm/iommu.h>
#include <asm/dma.h>
#endif
#include <linux/export.h>
int xen_swiotlb __read_mostly;
static struct dma_map_ops xen_swiotlb_dma_ops = {
.mapping_error = xen_swiotlb_dma_mapping_error,
.alloc = xen_swiotlb_alloc_coherent,
.free = xen_swiotlb_free_coherent,
.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
.sync_single_for_device = xen_swiotlb_sync_single_for_device,
.sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = xen_swiotlb_sync_sg_for_device,
.map_sg = xen_swiotlb_map_sg_attrs,
.unmap_sg = xen_swiotlb_unmap_sg_attrs,
.map_page = xen_swiotlb_map_page,
.unmap_page = xen_swiotlb_unmap_page,
.dma_supported = xen_swiotlb_dma_supported,
};
/*
* pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary
*
* This returns non-zero if we are forced to use xen_swiotlb (by the boot
* option).
*/
int __init pci_xen_swiotlb_detect(void)
{
if (!xen_pv_domain())
return 0;
/* If running as PV guest, either iommu=soft, or swiotlb=force will
* activate this IOMMU. If running as PV privileged, activate it
* irregardless.
*/
if ((xen_initial_domain() || swiotlb || swiotlb_force))
xen_swiotlb = 1;
/* If we are running under Xen, we MUST disable the native SWIOTLB.
* Don't worry about swiotlb_force flag activating the native, as
* the 'swiotlb' flag is the only one turning it on. */
swiotlb = 0;
#ifdef CONFIG_X86_64
/* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0
* (so no iommu=X command line over-writes).
* Considering that PV guests do not want the *native SWIOTLB* but
* only Xen SWIOTLB it is not useful to us so set no_iommu=1 here.
*/
if (max_pfn > MAX_DMA32_PFN)
no_iommu = 1;
#endif
return xen_swiotlb;
}
void __init pci_xen_swiotlb_init(void)
{
if (xen_swiotlb) {
xen_swiotlb_init(1, true /* early */);
dma_ops = &xen_swiotlb_dma_ops;
/* Make sure ACS will be enabled */
pci_request_acs();
}
}
int pci_xen_swiotlb_init_late(void)
{
int rc;
if (xen_swiotlb)
return 0;
rc = xen_swiotlb_init(1, false /* late */);
if (rc)
return rc;
dma_ops = &xen_swiotlb_dma_ops;
/* Make sure ACS will be enabled */
pci_request_acs();
return 0;
}
EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
NULL,
pci_xen_swiotlb_init,
NULL);

View File

@@ -0,0 +1,144 @@
/******************************************************************************
* platform-pci-unplug.c
*
* Xen platform PCI device driver
* Copyright (c) 2010, Citrix
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 59 Temple
* Place - Suite 330, Boston, MA 02111-1307 USA.
*
*/
#include <linux/init.h>
#include <linux/io.h>
#include <linux/module.h>
#include <xen/platform_pci.h>
#include "xen-ops.h"
#define XEN_PLATFORM_ERR_MAGIC -1
#define XEN_PLATFORM_ERR_PROTOCOL -2
#define XEN_PLATFORM_ERR_BLACKLIST -3
/* store the value of xen_emul_unplug after the unplug is done */
int xen_platform_pci_unplug;
EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
#ifdef CONFIG_XEN_PVHVM
static int xen_emul_unplug;
static int check_platform_magic(void)
{
short magic;
char protocol;
magic = inw(XEN_IOPORT_MAGIC);
if (magic != XEN_IOPORT_MAGIC_VAL) {
printk(KERN_ERR "Xen Platform PCI: unrecognised magic value\n");
return XEN_PLATFORM_ERR_MAGIC;
}
protocol = inb(XEN_IOPORT_PROTOVER);
printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n",
protocol);
switch (protocol) {
case 1:
outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM);
outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER);
if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) {
printk(KERN_ERR "Xen Platform: blacklisted by host\n");
return XEN_PLATFORM_ERR_BLACKLIST;
}
break;
default:
printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version");
return XEN_PLATFORM_ERR_PROTOCOL;
}
return 0;
}
void xen_unplug_emulated_devices(void)
{
int r;
/* user explicitly requested no unplug */
if (xen_emul_unplug & XEN_UNPLUG_NEVER)
return;
/* check the version of the xen platform PCI device */
r = check_platform_magic();
/* If the version matches enable the Xen platform PCI driver.
* Also enable the Xen platform PCI driver if the host does
* not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC)
* but the user told us that unplugging is unnecessary. */
if (r && !(r == XEN_PLATFORM_ERR_MAGIC &&
(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)))
return;
/* Set the default value of xen_emul_unplug depending on whether or
* not the Xen PV frontends and the Xen platform PCI driver have
* been compiled for this kernel (modules or built-in are both OK). */
if (!xen_emul_unplug) {
if (xen_must_unplug_nics()) {
printk(KERN_INFO "Netfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated NICs.\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
}
if (xen_must_unplug_disks()) {
printk(KERN_INFO "Blkfront and the Xen platform PCI driver have "
"been compiled for this kernel: unplug emulated disks.\n"
"You might have to change the root device\n"
"from /dev/hd[a-d] to /dev/xvd[a-d]\n"
"in your root= kernel command line option\n");
xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
}
}
/* Now unplug the emulated devices */
if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))
outw(xen_emul_unplug, XEN_IOPORT_UNPLUG);
xen_platform_pci_unplug = xen_emul_unplug;
}
static int __init parse_xen_emul_unplug(char *arg)
{
char *p, *q;
int l;
for (p = arg; p; p = q) {
q = strchr(p, ',');
if (q) {
l = q - p;
q++;
} else {
l = strlen(p);
}
if (!strncmp(p, "all", l))
xen_emul_unplug |= XEN_UNPLUG_ALL;
else if (!strncmp(p, "ide-disks", l))
xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS;
else if (!strncmp(p, "aux-ide-disks", l))
xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS;
else if (!strncmp(p, "nics", l))
xen_emul_unplug |= XEN_UNPLUG_ALL_NICS;
else if (!strncmp(p, "unnecessary", l))
xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY;
else if (!strncmp(p, "never", l))
xen_emul_unplug |= XEN_UNPLUG_NEVER;
else
printk(KERN_WARNING "unrecognised option '%s' "
"in parameter 'xen_emul_unplug'\n", p);
}
return 0;
}
early_param("xen_emul_unplug", parse_xen_emul_unplug);
#endif

588
arch/x86/xen/setup.c Normal file
View File

@@ -0,0 +1,588 @@
/*
* Machine specific setup for xen
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
* of base:extra, all the base memory can be filled with page
* structures for the extra memory, leaving no space for anything
* else.
*
* 10x seems like a reasonable balance between scaling flexibility and
* leaving a practically usable system.
*/
#define EXTRA_MEM_RATIO (10)
static void __init xen_add_extra_mem(u64 start, u64 size)
{
unsigned long pfn;
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
/* Add new region. */
if (xen_extra_mem[i].size == 0) {
xen_extra_mem[i].start = start;
xen_extra_mem[i].size = size;
break;
}
/* Append to existing region. */
if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
xen_extra_mem[i].size += size;
break;
}
}
if (i == XEN_EXTRA_MEM_MAX_REGIONS)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size);
xen_max_p2m_pfn = PFN_DOWN(start + size);
for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
continue;
WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
pfn, mfn);
__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
}
static unsigned long __init xen_do_chunk(unsigned long start,
unsigned long end, bool release)
{
struct xen_memory_reservation reservation = {
.address_bits = 0,
.extent_order = 0,
.domid = DOMID_SELF
};
unsigned long len = 0;
unsigned long pfn;
int ret;
for (pfn = start; pfn < end; pfn++) {
unsigned long frame;
unsigned long mfn = pfn_to_mfn(pfn);
if (release) {
/* Make sure pfn exists to start with */
if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
continue;
frame = mfn;
} else {
if (mfn != INVALID_P2M_ENTRY)
continue;
frame = pfn;
}
set_xen_guest_handle(reservation.extent_start, &frame);
reservation.nr_extents = 1;
ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
&reservation);
WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
release ? "release" : "populate", pfn, ret);
if (ret == 1) {
if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
if (release)
break;
set_xen_guest_handle(reservation.extent_start, &frame);
reservation.nr_extents = 1;
ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
&reservation);
break;
}
len++;
} else
break;
}
if (len)
printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
release ? "Freeing" : "Populating",
start, end, len,
release ? "freed" : "added");
return len;
}
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
return xen_do_chunk(start, end, true);
}
static unsigned long __init xen_populate_chunk(
const struct e820entry *list, size_t map_size,
unsigned long max_pfn, unsigned long *last_pfn,
unsigned long credits_left)
{
const struct e820entry *entry;
unsigned int i;
unsigned long done = 0;
unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
unsigned long pfns;
long capacity;
if (credits_left <= 0)
break;
if (entry->type != E820_RAM)
continue;
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after the xen_start_info->nr_pages */
if (e_pfn <= max_pfn)
continue;
s_pfn = PFN_UP(entry->addr);
/* If the E820 falls within the nr_pages, we want to start
* at the nr_pages PFN.
* If that would mean going past the E820 entry, skip it
*/
if (s_pfn <= max_pfn) {
capacity = e_pfn - max_pfn;
dest_pfn = max_pfn;
} else {
capacity = e_pfn - s_pfn;
dest_pfn = s_pfn;
}
if (credits_left < capacity)
capacity = credits_left;
pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
done += pfns;
*last_pfn = (dest_pfn + pfns);
if (pfns < capacity)
break;
credits_left -= pfns;
}
return done;
}
static void __init xen_set_identity_and_release_chunk(
unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
unsigned long *released, unsigned long *identity)
{
unsigned long pfn;
/*
* If the PFNs are currently mapped, the VA mapping also needs
* to be updated to be 1:1.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
(void)HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
mfn_pte(pfn, PAGE_KERNEL_IO), 0);
if (start_pfn < nr_pages)
*released += xen_release_chunk(
start_pfn, min(end_pfn, nr_pages));
*identity += set_phys_range_identity(start_pfn, end_pfn);
}
static unsigned long __init xen_set_identity_and_release(
const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
phys_addr_t start = 0;
unsigned long released = 0;
unsigned long identity = 0;
const struct e820entry *entry;
int i;
/*
* Combine non-RAM regions and gaps until a RAM region (or the
* end of the map) is reached, then set the 1:1 map and
* release the pages (if available) in those non-RAM regions.
*
* The combined non-RAM regions are rounded to a whole number
* of pages so any partial pages are accessible via the 1:1
* mapping. This is needed for some BIOSes that put (for
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
for (i = 0, entry = list; i < map_size; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
if (entry->type == E820_RAM || i == map_size - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
if (entry->type == E820_RAM)
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
xen_set_identity_and_release_chunk(
start_pfn, end_pfn, nr_pages,
&released, &identity);
start = end;
}
}
if (released)
printk(KERN_INFO "Released %lu pages of unused memory\n", released);
if (identity)
printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
return released;
}
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
domid_t domid = DOMID_SELF;
int ret;
/*
* For the initial domain we use the maximum reservation as
* the maximum page.
*
* For guest domains the current maximum reservation reflects
* the current maximum rather than the static maximum. In this
* case the e820 map provided to us will cover the static
* maximum region.
*/
if (xen_initial_domain()) {
ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
if (ret > 0)
max_pages = ret;
}
return min(max_pages, MAX_DOMAIN_PAGES);
}
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
u64 end = start + size;
/* Align RAM regions to page boundaries. */
if (type == E820_RAM) {
start = PAGE_ALIGN(start);
end &= ~((u64)PAGE_SIZE - 1);
}
e820_add_region(start, end - start, type);
}
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
struct e820entry *entry;
unsigned int i;
for (i = 0, entry = list; i < map_size; i++, entry++) {
if (entry->type == E820_UNUSABLE)
entry->type = E820_RAM;
}
}
/**
* machine_specific_memory_setup - Hook for machine specific memory setup.
**/
char * __init xen_memory_setup(void)
{
static struct e820entry map[E820MAX] __initdata;
unsigned long max_pfn = xen_start_info->nr_pages;
unsigned long long mem_end;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long last_pfn = 0;
unsigned long extra_pages = 0;
unsigned long populated;
int i;
int op;
max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
mem_end = PFN_PHYS(max_pfn);
memmap.nr_entries = E820MAX;
set_xen_guest_handle(memmap.buffer, map);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
XENMEM_memory_map;
rc = HYPERVISOR_memory_op(op, &memmap);
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
map[0].addr = 0ULL;
map[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
map[0].size += 8ULL << 20;
map[0].type = E820_RAM;
rc = 0;
}
BUG_ON(rc);
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
* regions, so if we're using the machine memory map leave the
* region as RAM as it is in the pseudo-physical map.
*
* UNUSABLE regions in domUs are not handled and will need
* a patch in the future.
*/
if (xen_initial_domain())
xen_ignore_unusable(map, memmap.nr_entries);
/* Make sure the Xen-supplied memory map is well-ordered. */
sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);
max_pages = xen_get_max_pages();
if (max_pages > max_pfn)
extra_pages += max_pages - max_pfn;
/*
* Set P2M for all non-RAM pages and E820 gaps to be identity
* type PFNs. Any RAM pages that would be made inaccesible by
* this are first released.
*/
xen_released_pages = xen_set_identity_and_release(
map, memmap.nr_entries, max_pfn);
/*
* Populate back the non-RAM pages and E820 gaps that had been
* released. */
populated = xen_populate_chunk(map, memmap.nr_entries,
max_pfn, &last_pfn, xen_released_pages);
xen_released_pages -= populated;
extra_pages += xen_released_pages;
if (last_pfn > max_pfn) {
max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
mem_end = PFN_PHYS(max_pfn);
}
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size. On non-highmem systems, the base
* size is the full initial memory allocation; on highmem it
* is limited to the max size of lowmem, so that it doesn't
* get completely filled.
*
* In principle there could be a problem in lowmem systems if
* the initial memory is also very large with respect to
* lowmem, but we won't try to deal with that here.
*/
extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages);
i = 0;
while (i < memmap.nr_entries) {
u64 addr = map[i].addr;
u64 size = map[i].size;
u32 type = map[i].type;
if (type == E820_RAM) {
if (addr < mem_end) {
size = min(size, mem_end - addr);
} else if (extra_pages) {
size = min(size, (u64)extra_pages * PAGE_SIZE);
extra_pages -= size / PAGE_SIZE;
xen_add_extra_mem(addr, size);
} else
type = E820_UNUSABLE;
}
xen_align_and_add_e820_region(addr, size, type);
map[i].addr += size;
map[i].size -= size;
if (map[i].size == 0)
i++;
}
/*
* In domU, the ISA region is normal, usable memory, but we
* reserve ISA memory anyway because too many things poke
* about in there.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
/*
* Reserve Xen bits:
* - mfn_list
* - xen_start_info
* See comment above "struct start_info" in <xen/interface/xen.h>
* We tried to make the the memblock_reserve more selective so
* that it would be clear what region is reserved. Sadly we ran
* in the problem wherein on a 64-bit hypervisor with a 32-bit
* initial domain, the pt_base has the cr3 value which is not
* neccessarily where the pagetable starts! As Jan put it: "
* Actually, the adjustment turns out to be correct: The page
* tables for a 32-on-64 dom0 get allocated in the order "first L1",
* "first L2", "first L3", so the offset to the page table base is
* indeed 2. When reading xen/include/public/xen.h's comment
* very strictly, this is not a violation (since there nothing is said
* that the first thing in the page table space is pointed to by
* pt_base; I admit that this seems to be implied though, namely
* do I think that it is implied that the page table space is the
* range [pt_base, pt_base + nt_pt_frames), whereas that
* range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
* which - without a priori knowledge - the kernel would have
* difficulty to figure out)." - so lets just fall back to the
* easy way and reserve the whole region.
*/
memblock_reserve(__pa(xen_start_info->mfn_list),
xen_start_info->pt_base - xen_start_info->mfn_list);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
return "Xen";
}
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
*/
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
u32 *mask;
mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
static int __cpuinit register_callback(unsigned type, const void *func)
{
struct callback_register callback = {
.type = type,
.address = XEN_CALLBACK(__KERNEL_CS, func),
.flags = CALLBACKF_mask_events,
};
return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
void __cpuinit xen_enable_sysenter(void)
{
int ret;
unsigned sysenter_feature;
#ifdef CONFIG_X86_32
sysenter_feature = X86_FEATURE_SEP;
#else
sysenter_feature = X86_FEATURE_SYSENTER32;
#endif
if (!boot_cpu_has(sysenter_feature))
return;
ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
if(ret != 0)
setup_clear_cpu_cap(sysenter_feature);
}
void __cpuinit xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
int ret;
ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
if (ret != 0) {
printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
/* Pretty fatal; 64-bit userspace has no other
mechanism for syscalls. */
}
if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
ret = register_callback(CALLBACKTYPE_syscall32,
xen_syscall32_target);
if (ret != 0)
setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
{
xen_panic_handler_init();
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable,
VMASST_TYPE_pae_extended_cr3);
if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
BUG();
xen_enable_sysenter();
xen_enable_syscall();
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
disable_acpi();
}
#endif
memcpy(boot_command_line, xen_start_info->cmd_line,
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
/* Set up idle, making sure it calls safe_halt() pvop */
disable_cpuidle();
disable_cpufreq();
WARN_ON(xen_set_default_idle());
fiddle_vdso();
#ifdef CONFIG_NUMA
numa_off = 1;
#endif
}

698
arch/x86/xen/smp.c Normal file
View File

@@ -0,0 +1,698 @@
/*
* Xen SMP support
*
* This file implements the Xen versions of smp_ops. SMP under Xen is
* very straightforward. Bringing a CPU up is simply a matter of
* loading its initial context and setting it running.
*
* IPIs are handled through the Xen event mechanism.
*
* Because virtual CPUs can be scheduled onto any real CPU, there's no
* useful topology information for the kernel to make use of. As a
* result, all CPUs are treated as if they're single-core and
* single-threaded.
*/
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cpu.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <asm/xen/interface.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/page.h>
#include <xen/events.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
cpumask_var_t xen_cpu_initialized_map;
static DEFINE_PER_CPU(int, xen_resched_irq);
static DEFINE_PER_CPU(int, xen_callfunc_irq);
static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
static DEFINE_PER_CPU(int, xen_irq_work);
static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
/*
* Reschedule call back.
*/
static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
{
inc_irq_stat(irq_resched_count);
scheduler_ipi();
return IRQ_HANDLED;
}
static void __cpuinit cpu_bringup(void)
{
int cpu;
cpu_init();
touch_softlockup_watchdog();
preempt_disable();
xen_enable_sysenter();
xen_enable_syscall();
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
notify_cpu_starting(cpu);
set_cpu_online(cpu, true);
this_cpu_write(cpu_state, CPU_ONLINE);
wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
wmb(); /* make sure everything is out */
}
static void __cpuinit cpu_bringup_and_idle(void)
{
cpu_bringup();
cpu_startup_entry(CPUHP_ONLINE);
}
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
const char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
resched_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu) = rc;
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
debug_name, NULL);
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu) = rc;
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu) = rc;
/*
* The IRQ worker on PVHVM goes through the native path and uses the
* IPI mechanism.
*/
if (xen_hvm_domain())
return 0;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
callfunc_name,
NULL);
if (rc < 0)
goto fail;
per_cpu(xen_irq_work, cpu) = rc;
return 0;
fail:
if (per_cpu(xen_resched_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
if (per_cpu(xen_callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
if (per_cpu(xen_debug_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
NULL);
if (xen_hvm_domain())
return rc;
if (per_cpu(xen_irq_work, cpu) >= 0)
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
return rc;
}
static void __init xen_fill_possible_map(void)
{
int i, rc;
if (xen_initial_domain())
return;
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
set_cpu_possible(i, true);
}
}
}
static void __init xen_filter_cpu_maps(void)
{
int i, rc;
unsigned int subtract = 0;
if (!xen_initial_domain())
return;
num_processors = 0;
disabled_cpus = 0;
for (i = 0; i < nr_cpu_ids; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
if (rc >= 0) {
num_processors++;
set_cpu_possible(i, true);
} else {
set_cpu_possible(i, false);
set_cpu_present(i, false);
subtract++;
}
}
#ifdef CONFIG_HOTPLUG_CPU
/* This is akin to using 'nr_cpus' on the Linux command line.
* Which is OK as when we use 'dom0_max_vcpus=X' we can only
* have up to X, while nr_cpu_ids is greater than X. This
* normally is not a problem, except when CPU hotplugging
* is involved and then there might be more than X CPUs
* in the guest - which will not work as there is no
* hypercall to expand the max number of VCPUs an already
* running guest has. So cap it up to X. */
if (subtract)
nr_cpu_ids = nr_cpu_ids - subtract;
#endif
}
static void __init xen_smp_prepare_boot_cpu(void)
{
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
make_lowmem_page_readwrite(xen_initial_gdt);
xen_filter_cpu_maps();
xen_setup_vcpu_info_placement();
}
static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
unsigned int i;
if (skip_ioapic_setup) {
char *m = (max_cpus == 0) ?
"The nosmp parameter is incompatible with Xen; " \
"use Xen dom0_max_vcpus=1 parameter" :
"The noapic parameter is incompatible with Xen";
xen_raw_printk(m);
panic(m);
}
xen_init_lock_cpu(0);
smp_store_boot_cpu_info();
cpu_data(0).x86_max_cores = 1;
for_each_possible_cpu(i) {
zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
}
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
BUG();
if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
panic("could not allocate xen_cpu_initialized_map\n");
cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
continue;
set_cpu_possible(cpu, false);
}
for_each_possible_cpu(cpu)
set_cpu_present(cpu, true);
}
static int __cpuinit
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
struct desc_struct *gdt;
unsigned long gdt_mfn;
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ss = __KERNEL_DS;
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#else
ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
{
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
xen_copy_trap_info(ctxt->trap_ctxt);
ctxt->ldt_ents = 0;
BUG_ON((unsigned long)gdt & ~PAGE_MASK);
gdt_mfn = arbitrary_virt_to_mfn(gdt);
make_lowmem_page_readonly(gdt);
make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
ctxt->gdt_frames[0] = gdt_mfn;
ctxt->gdt_ents = GDT_ENTRIES;
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_cs = __KERNEL_CS;
#endif
ctxt->event_callback_eip =
(unsigned long)xen_hypervisor_callback;
ctxt->failsafe_callback_eip =
(unsigned long)xen_failsafe_callback;
}
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt))
BUG();
kfree(ctxt);
return 0;
}
static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)
{
int rc;
per_cpu(current_task, cpu) = idle;
#ifdef CONFIG_X86_32
irq_ctx_init(cpu);
#else
clear_tsk_thread_flag(idle, TIF_FORK);
per_cpu(kernel_stack, cpu) =
(unsigned long)task_stack_page(idle) -
KERNEL_STACK_OFFSET + THREAD_SIZE;
#endif
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
rc = cpu_initialize_context(cpu, idle);
if (rc)
return rc;
if (num_online_cpus() == 1)
/* Just in case we booted with a single CPU. */
alternatives_enable_smp();
rc = xen_smp_intr_init(cpu);
if (rc)
return rc;
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
barrier();
}
return 0;
}
static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
#ifdef CONFIG_HOTPLUG_CPU
static int xen_cpu_disable(void)
{
unsigned int cpu = smp_processor_id();
if (cpu == 0)
return -EBUSY;
cpu_disable_common();
load_cr3(swapper_pg_dir);
return 0;
}
static void xen_cpu_die(unsigned int cpu)
{
while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
current->state = TASK_UNINTERRUPTIBLE;
schedule_timeout(HZ/10);
}
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
if (!xen_hvm_domain())
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);
xen_uninit_lock_cpu(cpu);
xen_teardown_timer(cpu);
}
static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
cpu_bringup();
/*
* commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
* clears certain data that the cpu_idle loop (which called us
* and that we return from) expects. The only way to get that
* data back is to call:
*/
tick_nohz_idle_enter();
}
#else /* !CONFIG_HOTPLUG_CPU */
static int xen_cpu_disable(void)
{
return -ENOSYS;
}
static void xen_cpu_die(unsigned int cpu)
{
BUG();
}
static void xen_play_dead(void)
{
BUG();
}
#endif
static void stop_self(void *v)
{
int cpu = smp_processor_id();
/* make sure we're not pinning something down */
load_cr3(swapper_pg_dir);
/* should set up a minimal gdt */
set_cpu_online(cpu, false);
HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL);
BUG();
}
static void xen_stop_other_cpus(int wait)
{
smp_call_function(stop_self, NULL, wait);
}
static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
static void __xen_send_IPI_mask(const struct cpumask *mask,
int vector)
{
unsigned cpu;
for_each_cpu_and(cpu, mask, cpu_online_mask)
xen_send_IPI_one(cpu, vector);
}
static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
{
int cpu;
__xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
/* Make sure other vcpus get a chance to run if they need to. */
for_each_cpu(cpu, mask) {
if (xen_vcpu_stolen(cpu)) {
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
break;
}
}
}
static void xen_smp_send_call_function_single_ipi(int cpu)
{
__xen_send_IPI_mask(cpumask_of(cpu),
XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
static inline int xen_map_vector(int vector)
{
int xen_vector;
switch (vector) {
case RESCHEDULE_VECTOR:
xen_vector = XEN_RESCHEDULE_VECTOR;
break;
case CALL_FUNCTION_VECTOR:
xen_vector = XEN_CALL_FUNCTION_VECTOR;
break;
case CALL_FUNCTION_SINGLE_VECTOR:
xen_vector = XEN_CALL_FUNCTION_SINGLE_VECTOR;
break;
case IRQ_WORK_VECTOR:
xen_vector = XEN_IRQ_WORK_VECTOR;
break;
default:
xen_vector = -1;
printk(KERN_ERR "xen: vector 0x%x is not implemented\n",
vector);
}
return xen_vector;
}
void xen_send_IPI_mask(const struct cpumask *mask,
int vector)
{
int xen_vector = xen_map_vector(vector);
if (xen_vector >= 0)
__xen_send_IPI_mask(mask, xen_vector);
}
void xen_send_IPI_all(int vector)
{
int xen_vector = xen_map_vector(vector);
if (xen_vector >= 0)
__xen_send_IPI_mask(cpu_online_mask, xen_vector);
}
void xen_send_IPI_self(int vector)
{
int xen_vector = xen_map_vector(vector);
if (xen_vector >= 0)
xen_send_IPI_one(smp_processor_id(), xen_vector);
}
void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
int vector)
{
unsigned cpu;
unsigned int this_cpu = smp_processor_id();
int xen_vector = xen_map_vector(vector);
if (!(num_online_cpus() > 1) || (xen_vector < 0))
return;
for_each_cpu_and(cpu, mask, cpu_online_mask) {
if (this_cpu == cpu)
continue;
xen_send_IPI_one(cpu, xen_vector);
}
}
void xen_send_IPI_allbutself(int vector)
{
xen_send_IPI_mask_allbutself(cpu_online_mask, vector);
}
static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
{
irq_enter();
generic_smp_call_function_single_interrupt();
inc_irq_stat(irq_call_count);
irq_exit();
return IRQ_HANDLED;
}
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
{
irq_enter();
irq_work_run();
inc_irq_stat(apic_irq_work_irqs);
irq_exit();
return IRQ_HANDLED;
}
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
.cpu_up = xen_cpu_up,
.cpu_die = xen_cpu_die,
.cpu_disable = xen_cpu_disable,
.play_dead = xen_play_dead,
.stop_other_cpus = xen_stop_other_cpus,
.smp_send_reschedule = xen_smp_send_reschedule,
.send_call_func_ipi = xen_smp_send_call_function_ipi,
.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
};
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
xen_fill_possible_map();
xen_init_spinlocks();
}
static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
{
native_smp_prepare_cpus(max_cpus);
WARN_ON(xen_smp_intr_init(0));
xen_init_lock_cpu(0);
}
static int __cpuinit xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
/*
* xen_smp_intr_init() needs to run before native_cpu_up()
* so that IPI vectors are set up on the booting CPU before
* it is marked online in native_cpu_up().
*/
rc = xen_smp_intr_init(cpu);
WARN_ON(rc);
if (!rc)
rc = native_cpu_up(cpu, tidle);
return rc;
}
static void xen_hvm_cpu_die(unsigned int cpu)
{
xen_cpu_die(cpu);
native_cpu_die(cpu);
}
void __init xen_hvm_smp_init(void)
{
if (!xen_have_vector_callback)
return;
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
smp_ops.cpu_up = xen_hvm_cpu_up;
smp_ops.cpu_die = xen_hvm_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
}

11
arch/x86/xen/smp.h Normal file
View File

@@ -0,0 +1,11 @@
#ifndef _XEN_SMP_H
extern void xen_send_IPI_mask(const struct cpumask *mask,
int vector);
extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
int vector);
extern void xen_send_IPI_allbutself(int vector);
extern void xen_send_IPI_all(int vector);
extern void xen_send_IPI_self(int vector);
#endif

478
arch/x86/xen/spinlock.c Normal file
View File

@@ -0,0 +1,478 @@
/*
* Split spinlock implementation out into its own file, so it can be
* compiled in a FTRACE-compatible way.
*/
#include <linux/kernel_stat.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/log2.h>
#include <linux/gfp.h>
#include <asm/paravirt.h>
#include <xen/interface/xen.h>
#include <xen/events.h>
#include "xen-ops.h"
#include "debugfs.h"
#ifdef CONFIG_XEN_DEBUG_FS
static struct xen_spinlock_stats
{
u64 taken;
u32 taken_slow;
u32 taken_slow_nested;
u32 taken_slow_pickup;
u32 taken_slow_spurious;
u32 taken_slow_irqenable;
u64 released;
u32 released_slow;
u32 released_slow_kicked;
#define HISTO_BUCKETS 30
u32 histo_spin_total[HISTO_BUCKETS+1];
u32 histo_spin_spinning[HISTO_BUCKETS+1];
u32 histo_spin_blocked[HISTO_BUCKETS+1];
u64 time_total;
u64 time_spinning;
u64 time_blocked;
} spinlock_stats;
static u8 zero_stats;
static unsigned lock_timeout = 1 << 10;
#define TIMEOUT lock_timeout
static inline void check_zero(void)
{
if (unlikely(zero_stats)) {
memset(&spinlock_stats, 0, sizeof(spinlock_stats));
zero_stats = 0;
}
}
#define ADD_STATS(elem, val) \
do { check_zero(); spinlock_stats.elem += (val); } while(0)
static inline u64 spin_time_start(void)
{
return xen_clocksource_read();
}
static void __spin_time_accum(u64 delta, u32 *array)
{
unsigned index = ilog2(delta);
check_zero();
if (index < HISTO_BUCKETS)
array[index]++;
else
array[HISTO_BUCKETS]++;
}
static inline void spin_time_accum_spinning(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
spinlock_stats.time_spinning += delta;
}
static inline void spin_time_accum_total(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_total);
spinlock_stats.time_total += delta;
}
static inline void spin_time_accum_blocked(u64 start)
{
u32 delta = xen_clocksource_read() - start;
__spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
spinlock_stats.time_blocked += delta;
}
#else /* !CONFIG_XEN_DEBUG_FS */
#define TIMEOUT (1 << 10)
#define ADD_STATS(elem, val) do { (void)(val); } while(0)
static inline u64 spin_time_start(void)
{
return 0;
}
static inline void spin_time_accum_total(u64 start)
{
}
static inline void spin_time_accum_spinning(u64 start)
{
}
static inline void spin_time_accum_blocked(u64 start)
{
}
#endif /* CONFIG_XEN_DEBUG_FS */
/*
* Size struct xen_spinlock so it's the same as arch_spinlock_t.
*/
#if NR_CPUS < 256
typedef u8 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incb %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decb %0" : "+m" ((xl)->spinners) : : "memory");
#else
typedef u16 xen_spinners_t;
# define inc_spinners(xl) \
asm(LOCK_PREFIX " incw %0" : "+m" ((xl)->spinners) : : "memory");
# define dec_spinners(xl) \
asm(LOCK_PREFIX " decw %0" : "+m" ((xl)->spinners) : : "memory");
#endif
struct xen_spinlock {
unsigned char lock; /* 0 -> free; 1 -> locked */
xen_spinners_t spinners; /* count of waiting cpus */
};
static int xen_spin_is_locked(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
return xl->lock != 0;
}
static int xen_spin_is_contended(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
/* Not strictly true; this is only the count of contended
lock-takers entering the slow path. */
return xl->spinners != 0;
}
static int xen_spin_trylock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
u8 old = 1;
asm("xchgb %b0,%1"
: "+q" (old), "+m" (xl->lock) : : "memory");
return old == 0;
}
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
/*
* Mark a cpu as interested in a lock. Returns the CPU's previous
* lock of interest, in case we got preempted by an interrupt.
*/
static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
struct xen_spinlock *prev;
prev = __this_cpu_read(lock_spinners);
__this_cpu_write(lock_spinners, xl);
wmb(); /* set lock of interest before count */
inc_spinners(xl);
return prev;
}
/*
* Mark a cpu as no longer interested in a lock. Restores previous
* lock of interest (NULL for none).
*/
static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
{
dec_spinners(xl);
wmb(); /* decrement count before restoring lock */
__this_cpu_write(lock_spinners, prev);
}
static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
int irq = __this_cpu_read(lock_kicker_irq);
int ret;
u64 start;
/* If kicker interrupts not initialized yet, just spin */
if (irq == -1)
return 0;
start = spin_time_start();
/* announce we're spinning */
prev = spinning_lock(xl);
ADD_STATS(taken_slow, 1);
ADD_STATS(taken_slow_nested, prev != NULL);
do {
unsigned long flags;
/* clear pending */
xen_clear_irq_pending(irq);
/* check again make sure it didn't become free while
we weren't looking */
ret = xen_spin_trylock(lock);
if (ret) {
ADD_STATS(taken_slow_pickup, 1);
/*
* If we interrupted another spinlock while it
* was blocking, make sure it doesn't block
* without rechecking the lock.
*/
if (prev != NULL)
xen_set_irq_pending(irq);
goto out;
}
flags = arch_local_save_flags();
if (irq_enable) {
ADD_STATS(taken_slow_irqenable, 1);
raw_local_irq_enable();
}
/*
* Block until irq becomes pending. If we're
* interrupted at this point (after the trylock but
* before entering the block), then the nested lock
* handler guarantees that the irq will be left
* pending if there's any chance the lock became free;
* xen_poll_irq() returns immediately if the irq is
* pending.
*/
xen_poll_irq(irq);
raw_local_irq_restore(flags);
ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
} while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
out:
unspinning_lock(xl, prev);
spin_time_accum_blocked(start);
return ret;
}
static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
unsigned timeout;
u8 oldval;
u64 start_spin;
ADD_STATS(taken, 1);
start_spin = spin_time_start();
do {
u64 start_spin_fast = spin_time_start();
timeout = TIMEOUT;
asm("1: xchgb %1,%0\n"
" testb %1,%1\n"
" jz 3f\n"
"2: rep;nop\n"
" cmpb $0,%0\n"
" je 1b\n"
" dec %2\n"
" jnz 2b\n"
"3:\n"
: "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
: "1" (1)
: "memory");
spin_time_accum_spinning(start_spin_fast);
} while (unlikely(oldval != 0 &&
(TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
spin_time_accum_total(start_spin);
}
static void xen_spin_lock(struct arch_spinlock *lock)
{
__xen_spin_lock(lock, false);
}
static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
{
__xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
}
static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
{
int cpu;
ADD_STATS(released_slow, 1);
for_each_online_cpu(cpu) {
/* XXX should mix up next cpu selection */
if (per_cpu(lock_spinners, cpu) == xl) {
ADD_STATS(released_slow_kicked, 1);
xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
}
}
}
static void xen_spin_unlock(struct arch_spinlock *lock)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
ADD_STATS(released, 1);
smp_wmb(); /* make sure no writes get moved after unlock */
xl->lock = 0; /* release lock */
/*
* Make sure unlock happens before checking for waiting
* spinners. We need a strong barrier to enforce the
* write-read ordering to different memory locations, as the
* CPU makes no implied guarantees about their ordering.
*/
mb();
if (unlikely(xl->spinners))
xen_spin_unlock_slow(xl);
}
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
BUG();
return IRQ_HANDLED;
}
void __cpuinit xen_init_lock_cpu(int cpu)
{
int irq;
const char *name;
WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
cpu, per_cpu(lock_kicker_irq, cpu));
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
name,
NULL);
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
}
void xen_uninit_lock_cpu(int cpu)
{
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
}
void __init xen_init_spinlocks(void)
{
/*
* See git commit f10cd522c5fbfec9ae3cc01967868c9c2401ed23
* (xen: disable PV spinlocks on HVM)
*/
if (xen_hvm_domain())
return;
BUILD_BUG_ON(sizeof(struct xen_spinlock) > sizeof(arch_spinlock_t));
pv_lock_ops.spin_is_locked = xen_spin_is_locked;
pv_lock_ops.spin_is_contended = xen_spin_is_contended;
pv_lock_ops.spin_lock = xen_spin_lock;
pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
pv_lock_ops.spin_trylock = xen_spin_trylock;
pv_lock_ops.spin_unlock = xen_spin_unlock;
}
#ifdef CONFIG_XEN_DEBUG_FS
static struct dentry *d_spin_debug;
static int __init xen_spinlock_debugfs(void)
{
struct dentry *d_xen = xen_init_debugfs();
if (d_xen == NULL)
return -ENOMEM;
d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
debugfs_create_u32("taken_slow", 0444, d_spin_debug,
&spinlock_stats.taken_slow);
debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
&spinlock_stats.taken_slow_nested);
debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
&spinlock_stats.taken_slow_pickup);
debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
&spinlock_stats.taken_slow_spurious);
debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
&spinlock_stats.taken_slow_irqenable);
debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
debugfs_create_u32("released_slow", 0444, d_spin_debug,
&spinlock_stats.released_slow);
debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
&spinlock_stats.released_slow_kicked);
debugfs_create_u64("time_spinning", 0444, d_spin_debug,
&spinlock_stats.time_spinning);
debugfs_create_u64("time_blocked", 0444, d_spin_debug,
&spinlock_stats.time_blocked);
debugfs_create_u64("time_total", 0444, d_spin_debug,
&spinlock_stats.time_total);
debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
return 0;
}
fs_initcall(xen_spinlock_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */

80
arch/x86/xen/suspend.c Normal file
View File

@@ -0,0 +1,80 @@
#include <linux/types.h>
#include <linux/clockchips.h>
#include <xen/interface/xen.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>
#include "xen-ops.h"
#include "mmu.h"
void xen_arch_pre_suspend(void)
{
xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
mfn_to_pfn(xen_start_info->console.domU.mfn);
BUG_ON(!irqs_disabled());
HYPERVISOR_shared_info = &xen_dummy_shared_info;
if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
__pte_ma(0), 0))
BUG();
}
void xen_arch_hvm_post_suspend(int suspend_cancelled)
{
#ifdef CONFIG_XEN_PVHVM
int cpu;
xen_hvm_init_shared_info();
xen_callback_vector();
xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
for_each_online_cpu(cpu) {
xen_setup_runstate_info(cpu);
}
}
#endif
}
void xen_arch_post_suspend(int suspend_cancelled)
{
xen_build_mfn_list_list();
xen_setup_shared_info();
if (suspend_cancelled) {
xen_start_info->store_mfn =
pfn_to_mfn(xen_start_info->store_mfn);
xen_start_info->console.domU.mfn =
pfn_to_mfn(xen_start_info->console.domU.mfn);
} else {
#ifdef CONFIG_SMP
BUG_ON(xen_cpu_initialized_map == NULL);
cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
#endif
xen_vcpu_restore();
}
}
static void xen_vcpu_notify_restore(void *data)
{
unsigned long reason = (unsigned long)data;
/* Boot processor notified via generic timekeeping_resume() */
if ( smp_processor_id() == 0)
return;
clockevents_notify(reason, NULL);
}
void xen_arch_resume(void)
{
on_each_cpu(xen_vcpu_notify_restore,
(void *)CLOCK_EVT_NOTIFY_RESUME, 1);
}

519
arch/x86/xen/time.c Normal file
View File

@@ -0,0 +1,519 @@
/*
* Xen time implementation.
*
* This is implemented in terms of a clocksource driver which uses
* the hypervisor clock as a nanosecond timebase, and a clockevent
* driver which uses the hypervisor's timer mechanism.
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/kernel_stat.h>
#include <linux/math64.h>
#include <linux/gfp.h>
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include "xen-ops.h"
/* Xen may fire a timer up to this many ns early */
#define TIMER_SLOP 100000
#define NS_PER_TICK (1000000000LL / HZ)
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
/* snapshots of runstate info */
static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
/* unused ns of stolen time */
static DEFINE_PER_CPU(u64, xen_residual_stolen);
/* return an consistent snapshot of 64-bit time/counter value */
static u64 get64(const u64 *p)
{
u64 ret;
if (BITS_PER_LONG < 64) {
u32 *p32 = (u32 *)p;
u32 h, l;
/*
* Read high then low, and then make sure high is
* still the same; this will only loop if low wraps
* and carries into high.
* XXX some clean way to make this endian-proof?
*/
do {
h = p32[1];
barrier();
l = p32[0];
barrier();
} while (p32[1] != h);
ret = (((u64)h) << 32) | l;
} else
ret = *p;
return ret;
}
/*
* Runstate accounting
*/
static void get_runstate_snapshot(struct vcpu_runstate_info *res)
{
u64 state_time;
struct vcpu_runstate_info *state;
BUG_ON(preemptible());
state = &__get_cpu_var(xen_runstate);
/*
* The runstate info is always updated by the hypervisor on
* the current CPU, so there's no need to use anything
* stronger than a compiler barrier when fetching it.
*/
do {
state_time = get64(&state->state_entry_time);
barrier();
*res = *state;
barrier();
} while (get64(&state->state_entry_time) != state_time);
}
/* return true when a vcpu could run but has no real cpu to run on */
bool xen_vcpu_stolen(int vcpu)
{
return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
}
void xen_setup_runstate_info(int cpu)
{
struct vcpu_register_runstate_memory_area area;
area.addr.v = &per_cpu(xen_runstate, cpu);
if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
cpu, &area))
BUG();
}
static void do_stolen_accounting(void)
{
struct vcpu_runstate_info state;
struct vcpu_runstate_info *snap;
s64 runnable, offline, stolen;
cputime_t ticks;
get_runstate_snapshot(&state);
WARN_ON(state.state != RUNSTATE_running);
snap = &__get_cpu_var(xen_runstate_snapshot);
/* work out how much time the VCPU has not been runn*ing* */
runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
*snap = state;
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
}
/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
struct pvclock_vcpu_time_info *info =
&HYPERVISOR_shared_info->vcpu_info[0].time;
return pvclock_tsc_khz(info);
}
cycle_t xen_clocksource_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
preempt_disable_notrace();
src = &__get_cpu_var(xen_vcpu)->time;
ret = pvclock_clocksource_read(src);
preempt_enable_notrace();
return ret;
}
static cycle_t xen_clocksource_get_cycles(struct clocksource *cs)
{
return xen_clocksource_read();
}
static void xen_read_wallclock(struct timespec *ts)
{
struct shared_info *s = HYPERVISOR_shared_info;
struct pvclock_wall_clock *wall_clock = &(s->wc);
struct pvclock_vcpu_time_info *vcpu_time;
vcpu_time = &get_cpu_var(xen_vcpu)->time;
pvclock_read_wallclock(wall_clock, vcpu_time, ts);
put_cpu_var(xen_vcpu);
}
static unsigned long xen_get_wallclock(void)
{
struct timespec ts;
xen_read_wallclock(&ts);
return ts.tv_sec;
}
static int xen_set_wallclock(unsigned long now)
{
struct xen_platform_op op;
int rc;
/* do nothing for domU */
if (!xen_initial_domain())
return -1;
op.cmd = XENPF_settime;
op.u.settime.secs = now;
op.u.settime.nsecs = 0;
op.u.settime.system_time = xen_clocksource_read();
rc = HYPERVISOR_dom0_op(&op);
WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now);
return rc;
}
static struct clocksource xen_clocksource __read_mostly = {
.name = "xen",
.rating = 400,
.read = xen_clocksource_get_cycles,
.mask = ~0,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
/*
Xen clockevent implementation
Xen has two clockevent implementations:
The old timer_op one works with all released versions of Xen prior
to version 3.0.4. This version of the hypervisor provides a
single-shot timer with nanosecond resolution. However, sharing the
same event channel is a 100Hz tick which is delivered while the
vcpu is running. We don't care about or use this tick, but it will
cause the core time code to think the timer fired too soon, and
will end up resetting it each time. It could be filtered, but
doing so has complications when the ktime clocksource is not yet
the xen clocksource (ie, at boot time).
The new vcpu_op-based timer interface allows the tick timer period
to be changed or turned off. The tick timer is not useful as a
periodic timer because events are only delivered to running vcpus.
The one-shot timer can report when a timeout is in the past, so
set_next_event is capable of returning -ETIME when appropriate.
This interface is used when available.
*/
/*
Get a hypervisor absolute time. In theory we could maintain an
offset between the kernel's time and the hypervisor's time, and
apply that to a kernel's absolute timeout. Unfortunately the
hypervisor and kernel times can drift even if the kernel is using
the Xen clocksource, because ntp can warp the kernel's clocksource.
*/
static s64 get_abs_timeout(unsigned long delta)
{
return xen_clocksource_read() + delta;
}
static void xen_timerop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
/* unsupported */
WARN_ON(1);
break;
case CLOCK_EVT_MODE_ONESHOT:
case CLOCK_EVT_MODE_RESUME:
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
HYPERVISOR_set_timer_op(0); /* cancel timeout */
break;
}
}
static int xen_timerop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
BUG();
/* We may have missed the deadline, but there's no real way of
knowing for sure. If the event was in the past, then we'll
get an immediate interrupt. */
return 0;
}
static const struct clock_event_device xen_timerop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_timerop_set_mode,
.set_next_event = xen_timerop_set_next_event,
};
static void xen_vcpuop_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
WARN_ON(1); /* unsupported */
break;
case CLOCK_EVT_MODE_ONESHOT:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_UNUSED:
case CLOCK_EVT_MODE_SHUTDOWN:
if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
break;
case CLOCK_EVT_MODE_RESUME:
break;
}
}
static int xen_vcpuop_set_next_event(unsigned long delta,
struct clock_event_device *evt)
{
int cpu = smp_processor_id();
struct vcpu_set_singleshot_timer single;
int ret;
WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
single.timeout_abs_ns = get_abs_timeout(delta);
single.flags = VCPU_SSHOTTMR_future;
ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
BUG_ON(ret != 0 && ret != -ETIME);
return ret;
}
static const struct clock_event_device xen_vcpuop_clockevent = {
.name = "xen",
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
.mult = 1,
.shift = 0,
.rating = 500,
.set_mode = xen_vcpuop_set_mode,
.set_next_event = xen_vcpuop_set_next_event,
};
static const struct clock_event_device *xen_clockevent =
&xen_timerop_clockevent;
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events) = { .irq = -1 };
static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
irqreturn_t ret;
ret = IRQ_NONE;
if (evt->event_handler) {
evt->event_handler(evt);
ret = IRQ_HANDLED;
}
do_stolen_accounting();
return ret;
}
void xen_setup_timer(int cpu)
{
const char *name;
struct clock_event_device *evt;
int irq;
evt = &per_cpu(xen_clock_events, cpu);
WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
name = kasprintf(GFP_KERNEL, "timer%d", cpu);
if (!name)
name = "<timer kasprintf failed>";
irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
IRQF_DISABLED|IRQF_PERCPU|
IRQF_NOBALANCING|IRQF_TIMER|
IRQF_FORCE_RESUME,
name, NULL);
memcpy(evt, xen_clockevent, sizeof(*evt));
evt->cpumask = cpumask_of(cpu);
evt->irq = irq;
}
void xen_teardown_timer(int cpu)
{
struct clock_event_device *evt;
BUG_ON(cpu == 0);
evt = &per_cpu(xen_clock_events, cpu);
unbind_from_irqhandler(evt->irq, NULL);
evt->irq = -1;
}
void xen_setup_cpu_clockevents(void)
{
BUG_ON(preemptible());
clockevents_register_device(&__get_cpu_var(xen_clock_events));
}
void xen_timer_resume(void)
{
int cpu;
pvclock_resume();
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
for_each_online_cpu(cpu) {
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
BUG();
}
}
static const struct pv_time_ops xen_time_ops __initconst = {
.sched_clock = xen_clocksource_read,
};
static void __init xen_time_init(void)
{
int cpu = smp_processor_id();
struct timespec tp;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
/* Successfully turned off 100Hz tick, so we have the
vcpuop-based timer interface */
printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
xen_clockevent = &xen_vcpuop_clockevent;
}
/* Set initial system time with full resolution */
xen_read_wallclock(&tp);
do_settimeofday(&tp);
setup_force_cpu_cap(X86_FEATURE_TSC);
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
}
void __init xen_init_time_ops(void)
{
pv_time_ops = xen_time_ops;
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_cpuinit.setup_percpu_clockev = x86_init_noop;
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
int cpu = smp_processor_id();
xen_setup_runstate_info(cpu);
/*
* xen_setup_timer(cpu) - snprintf is bad in atomic context. Hence
* doing it xen_hvm_cpu_notify (which gets called by smp_init during
* early bootup and also during CPU hotplug events).
*/
xen_setup_cpu_clockevents();
}
void __init xen_hvm_init_time_ops(void)
{
/* vector callback is needed otherwise we cannot receive interrupts
* on cpu > 0 and at this point we don't know how many cpus are
* available */
if (!xen_have_vector_callback)
return;
if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
printk(KERN_INFO "Xen doesn't support pvclock on HVM,"
"disable pv timer\n");
return;
}
pv_time_ops = xen_time_ops;
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
x86_platform.calibrate_tsc = xen_tsc_khz;
x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#endif

62
arch/x86/xen/trace.c Normal file
View File

@@ -0,0 +1,62 @@
#include <linux/ftrace.h>
#include <xen/interface/xen.h>
#define N(x) [__HYPERVISOR_##x] = "("#x")"
static const char *xen_hypercall_names[] = {
N(set_trap_table),
N(mmu_update),
N(set_gdt),
N(stack_switch),
N(set_callbacks),
N(fpu_taskswitch),
N(sched_op_compat),
N(dom0_op),
N(set_debugreg),
N(get_debugreg),
N(update_descriptor),
N(memory_op),
N(multicall),
N(update_va_mapping),
N(set_timer_op),
N(event_channel_op_compat),
N(xen_version),
N(console_io),
N(physdev_op_compat),
N(grant_table_op),
N(vm_assist),
N(update_va_mapping_otherdomain),
N(iret),
N(vcpu_op),
N(set_segment_base),
N(mmuext_op),
N(acm_op),
N(nmi_op),
N(sched_op),
N(callback_op),
N(xenoprof_op),
N(event_channel_op),
N(physdev_op),
N(hvm_op),
/* Architecture-specific hypercall definitions. */
N(arch_0),
N(arch_1),
N(arch_2),
N(arch_3),
N(arch_4),
N(arch_5),
N(arch_6),
N(arch_7),
};
#undef N
static const char *xen_hypercall_name(unsigned op)
{
if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
return xen_hypercall_names[op];
return "";
}
#define CREATE_TRACE_POINTS
#include <trace/events/xen.h>

4
arch/x86/xen/vdso.h Normal file
View File

@@ -0,0 +1,4 @@
/* Bit used for the pseudo-hwcap for non-negative segments. We use
bit 1 to avoid bugs in some versions of glibc when bit 0 is
used; the choice is otherwise arbitrary. */
#define VDSO_NOTE_NONEGSEG_BIT 1

74
arch/x86/xen/vga.c Normal file
View File

@@ -0,0 +1,74 @@
#include <linux/screen_info.h>
#include <linux/init.h>
#include <asm/bootparam.h>
#include <asm/setup.h>
#include <xen/interface/xen.h>
#include "xen-ops.h"
void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
{
struct screen_info *screen_info = &boot_params.screen_info;
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
screen_info->orig_video_isVGA = 1;
screen_info->orig_video_lines = 25;
screen_info->orig_video_cols = 80;
screen_info->orig_video_ega_bx = 3;
screen_info->orig_video_points = 16;
screen_info->orig_y = screen_info->orig_video_lines - 1;
switch (info->video_type) {
case XEN_VGATYPE_TEXT_MODE_3:
if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+ sizeof(info->u.text_mode_3))
break;
screen_info->orig_video_lines = info->u.text_mode_3.rows;
screen_info->orig_video_cols = info->u.text_mode_3.columns;
screen_info->orig_x = info->u.text_mode_3.cursor_x;
screen_info->orig_y = info->u.text_mode_3.cursor_y;
screen_info->orig_video_points =
info->u.text_mode_3.font_height;
break;
case XEN_VGATYPE_EFI_LFB:
case XEN_VGATYPE_VESA_LFB:
if (size < offsetof(struct dom0_vga_console_info,
u.vesa_lfb.gbl_caps))
break;
screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
screen_info->lfb_width = info->u.vesa_lfb.width;
screen_info->lfb_height = info->u.vesa_lfb.height;
screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
screen_info->red_size = info->u.vesa_lfb.red_size;
screen_info->red_pos = info->u.vesa_lfb.red_pos;
screen_info->green_size = info->u.vesa_lfb.green_size;
screen_info->green_pos = info->u.vesa_lfb.green_pos;
screen_info->blue_size = info->u.vesa_lfb.blue_size;
screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
if (info->video_type == XEN_VGATYPE_EFI_LFB) {
screen_info->orig_video_isVGA = VIDEO_TYPE_EFI;
break;
}
if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.gbl_caps)
+ sizeof(info->u.vesa_lfb.gbl_caps))
screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
if (size >= offsetof(struct dom0_vga_console_info,
u.vesa_lfb.mode_attrs)
+ sizeof(info->u.vesa_lfb.mode_attrs))
screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
break;
}
}

142
arch/x86/xen/xen-asm.S Normal file
View File

@@ -0,0 +1,142 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in percpu data) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include "xen-asm.h"
/*
* Enable events. This clears the event mask and tests the pending
* event status with one and operation. If there are pending events,
* then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* Test for pending */
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jz 1f
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
/*
* Disabling events is simply a matter of making the event mask
* non-zero.
*/
ENTRY(xen_irq_disable_direct)
movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
ENDPATCH(xen_irq_disable_direct)
ret
ENDPROC(xen_irq_disable_direct)
RELOC(xen_irq_disable_direct, 0)
/*
* (xen_)save_fl is used to get the current interrupt enable status.
* Callers expect the status to be in X86_EFLAGS_IF, and other bits
* may be set in the return value. We take advantage of this by
* making sure that X86_EFLAGS_IF has the right value (and other bits
* in that byte are 0), but other bits in the return value are
* undefined. We need to toggle the state of the bit, because Xen and
* x86 use opposite senses (mask vs enable).
*/
ENTRY(xen_save_fl_direct)
testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
setz %ah
addb %ah, %ah
ENDPATCH(xen_save_fl_direct)
ret
ENDPROC(xen_save_fl_direct)
RELOC(xen_save_fl_direct, 0)
/*
* In principle the caller should be passing us a value return from
* xen_save_fl_direct, but for robustness sake we test only the
* X86_EFLAGS_IF flag rather than the whole byte. After setting the
* interrupt mask state, it checks for unmasked pending events and
* enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
#ifdef CONFIG_X86_64
testw $X86_EFLAGS_IF, %di
#else
testb $X86_EFLAGS_IF>>8, %ah
#endif
setz PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
/*
* Preempt here doesn't matter because that will deal with any
* pending interrupts. The pending check may end up being run
* on the wrong CPU, but that doesn't hurt.
*/
/* check for unmasked and pending */
cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
jnz 1f
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
check_events:
#ifdef CONFIG_X86_32
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
#else
push %rax
push %rcx
push %rdx
push %rsi
push %rdi
push %r8
push %r9
push %r10
push %r11
call xen_force_evtchn_callback
pop %r11
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rsi
pop %rdx
pop %rcx
pop %rax
#endif
ret

12
arch/x86/xen/xen-asm.h Normal file
View File

@@ -0,0 +1,12 @@
#ifndef _XEN_XEN_ASM_H
#define _XEN_XEN_ASM_H
#include <linux/linkage.h>
#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
#define ENDPATCH(x) .globl x##_end; x##_end=.
/* Pseudo-flag used for virtual NMI, which we don't implement yet */
#define XEN_EFLAGS_NMI 0x80000000
#endif

228
arch/x86/xen/xen-asm_32.S Normal file
View File

@@ -0,0 +1,228 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <asm/asm.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
/*
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
check_events:
push %eax
push %ecx
push %edx
call xen_force_evtchn_callback
pop %edx
pop %ecx
pop %eax
ret
/*
* We can't use sysexit directly, because we're not running in ring0.
* But we can easily fake it up using iret. Assuming xen_sysexit is
* jumped to with a standard stack frame, we can just strip it back to
* a standard iret frame and use iret.
*/
ENTRY(xen_sysexit)
movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
lea PT_EIP(%esp), %esp
jmp xen_iret
ENDPROC(xen_sysexit)
/*
* This is run where a normal iret would be run, with the same stack setup:
* 8: eflags
* 4: cs
* esp-> 0: eip
*
* This attempts to make sure that any pending events are dealt with
* on return to usermode, but there is a small window in which an
* event can happen just before entering usermode. If the nested
* interrupt ends up setting one of the TIF_WORK_MASK pending work
* flags, they will not be tested again before returning to
* usermode. This means that a process can end up with pending work,
* which will be unprocessed until the process enters and leaves the
* kernel again, which could be an unbounded amount of time. This
* means that a pending signal or reschedule event could be
* indefinitely delayed.
*
* The fix is to notice a nested interrupt in the critical window, and
* if one occurs, then fold the nested interrupt into the current
* interrupt stack frame, and re-process it iteratively rather than
* recursively. This means that it will exit via the normal path, and
* all pending work will be dealt with appropriately.
*
* Because the nested interrupt handler needs to deal with the current
* stack state in whatever form its in, we keep things simple by only
* using a single register which is pushed/popped on the stack.
*/
ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
push %eax
ESP_OFFSET=4 # bytes pushed onto stack
/*
* Store vcpu_info pointer for easy access. Do it this way to
* avoid having to reload %fs
*/
#ifdef CONFIG_SMP
GET_THREAD_INFO(%eax)
movl %ss:TI_cpu(%eax), %eax
movl %ss:__per_cpu_offset(,%eax,4), %eax
mov %ss:xen_vcpu(%eax), %eax
#else
movl %ss:xen_vcpu, %eax
#endif
/* check IF state we're restoring */
testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
/*
* Maybe enable events. Once this happens we could get a
* recursive event, so the critical region starts immediately
* afterwards. However, if that happens we don't end up
* resuming the code, so we don't have to be worried about
* being preempted to another CPU.
*/
setz %ss:XEN_vcpu_info_mask(%eax)
xen_iret_start_crit:
/* check for unmasked and pending */
cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax)
/*
* If there's something pending, mask events again so we can
* jump back into xen_hypervisor_callback. Otherwise do not
* touch XEN_vcpu_info_mask.
*/
jne 1f
movb $1, %ss:XEN_vcpu_info_mask(%eax)
1: popl %eax
/*
* From this point on the registers are restored and the stack
* updated, so we don't need to worry about it if we're
* preempted
*/
iret_restore_end:
/*
* Jump to hypervisor_callback after fixing up the stack.
* Events are masked, so jumping out of the critical region is
* OK.
*/
je xen_hypervisor_callback
1: iret
xen_iret_end_crit:
_ASM_EXTABLE(1b, iret_exc)
hyper_iret:
/* put this out of line since its very rarely used */
jmp hypercall_page + __HYPERVISOR_iret * 32
.globl xen_iret_start_crit, xen_iret_end_crit
/*
* This is called by xen_hypervisor_callback in entry.S when it sees
* that the EIP at the time of interrupt was between
* xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in
* %eax so we can do a more refined determination of what to do.
*
* The stack format at this point is:
* ----------------
* ss : (ss/esp may be present if we came from usermode)
* esp :
* eflags } outer exception info
* cs }
* eip }
* ---------------- <- edi (copy dest)
* eax : outer eax if it hasn't been restored
* ----------------
* eflags } nested exception info
* cs } (no ss/esp because we're nested
* eip } from the same ring)
* orig_eax }<- esi (copy src)
* - - - - - - - -
* fs }
* es }
* ds } SAVE_ALL state
* eax }
* : :
* ebx }<- esp
* ----------------
*
* In order to deliver the nested exception properly, we need to shift
* everything from the return addr up to the error code so it sits
* just under the outer exception info. This means that when we
* handle the exception, we do it in the context of the outer
* exception rather than starting a new one.
*
* The only caveat is that if the outer eax hasn't been restored yet
* (ie, it's still on stack), we need to insert its value into the
* SAVE_ALL state before going on, since it's usermode state which we
* eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
/*
* Paranoia: Make sure we're really coming from kernel space.
* One could imagine a case where userspace jumps into the
* critical range address, but just before the CPU delivers a
* GP, it decides to deliver an interrupt instead. Unlikely?
* Definitely. Easy to avoid? Yes. The Intel documents
* explicitly say that the reported EIP for a bad jump is the
* jump instruction itself, not the destination, but some
* virtual environments get this wrong.
*/
movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
lea PT_ORIG_EAX(%esp), %esi
lea PT_EFLAGS(%esp), %edi
/*
* If eip is before iret_restore_end then stack
* hasn't been restored yet.
*/
cmp $iret_restore_end, %eax
jae 1f
movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */
movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */
/* set up the copy */
1: std
mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi), %esp /* point esp to new frame */
2: jmp xen_do_upcall

159
arch/x86/xen/xen-asm_64.S Normal file
View File

@@ -0,0 +1,159 @@
/*
* Asm versions of Xen pv-ops, suitable for either direct use or
* inlining. The inline versions are the same as the direct-use
* versions, with the pre- and post-amble chopped off.
*
* This code is encoded for size rather than absolute efficiency, with
* a view to being able to inline as much as possible.
*
* We only bother with direct forms (ie, vcpu in pda) of the
* operations here; the indirect forms are better handled in C, since
* they're generally too large to inline anyway.
*/
#include <asm/errno.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
#include <asm/segment.h>
#include <xen/interface/xen.h>
#include "xen-asm.h"
ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
mov 8+8(%rsp), %r11
ret $16
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
* Xen64 iret frame:
*
* ss
* rsp
* rflags
* cs
* rip <-- standard iret frame
*
* flags
*
* rcx }
* r11 }<-- pushed by hypercall page
* rsp->rax }
*/
ENTRY(xen_iret)
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_iret)
RELOC(xen_iret, 1b+1)
/*
* sysexit is not used for 64-bit processes, so it's only ever used to
* return to 32-bit compat userspace.
*/
ENTRY(xen_sysexit)
pushq $__USER32_DS
pushq %rcx
pushq $X86_EFLAGS_IF
pushq $__USER32_CS
pushq %rdx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysexit)
RELOC(xen_sysexit, 1b+1)
ENTRY(xen_sysret64)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER_CS
pushq %rcx
pushq $VGCF_in_syscall
1: jmp hypercall_iret
ENDPATCH(xen_sysret64)
RELOC(xen_sysret64, 1b+1)
ENTRY(xen_sysret32)
/*
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
movq %rsp, PER_CPU_VAR(old_rsp)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
pushq PER_CPU_VAR(old_rsp)
pushq %r11
pushq $__USER32_CS
pushq %rcx
pushq $0
1: jmp hypercall_iret
ENDPATCH(xen_sysret32)
RELOC(xen_sysret32, 1b+1)
/*
* Xen handles syscall callbacks much like ordinary exceptions, which
* means we have:
* - kernel gs
* - kernel rsp
* - an iret-like stack frame on the stack (including rcx and r11):
* ss
* rsp
* rflags
* cs
* rip
* r11
* rsp->rcx
*
* In all the entrypoints, we undo all that to make it look like a
* CPU-generated syscall/sysenter and jump to the normal entrypoint.
*/
.macro undo_xen_syscall
mov 0*8(%rsp), %rcx
mov 1*8(%rsp), %r11
mov 5*8(%rsp), %rsp
.endm
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
jmp system_call_after_swapgs
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
jmp ia32_cstar_target
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
jmp ia32_sysenter_target
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */
ENTRY(xen_syscall32_target)
ENTRY(xen_sysenter_target)
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
pushq $0
jmp hypercall_iret
ENDPROC(xen_syscall32_target)
ENDPROC(xen_sysenter_target)
#endif /* CONFIG_IA32_EMULATION */

107
arch/x86/xen/xen-head.S Normal file
View File

@@ -0,0 +1,107 @@
/* Xen-specific pieces of head.S, intended to be included in the right
place in head.S */
#ifdef CONFIG_XEN
#include <linux/elfnote.h>
#include <linux/init.h>
#include <asm/boot.h>
#include <asm/asm.h>
#include <asm/page_types.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
cld
#ifdef CONFIG_X86_32
mov %esi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%esp
#else
mov %rsi,xen_start_info
mov $init_thread_union+THREAD_SIZE,%rsp
#endif
jmp xen_start_kernel
__FINIT
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
#define NEXT_HYPERCALL(x) \
ENTRY(xen_hypercall_##x) \
.skip 32
NEXT_HYPERCALL(set_trap_table)
NEXT_HYPERCALL(mmu_update)
NEXT_HYPERCALL(set_gdt)
NEXT_HYPERCALL(stack_switch)
NEXT_HYPERCALL(set_callbacks)
NEXT_HYPERCALL(fpu_taskswitch)
NEXT_HYPERCALL(sched_op_compat)
NEXT_HYPERCALL(platform_op)
NEXT_HYPERCALL(set_debugreg)
NEXT_HYPERCALL(get_debugreg)
NEXT_HYPERCALL(update_descriptor)
NEXT_HYPERCALL(ni)
NEXT_HYPERCALL(memory_op)
NEXT_HYPERCALL(multicall)
NEXT_HYPERCALL(update_va_mapping)
NEXT_HYPERCALL(set_timer_op)
NEXT_HYPERCALL(event_channel_op_compat)
NEXT_HYPERCALL(xen_version)
NEXT_HYPERCALL(console_io)
NEXT_HYPERCALL(physdev_op_compat)
NEXT_HYPERCALL(grant_table_op)
NEXT_HYPERCALL(vm_assist)
NEXT_HYPERCALL(update_va_mapping_otherdomain)
NEXT_HYPERCALL(iret)
NEXT_HYPERCALL(vcpu_op)
NEXT_HYPERCALL(set_segment_base)
NEXT_HYPERCALL(mmuext_op)
NEXT_HYPERCALL(xsm_op)
NEXT_HYPERCALL(nmi_op)
NEXT_HYPERCALL(sched_op)
NEXT_HYPERCALL(callback_op)
NEXT_HYPERCALL(xenoprof_op)
NEXT_HYPERCALL(event_channel_op)
NEXT_HYPERCALL(physdev_op)
NEXT_HYPERCALL(hvm_op)
NEXT_HYPERCALL(sysctl)
NEXT_HYPERCALL(domctl)
NEXT_HYPERCALL(kexec_op)
NEXT_HYPERCALL(tmem_op) /* 38 */
ENTRY(xen_hypercall_rsvr)
.skip 320
NEXT_HYPERCALL(mca) /* 48 */
NEXT_HYPERCALL(arch_1)
NEXT_HYPERCALL(arch_2)
NEXT_HYPERCALL(arch_3)
NEXT_HYPERCALL(arch_4)
NEXT_HYPERCALL(arch_5)
NEXT_HYPERCALL(arch_6)
.balign PAGE_SIZE
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
#else
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */

126
arch/x86/xen/xen-ops.h Normal file
View File

@@ -0,0 +1,126 @@
#ifndef XEN_OPS_H
#define XEN_OPS_H
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/irqreturn.h>
#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
void xen_setup_mfn_list_list(void);
void xen_setup_shared_info(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
extern unsigned long xen_max_p2m_pfn;
void xen_set_pat(u64);
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
unsigned long __init xen_revector_p2m_tree(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);
void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
cycle_t xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
void __init xen_init_time_ops(void);
void __init xen_hvm_init_time_ops(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
bool xen_vcpu_stolen(int vcpu);
void xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
void xen_smp_init(void);
void __init xen_hvm_smp_init(void);
extern cpumask_var_t xen_cpu_initialized_map;
#else
static inline void xen_smp_init(void) {}
static inline void xen_hvm_smp_init(void) {}
#endif
#ifdef CONFIG_PARAVIRT_SPINLOCKS
void __init xen_init_spinlocks(void);
void __cpuinit xen_init_lock_cpu(int cpu);
void xen_uninit_lock_cpu(int cpu);
#else
static inline void xen_init_spinlocks(void)
{
}
static inline void xen_init_lock_cpu(int cpu)
{
}
static inline void xen_uninit_lock_cpu(int cpu)
{
}
#endif
struct dom0_vga_console_info;
#ifdef CONFIG_XEN_DOM0
void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
void __init xen_init_apic(void);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
size_t size)
{
}
static inline void __init xen_init_apic(void)
{
}
#endif
/* Declare an asm function, along with symbols needed to make it
inlineable */
#define DECL_ASM(ret, name, ...) \
ret name(__VA_ARGS__); \
extern char name##_end[]; \
extern char name##_reloc[] \
DECL_ASM(void, xen_irq_enable_direct, void);
DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
void xen_sysret32(void);
void xen_sysret64(void);
void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
#endif /* XEN_OPS_H */