/*
 *  linux/arch/x86_64/kernel/vsyscall.c
 *
 *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
 *  Copyright 2003 Andi Kleen, SuSE Labs.
 *
 *  Thanks to hpa@transmeta.com for some useful hints.
 *  Special thanks to Ingo Molnar for his early experience with
 *  a different vsyscall implementation for Linux/IA32 and for the name.
 *
 *  vsyscall 1 is located at -10Mbyte, vsyscall 2 is located
 *  at virtual address -10Mbyte+1024bytes etc... There are at max 4
 *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
 *  jumping out of line if necessary. We cannot add more with this
 *  mechanism because older kernels won't return -ENOSYS.
 *  If we want more than four we need a vDSO.
 *
 *  Note: the concept clashes with user mode linux. If you use UML and
 *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
 */
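/*
 * Illustrative sketch of a user-space caller (not part of this file):
 * with the layout described above, VSYSCALL_START is -10Mbyte, so
 * slot n sits at 0xffffffffff600000 + n*1024. Names below are for
 * illustration only:
 *
 *	typedef int (*vgtod_t)(struct timeval *, struct timezone *);
 *	vgtod_t vgtod = (vgtod_t)0xffffffffff600000UL;	// slot 0
 *	struct timeval tv;
 *	vgtod(&tv, NULL);	// runs entirely in user mode
 */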
#include <linux/time.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/jiffies.h>
#include <linux/sysctl.h>
#include <linux/getcpu.h>

#include <asm/vsyscall.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/fixmap.h>
#include <asm/errno.h>
#include <asm/io.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/topology.h>

#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
int __vgetcpu_mode __section_vgetcpu_mode;

#include <asm/unistd.h>
/* Normalize a timeval: fold whole seconds out of tv_usec. */
static __always_inline void timeval_normalize(struct timeval *tv)
{
	time_t __sec;

	__sec = tv->tv_usec / 1000000;
	if (__sec) {
		tv->tv_usec %= 1000000;
		tv->tv_sec += __sec;
	}
}
static __always_inline void do_vgettimeofday(struct timeval *tv)
{
	long sequence, t;
	unsigned long sec, usec;

	do {
		sequence = read_seqbegin(&__xtime_lock);

		sec = __xtime.tv_sec;
		usec = (__xtime.tv_nsec / 1000) +
			(__jiffies - __wall_jiffies) * (1000000 / HZ);

		if (__vxtime.mode != VXTIME_HPET) {
			t = get_cycles_sync();
			if (t < __vxtime.last_tsc)
				t = __vxtime.last_tsc;
			usec += ((t - __vxtime.last_tsc) *
				 __vxtime.tsc_quot) >> 32;
			/* See comment in x86_64 do_gettimeofday. */
		} else {
			usec += ((readl((void __iomem *)
				   fix_to_virt(VSYSCALL_HPET) + 0xf0) -
				  __vxtime.last) * __vxtime.quot) >> 32;
		}
	} while (read_seqretry(&__xtime_lock, sequence));

	tv->tv_sec = sec + usec / 1000000;
	tv->tv_usec = usec % 1000000;
}
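/*
 * The multiply-and-shift above is 32.32 fixed point: tsc_quot (and
 * quot for HPET) approximates microseconds-per-tick scaled by 2^32,
 * so an elapsed tick count converts to microseconds without a
 * division. A minimal sketch of deriving such a quotient (assumed
 * setup code, not necessarily what this kernel uses):
 *
 *	static u32 usec_per_cycle_q32(unsigned long cpu_khz)
 *	{
 *		// usec per cycle = 1000/cpu_khz; scale by 2^32
 *		return (u32)((1000ULL << 32) / cpu_khz);
 *	}
 */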
/* RED-PEN may want to readd seq locking, but then the variable should be
 * write-once. */
static __always_inline void do_get_tz(struct timezone *tz)
{
	*tz = __sys_tz;
}
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
{
	int ret;
	/* The vsysc2: label marks the syscall insn so the sysctl
	   handler below can patch it. */
	asm volatile("vsysc2: syscall"
		: "=a" (ret)
		: "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : __syscall_clobber);
	return ret;
}

static __always_inline long time_syscall(long *t)
{
	long secs;
	/* vsysc1: is likewise patched by vsyscall_sysctl_change(). */
	asm volatile("vsysc1: syscall"
		: "=a" (secs)
		: "0" (__NR_time), "D" (t) : __syscall_clobber);
	return secs;
}
int __vsyscall(0) vgettimeofday(struct timeval *tv, struct timezone *tz)
{
	if (!__sysctl_vsyscall)
		return gettimeofday(tv, tz);
	if (tv)
		do_vgettimeofday(tv);
	if (tz)
		do_get_tz(tz);
	return 0;
}
/* This will break when the xtime seconds get inaccurate, but that is
 * unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
	if (!__sysctl_vsyscall)
		return time_syscall(t);
	else if (t)
		*t = __xtime.tv_sec;
	return __xtime.tv_sec;
}
/* Fast way to get current CPU and node.
   This helps to do per node and per CPU caches in user space.
   The result is not guaranteed without CPU affinity, but usually
   works out because the scheduler tries to keep a thread on the same
   CPU.

   tcache must point to an array of two longs.
   All arguments can be NULL. */
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
	unsigned int dummy, p;
	unsigned long j = 0;

	/* Fast cache - only recompute value once per jiffies and avoid
	   relatively costly rdtscp/cpuid otherwise.
	   This works because the scheduler usually keeps the process
	   on the same CPU and this syscall doesn't guarantee its
	   results anyways.
	   We do this here because otherwise user space would do it on
	   its own in a likely inferior way (no access to jiffies).
	   If you don't like it pass NULL. */
	if (tcache && tcache->t0 == (j = __jiffies)) {
		p = tcache->t1;
	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
		/* Load per CPU data from RDTSCP */
		rdtscp(dummy, dummy, p);
	} else {
		/* Load per CPU data from GDT */
		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
	}
	if (tcache) {
		tcache->t0 = j;
		tcache->t1 = p;
	}
	if (cpu)
		*cpu = p & 0xfff;
	if (node)
		*node = p >> 12;
	return 0;
}
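/*
 * Illustrative user-space caller (a sketch, not part of this file),
 * assuming slot 2 at the usual -10Mbyte base plus 2*1024; names are
 * for illustration only:
 *
 *	typedef long (*vgetcpu_t)(unsigned *, unsigned *,
 *				  struct getcpu_cache *);
 *	vgetcpu_t vgetcpu = (vgetcpu_t)0xffffffffff600800UL;
 *	unsigned cpu, node;
 *	struct getcpu_cache cache = { 0 };	// reused across calls
 *	vgetcpu(&cpu, &node, &cache);
 */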
long __vsyscall(3) venosys_1(void)
{
	return -ENOSYS;
}
#ifdef CONFIG_SYSCTL

#define SYSCALL	0x050f		/* "syscall" insn (0f 05) as little-endian u16 */
#define NOP2	0x9090		/* two one-byte nops */

/*
 * NOP out syscall in vsyscall page when not needed.
 */
static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file *filp,
			void __user *buffer, size_t *lenp, loff_t *ppos)
{
	extern u16 vsysc1, vsysc2;
	u16 __iomem *map1;
	u16 __iomem *map2;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
	if (!write)
		return ret;
	/* gcc has some trouble with __va(__pa()), so just do it this
	   way. */
	map1 = ioremap(__pa_symbol(&vsysc1), 2);
	if (!map1)
		return -ENOMEM;
	map2 = ioremap(__pa_symbol(&vsysc2), 2);
	if (!map2) {
		ret = -ENOMEM;
		goto out;
	}
	if (!sysctl_vsyscall) {
		writew(SYSCALL, map1);
		writew(SYSCALL, map2);
	} else {
		writew(NOP2, map1);
		writew(NOP2, map2);
	}
	iounmap(map2);
out:
	iounmap(map1);
	return ret;
}

static int vsyscall_sysctl_nostrat(ctl_table *t, int __user *name, int nlen,
				void __user *oldval, size_t __user *oldlenp,
				void __user *newval, size_t newlen,
				void **context)
{
	return -ENOSYS;
}
static ctl_table kernel_table2[] = {
	{ .ctl_name = 99, .procname = "vsyscall64",
	  .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
	  .strategy = vsyscall_sysctl_nostrat,
	  .proc_handler = vsyscall_sysctl_change },
	{ 0, }
};

static ctl_table kernel_root_table2[] = {
	{ .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555,
	  .child = kernel_table2 },
	{ 0 },
};

#endif
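/*
 * Illustrative usage (not part of this file): with the table above
 * registered, the fast path is toggled from user space through the
 * kernel.vsyscall64 sysctl mentioned in the header comment, e.g.
 * "echo 0 > /proc/sys/kernel/vsyscall64" to force real syscalls.
 */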
static void __cpuinit write_rdtscp_cb(void *info)
{
	write_rdtscp_aux((unsigned long)info);
}

void __cpuinit vsyscall_set_cpu(int cpu)
{
	unsigned long *d;
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node[cpu];
#endif
	if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) {
		void *info = (void *)((node << 12) | cpu);
		/* Can happen on preemptible kernel */
		if (get_cpu() == cpu)
			write_rdtscp_cb(info);
#ifdef CONFIG_SMP
		else {
			/* the notifier is unfortunately not executed on the
			   target CPU */
			smp_call_function_single(cpu, write_rdtscp_cb, info, 0, 1);
		}
#endif
		put_cpu();
	}

	/* Store cpu number in limit so that it can be loaded quickly
	   in user space in vgetcpu.
	   12 bits for the CPU and 8 bits for the node. */
	d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
	*d = 0x0f40000000000ULL;
	*d |= cpu;				/* limit bits  0..11: cpu       */
	*d |= (node & 0xf) << 12;		/* limit bits 12..15: node low  */
	*d |= (node >> 4) << 48;		/* limit bits 16..19: node high */
}
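/*
 * Resulting encoding, as decoded by the "lsl" path in vgetcpu() above:
 * the 20-bit segment limit reads back as (node << 12) | cpu, so
 *
 *	unsigned p = (node << 12) | cpu;	// what lsl returns
 *	unsigned cpu_out  = p & 0xfff;		// low 12 bits
 *	unsigned node_out = p >> 12;		// upper bits
 */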
static void __init map_vsyscall(void)
{
	extern char __vsyscall_0;
	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);

	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
}

static int __init vsyscall_init(void)
{
	BUG_ON(((unsigned long) &vgettimeofday !=
			VSYSCALL_ADDR(__NR_vgettimeofday)));
	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
	map_vsyscall();
#ifdef CONFIG_SYSCTL
	register_sysctl_table(kernel_root_table2, 0);
#endif
	return 0;
}

__initcall(vsyscall_init);