2007-07-26 21:41:02 +04:00
/*P:100 This is the Launcher code, a simple program which lays out the
* " physical " memory for the new Guest by mapping the kernel image and the
* virtual devices , then reads repeatedly from / dev / lguest to run the Guest .
2007-10-22 05:03:26 +04:00
: */
2007-07-19 12:49:29 +04:00
# define _LARGEFILE64_SOURCE
# define _GNU_SOURCE
# include <stdio.h>
# include <string.h>
# include <unistd.h>
# include <err.h>
# include <stdint.h>
# include <stdlib.h>
# include <elf.h>
# include <sys/mman.h>
2007-08-29 01:35:59 +04:00
# include <sys/param.h>
2007-07-19 12:49:29 +04:00
# include <sys/types.h>
# include <sys/stat.h>
# include <sys/wait.h>
# include <fcntl.h>
# include <stdbool.h>
# include <errno.h>
# include <ctype.h>
# include <sys/socket.h>
# include <sys/ioctl.h>
# include <sys/time.h>
# include <time.h>
# include <netinet/in.h>
# include <net/if.h>
# include <linux/sockios.h>
# include <linux/if_tun.h>
# include <sys/uio.h>
# include <termios.h>
# include <getopt.h>
# include <zlib.h>
2007-10-22 05:24:22 +04:00
# include <assert.h>
# include <sched.h>
/*L:110 We can ignore the 30 include files we need for this program, but I do
2007-07-26 21:41:03 +04:00
* want to draw attention to the use of kernel - style types .
*
* As Linus said , " C is a Spartan language, and so should your naming be. " I
* like these abbreviations and the header we need uses them , so we define them
* here .
*/
2007-07-19 12:49:29 +04:00
/* Kernel-style short integer names; the lguest headers included below are
 * written in terms of them. */
typedef unsigned long long	u64;
typedef uint32_t		u32;
typedef uint16_t		u16;
typedef uint8_t			u8;
2007-10-22 04:56:24 +04:00
# include "linux/lguest_launcher.h"
2007-10-22 05:24:22 +04:00
# include "linux/pci_ids.h"
# include "linux/virtio_config.h"
# include "linux/virtio_net.h"
# include "linux/virtio_blk.h"
# include "linux/virtio_console.h"
# include "linux/virtio_ring.h"
2007-10-22 04:56:24 +04:00
# include "asm-x86/e820.h"
2007-07-26 21:41:03 +04:00
/*:*/
2007-07-19 12:49:29 +04:00
/* Flag bits OR-ed into every page table entry the Launcher builds in
 * setup_pagetables().
 * NOTE(review): on x86 the low three PTE bits are Present, Read/Write and
 * User/Supervisor, so the third flag looks like "User" rather than
 * "Execute" -- confirm against the architecture manual. */
# define PAGE_PRESENT 0x7 /* Present, RW, Execute */
/* NOTE(review): not used in the visible part of the file; meaning unverified. */
# define NET_PEERNUM 1
/* Presumably the prefix that selects bridged networking in a device
 * argument -- confirm against the network setup code. */
# define BRIDGE_PFX "bridge:"
/* Fallback definition in case the system headers do not provide it. */
# ifndef SIOCBRADDIF
# define SIOCBRADDIF 0x89a2 /* add interface to bridge */
# endif
2007-10-22 05:03:26 +04:00
/* We can have up to 256 pages for devices. */
# define DEVICE_PAGES 256
2007-10-22 05:24:22 +04:00
/* This fits nicely in a single 4096-byte page. */
# define VIRTQUEUE_NUM 127
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
* this , and although I wouldn ' t recommend it , it works quite nicely here . */
2007-07-19 12:49:29 +04:00
/* Global flag gating the output of verbose(). */
static bool verbose;
/* printf()-style tracing that only prints while the flag is set.  The
 * function-like macro shares the flag's name: "verbose" followed by '(' is
 * the macro, a bare "verbose" is still the variable. */
#define verbose(...) \
	do { if (verbose) printf(__VA_ARGS__); } while (0)
2007-07-26 21:41:03 +04:00
/*:*/
/* The pipe to send commands to the waker process */
2007-07-19 12:49:29 +04:00
static int waker_fd ;
2007-10-22 05:03:26 +04:00
/* The pointer to the start of guest memory. */
static void * guest_base ;
/* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit , guest_max ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* This is our list of devices. */
2007-07-19 12:49:29 +04:00
struct device_list
{
2007-07-26 21:41:03 +04:00
/* Summary information about the devices in our list: ready to pass to
* select ( ) to ask which need servicing . */
2007-07-19 12:49:29 +04:00
fd_set infds ;
int max_infd ;
2007-10-22 05:24:22 +04:00
/* Counter to assign interrupt numbers. */
unsigned int next_irq ;
/* Counter to print out convenient device numbers. */
unsigned int device_num ;
2007-07-26 21:41:03 +04:00
/* The descriptor page for the devices. */
2007-10-22 05:24:22 +04:00
u8 * descpage ;
/* The tail of the last descriptor. */
unsigned int desc_used ;
2007-07-26 21:41:03 +04:00
/* A single linked list of devices. */
2007-07-19 12:49:29 +04:00
struct device * dev ;
2007-07-26 21:41:03 +04:00
/* ... And an end pointer so we can easily append new devices */
2007-07-19 12:49:29 +04:00
struct device * * lastdev ;
} ;
2007-10-22 05:24:22 +04:00
/* The list of Guest devices, based on command line arguments. */
static struct device_list devices ;
2007-07-26 21:41:03 +04:00
/* The device structure describes a single device. */
2007-07-19 12:49:29 +04:00
struct device {
	/* The linked-list pointer. */
	struct device *next;

	/* This device's descriptor, as mapped into the Guest. */
	struct lguest_device_desc *desc;

	/* The name of this device, for --verbose. */
	const char *name;

	/* When handle_input is set, it wants to be called once this file
	 * descriptor is ready. */
	int fd;
	bool (*handle_input)(int fd, struct device *me);

	/* Any queues attached to this device. */
	struct virtqueue *vq;

	/* Device-specific data. */
	void *priv;
};
2007-10-22 05:24:22 +04:00
/* The virtqueue structure describes a queue attached to a device. */
struct virtqueue {
	/* Link to another virtqueue (presumably the next one attached to the
	 * same device -- confirm against the code that adds queues). */
	struct virtqueue *next;

	/* Which device owns me. */
	struct device *dev;

	/* The configuration for this queue. */
	struct lguest_vqconfig config;

	/* The actual ring of buffers. */
	struct vring vring;

	/* Last available index we saw. */
	u16 last_avail_idx;

	/* The routine to call when the Guest pings us. */
	void (*handle_output)(int fd, struct virtqueue *me);
};
/* Since guest is UP and we don't run at the same time, we don't need barriers.
* But I include them in the code in case others copy it . */
/* Compiler-only barrier.  It emits no instruction, but it stops the compiler
 * from caching values in registers or moving memory accesses across it,
 * which is what the call sites rely on ("make sure the buffer is written
 * before we update the index").  An empty expansion gave no such guarantee. */
#define wmb() __asm__ __volatile__("" : : : "memory")
/* Convert an iovec element to the given type.
*
* This is a fairly ugly trick : we need to know the size of the type and
* alignment requirement to check the pointer is kosher . It ' s also nice to
* have the name of the type in case we report failure .
*
* Typing those three things all the time is cumbersome and error prone , so we
* have a macro which sets them all up and passes to the real function . */
#define convert(iov, type) \
	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))

/* Check that one iovec element has exactly the size and alignment of the
 * requested type and hand back its base pointer.
 *
 * iov:   the element to check.
 * size:  required iov_len.
 * align: required alignment of iov_base.
 * name:  type name, used only in the error message.
 *
 * Does not return on a mismatch: errx() prints the reason and exits. */
static void *_convert(struct iovec *iov, size_t size, size_t align,
		      const char *name)
{
	void *base = iov->iov_base;

	if (iov->iov_len != size)
		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
	if (((unsigned long)base % align) != 0)
		errx(1, "Bad alignment %p for %s", base, name);

	return base;
}
/* The virtio configuration space is defined to be little-endian. x86 is
* little - endian too , but it ' s nice to be explicit so we have these helpers . */
/* Identity conversions: both the virtio configuration space and the host
 * are little-endian.  Each macro's parameter name must match the name used
 * in its expansion; le64_to_cpu() previously took "v32" but expanded to
 * "v64", so any use of it failed to compile. */
#define cpu_to_le16(v16) (v16)
#define cpu_to_le32(v32) (v32)
#define cpu_to_le64(v64) (v64)
#define le16_to_cpu(v16) (v16)
#define le32_to_cpu(v32) (v32)
#define le64_to_cpu(v64) (v64)
2007-10-22 05:03:26 +04:00
/*L:100 The Launcher code itself takes us out into userspace, that scary place
* where pointers run wild and free ! Unfortunately , like most userspace
* programs , it ' s quite boring ( which is why everyone likes to hack on the
* kernel ! ) . Perhaps if you make up an Lguest Drinking Game at this point , it
* will get you through this section . Or , maybe not .
*
* The Launcher sets up a big chunk of memory to be the Guest ' s " physical "
* memory and stores it in " guest_base " . In other words , Guest physical = =
* Launcher virtual with an offset .
*
* This can be tough to get your head around , but usually it just means that we
 * use these trivial conversion functions when the Guest gives us its
* " physical " addresses : */
static void *from_guest_phys(unsigned long addr)
{
	/* A Guest-physical address is a byte offset from guest_base. */
	char *base = guest_base;

	return base + addr;
}
static unsigned long to_guest_phys(const void *addr)
{
	/* The inverse mapping: distance in bytes from guest_base. */
	const char *base = guest_base;
	const char *host = addr;

	return host - base;
}
2007-07-26 21:41:03 +04:00
/*L:130
* Loading the Kernel .
*
* We start with couple of simple helper routines . open_or_die ( ) avoids
* error - checking code cluttering the callers : */
2007-07-19 12:49:29 +04:00
static int open_or_die(const char *name, int flags)
{
	const int fd = open(name, flags);

	/* err() prints the reason taken from errno and exits, so callers
	 * only ever see a valid descriptor. */
	if (fd < 0)
		err(1, "Failed to open %s", name);

	return fd;
}
2007-10-22 05:03:26 +04:00
/* map_zeroed_pages() takes a number of pages. */
/* Map "num" pages of private, zero-filled, readable/writable/executable
 * memory backed by /dev/zero.
 *
 * Returns the start of the mapping.  Does not return on failure: err()
 * reports the error and exits. */
static void *map_zeroed_pages(unsigned int num)
{
	int fd = open_or_die("/dev/zero", O_RDONLY);
	/* Multiply in size_t so that, where size_t is wider than unsigned
	 * int, a large page count cannot wrap. */
	size_t len = (size_t)getpagesize() * num;
	void *addr;

	/* We use a private mapping (ie. if we write to the page, it will be
	 * copied). */
	addr = mmap(NULL, len, PROT_READ|PROT_WRITE|PROT_EXEC,
		    MAP_PRIVATE, fd, 0);
	if (addr == MAP_FAILED)
		err(1, "Mmaping %u pages of /dev/zero", num);

	/* A mapping outlives its descriptor, so close it rather than leak
	 * one descriptor per call. */
	close(fd);

	return addr;
}
/* Get some more pages for a device. */
static void *get_pages(unsigned int num)
{
	/* The new pages begin where the current limit ends... */
	void *pages = from_guest_phys(guest_limit);

	/* ...and the limit moves up past them, as long as that still fits
	 * below the maximum. */
	guest_limit += num * getpagesize();
	if (guest_limit > guest_max)
		errx(1, "Not enough memory for devices");

	return pages;
}
2007-07-26 21:41:03 +04:00
/* To find out where to start we look for the magic Guest string, which marks
* the code we see in lguest_asm . S . This is a hack which we are currently
* plotting to replace with the normal Linux entry point . */
2007-10-22 05:03:36 +04:00
/* Scan [start, end) for the magic string and return the Guest-physical
 * address of the byte just after it.
 *
 * start, end: Launcher-virtual bounds of the loaded image.
 *
 * Does not return if the string is absent: errx() exits. */
static unsigned long entry_point(const void *start, const void *end)
{
	static const char magic[] = "GenuineLguest";
	const size_t magic_len = sizeof(magic) - 1;
	const char *p = start;
	const char *stop = end;

	/* The scan gives us the physical starting address.  We boot with
	 * pagetables set up with virtual and physical the same, so that's
	 * OK.
	 *
	 * Only compare where the whole string still fits before "end";
	 * comparing at every p < end read up to magic_len - 1 bytes past the
	 * region we were given. */
	for (; p < stop && (size_t)(stop - p) >= magic_len; p++)
		if (memcmp(p, magic, magic_len) == 0)
			return to_guest_phys(p + magic_len);

	errx(1, "Is this image a genuine lguest?");
}
2007-08-29 01:35:59 +04:00
/* This routine is used to load the kernel or initrd. It tries mmap, but if
* that fails ( Plan 9 ' s kernel file isn ' t nicely aligned on page boundaries ) ,
* it falls back to reading the memory in . */
static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
{
	void *mapped;
	ssize_t got;

	/* First choice: a private, fixed-address mapping of the file.  It is
	 * made writable (and executable) regardless of what the segment
	 * asked for, and being private its pages are only copied once they
	 * are written to. */
	mapped = mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC,
		      MAP_FIXED|MAP_PRIVATE, fd, offset);
	if (mapped == MAP_FAILED) {
		/* Fallback: copy the bytes in.  pread() seeks and reads in
		 * one call. */
		got = pread(fd, addr, len, offset);
		if (got != len)
			err(1, "Reading offset %lu len %lu gave %zi",
			    offset, len, got);
	}
}
2007-07-26 21:41:03 +04:00
/* This routine takes an open vmlinux image, which is in ELF, and maps it into
 * the Guest memory.  ELF = Executable and Linkable Format, which is the format used
* by all modern binaries on Linux including the kernel .
*
* The ELF headers give * two * addresses : a physical address , and a virtual
2007-10-22 05:03:36 +04:00
* address . We use the physical address ; the Guest will map itself to the
* virtual address .
2007-07-26 21:41:03 +04:00
*
* We return the starting address . */
2007-10-22 05:03:36 +04:00
/* Map every PT_LOAD segment of an ELF kernel image at its physical address
 * inside Guest memory.
 *
 * elf_fd: open descriptor on the image.
 * ehdr:   the ELF header already read from it.
 *
 * Returns whatever entry_point() finds in the mapped range.  Does not
 * return on a malformed header or I/O error (errx()/err() exit). */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
{
	void *start = (void *)-1, *end = NULL;
	unsigned int i;

	/* Sanity checks on the main ELF header: an x86 executable with a
	 * reasonable number of correctly-sized program headers.  This must
	 * come before e_phnum is used to size the array below: a
	 * zero-length variable-length array is undefined behaviour. */
	if (ehdr->e_type != ET_EXEC
	    || ehdr->e_machine != EM_386
	    || ehdr->e_phentsize != sizeof(Elf32_Phdr)
	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
		errx(1, "Malformed elf header");

	/* e_phnum is now known to be non-zero and bounded. */
	Elf32_Phdr phdr[ehdr->e_phnum];

	/* An ELF executable contains an ELF header and a number of "program"
	 * headers which indicate which parts ("segments") of the program to
	 * load where.  We read in all the program headers at once: */
	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
		err(1, "Seeking to program headers");
	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
		err(1, "Reading program headers");

	for (i = 0; i < ehdr->e_phnum; i++) {
		void *seg, *seg_end;

		/* If this isn't a loadable segment, we ignore it */
		if (phdr[i].p_type != PT_LOAD)
			continue;

		verbose("Section %i: size %i addr %p\n",
			i, phdr[i].p_memsz,
			(void *)(unsigned long)phdr[i].p_paddr);

		seg = from_guest_phys(phdr[i].p_paddr);
		seg_end = seg + phdr[i].p_filesz;

		/* We track the first and last address we mapped, so we can
		 * tell entry_point() where to scan. */
		if (seg < start)
			start = seg;
		if (seg_end > end)
			end = seg_end;

		/* We map this section of the file at its physical address. */
		map_at(elf_fd, seg, phdr[i].p_offset, phdr[i].p_filesz);
	}

	return entry_point(start, end);
}
2007-07-26 21:41:03 +04:00
/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
* which need loading are extracted and compressed raw . This denies us the
* information we need to make a fully - general loader . */
2007-10-22 05:03:36 +04:00
static unsigned long unpack_bzimage ( int fd )
2007-07-19 12:49:29 +04:00
{
gzFile f ;
int ret , len = 0 ;
2007-07-26 21:41:03 +04:00
/* A bzImage always gets loaded at physical address 1M. This is
* actually configurable as CONFIG_PHYSICAL_START , but as the comment
* there says , " Don't change this unless you know what you are doing " .
* Indeed . */
2007-10-22 05:03:26 +04:00
void * img = from_guest_phys ( 0x100000 ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* gzdopen takes our file descriptor (carefully placed at the start of
* the GZIP header we found ) and returns a gzFile . */
2007-07-19 12:49:29 +04:00
f = gzdopen ( fd , " rb " ) ;
2007-07-26 21:41:03 +04:00
/* We read it into memory in 64k chunks until we hit the end. */
2007-07-19 12:49:29 +04:00
while ( ( ret = gzread ( f , img + len , 65536 ) ) > 0 )
len + = ret ;
if ( ret < 0 )
err ( 1 , " reading image from bzImage " ) ;
verbose ( " Unpacked size %i addr %p \n " , len , img ) ;
2007-07-26 21:41:03 +04:00
2007-10-22 05:03:36 +04:00
return entry_point ( img , img + len ) ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:03 +04:00
/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
* supposed to jump into it and it will unpack itself . We can ' t do that
* because the Guest can ' t run the unpacking code , and adding features to
* lguest kills puppies , so we don ' t want to .
*
* The bzImage is formed by putting the decompressing code in front of the
 * compressed kernel code.  So we can simply scan through it looking for the
* first " gzip " header , and start decompressing from there . */
2007-10-22 05:03:36 +04:00
/* Scan forward from fd's current offset for a gzip header written "under
 * UNIX", rewind to its first byte and unpack from there.
 *
 * Returns the result of unpack_bzimage().  Does not return if no such header
 * is found or a seek fails.
 *
 * "state" counts how many header bytes have matched so far:
 *   0      looking for 0x1F
 *   1      saw 0x1F, want 0x8B
 *   2..8   skipping method, flags, mtime (4 bytes) and extra flags
 *   9      the current byte is the "compressed-by" (OS) field. */
static unsigned long load_bzimage(int fd)
{
	unsigned char c;
	int state = 0;

	/* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
	while (read(fd, &c, 1) == 1) {
		if (state == 0) {
			if (c == 0x1F)
				state = 1;
		} else if (state == 1) {
			if (c == 0x8B)
				state = 2;
			else if (c != 0x1F)
				/* A repeated 0x1F may itself start the
				 * header, so only other bytes reset us. */
				state = 0;
		} else if (state < 9) {
			state++;
		} else if (c == 0x03) {
			/* One final check passed: "compressed under UNIX".
			 * Seek back to the start of the gzip header. */
			if (lseek(fd, -10, SEEK_CUR) < 0)
				err(1, "Seeking back to gzip header");
			return unpack_bzimage(fd);
		} else {
			/* False alarm.  Resume the search at the byte after
			 * the 0x1F that began this candidate; parking the
			 * state machine in a dead state here used to
			 * abandon the whole scan. */
			if (lseek(fd, -9, SEEK_CUR) < 0)
				err(1, "Seeking within bzImage");
			state = 0;
		}
	}
	errx(1, "Could not find kernel in bzImage");
}
2007-07-26 21:41:03 +04:00
/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
* come wrapped up in the self - decompressing " bzImage " format . With some funky
* coding , we can load those , too . */
2007-10-22 05:03:36 +04:00
static unsigned long load_kernel(int fd)
{
	Elf32_Ehdr hdr;
	bool is_elf;

	/* Pull in an ELF header's worth of bytes from the front of the file. */
	if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
		err(1, "Reading kernel");

	/* An ELF file announces itself with "\177ELF"; anything else is
	 * assumed to be a bzImage. */
	is_elf = (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0);

	return is_elf ? map_elf(fd, &hdr) : load_bzimage(fd);
}
2007-07-26 21:41:03 +04:00
/* This is a trivial little helper to align pages. Andi Kleen hated it because
* it calls getpagesize ( ) twice : " it's dumb code. "
*
* Kernel guys get really het up about optimization , even when it ' s not
* necessary . I leave this code as a reaction against that . */
2007-07-19 12:49:29 +04:00
static inline unsigned long page_align(unsigned long addr)
{
	/* Step up to (at most) the last byte of the page... */
	const unsigned long bumped = addr + getpagesize() - 1;

	/* ...then clear the offset-within-page bits. */
	return bumped & ~(getpagesize() - 1);
}
2007-07-26 21:41:03 +04:00
/*L:180 An "initial ram disk" is a disk image loaded into memory along with
* the kernel which the kernel can use to boot from without needing any
* drivers . Most distributions now use this as standard : the initrd contains
* the code to load the appropriate driver modules for the current machine .
*
* Importantly , James Morris works for RedHat , and Fedora uses initrds for its
* kernels . He sent me this ( and tells me when I break it ) . */
2007-07-19 12:49:29 +04:00
/* Load the initrd file "name" so that it ends at Guest-physical address
 * "mem" (the top of memory).
 *
 * Returns the page-aligned size reserved for it.  Does not return if the
 * file cannot be opened, examined or read, or if it does not fit in "mem"
 * bytes. */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
	int ifd;
	struct stat st;
	unsigned long len;

	ifd = open_or_die(name, O_RDONLY);
	/* fstat() is needed to get the file size. */
	if (fstat(ifd, &st) < 0)
		err(1, "fstat() on initrd '%s'", name);

	/* Reject a file that cannot fit before doing any arithmetic on its
	 * size: otherwise "mem - len" below wraps and we would map the file
	 * at a wild fixed address. */
	if (st.st_size < 0 || (unsigned long long)st.st_size > mem)
		errx(1, "initrd '%s' does not fit in %lu bytes of memory",
		     name, mem);

	/* We map the initrd at the top of memory, but mmap wants it to be
	 * page-aligned, so we round the size up for that. */
	len = page_align(st.st_size);
	if (len > mem)
		errx(1, "initrd '%s' does not fit in %lu bytes of memory",
		     name, mem);

	map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
	/* Once a file is mapped, you can close the file descriptor.  It's a
	 * little odd, but quite useful. */
	close(ifd);
	verbose("mapped initrd %s size=%lu @ %p\n", name, len,
		(void *)(mem - len));

	/* We return the initrd size. */
	return len;
}
2007-10-22 05:03:36 +04:00
/* Once we know how much memory we have, we can construct simple linear page
* tables which set virtual = = physical which will get the Guest far enough
2007-10-22 05:03:26 +04:00
* into the boot to create its own .
2007-07-26 21:41:03 +04:00
*
* We lay them out of the way , just below the initrd ( which is why we need to
* know its size ) . */
2007-07-19 12:49:29 +04:00
static unsigned long setup_pagetables ( unsigned long mem ,
2007-10-22 05:03:36 +04:00
unsigned long initrd_size )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:03:31 +04:00
unsigned long * pgdir , * linear ;
2007-07-19 12:49:29 +04:00
unsigned int mapped_pages , i , linear_pages ;
2007-10-22 05:03:31 +04:00
unsigned int ptes_per_page = getpagesize ( ) / sizeof ( void * ) ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:03:36 +04:00
mapped_pages = mem / getpagesize ( ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* Each PTE page can map ptes_per_page pages: how many do we need? */
2007-07-19 12:49:29 +04:00
linear_pages = ( mapped_pages + ptes_per_page - 1 ) / ptes_per_page ;
2007-07-26 21:41:03 +04:00
/* We put the toplevel page directory page at the top of memory. */
2007-10-22 05:03:26 +04:00
pgdir = from_guest_phys ( mem ) - initrd_size - getpagesize ( ) ;
2007-07-26 21:41:03 +04:00
/* Now we use the next linear_pages pages as pte pages */
2007-07-19 12:49:29 +04:00
linear = ( void * ) pgdir - linear_pages * getpagesize ( ) ;
2007-07-26 21:41:03 +04:00
/* Linear mapping is easy: put every page's address into the mapping in
* order . PAGE_PRESENT contains the flags Present , Writable and
* Executable . */
2007-07-19 12:49:29 +04:00
for ( i = 0 ; i < mapped_pages ; i + + )
linear [ i ] = ( ( i * getpagesize ( ) ) | PAGE_PRESENT ) ;
2007-10-22 05:03:36 +04:00
/* The top level points to the linear page table pages above. */
2007-07-19 12:49:29 +04:00
for ( i = 0 ; i < mapped_pages ; i + = ptes_per_page ) {
2007-10-22 05:03:36 +04:00
pgdir [ i / ptes_per_page ]
2007-10-22 05:03:31 +04:00
= ( ( to_guest_phys ( linear ) + i * sizeof ( void * ) )
2007-10-22 05:03:26 +04:00
| PAGE_PRESENT ) ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:03:26 +04:00
verbose ( " Linear mapping of %u pages in %u pte pages at %#lx \n " ,
mapped_pages , linear_pages , to_guest_phys ( linear ) ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* We return the top level (guest-physical) address: the kernel needs
* to know where it is . */
2007-10-22 05:03:26 +04:00
return to_guest_phys ( pgdir ) ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:03 +04:00
/* Simple routine to roll all the commandline arguments together with spaces
* between them . */
2007-07-19 12:49:29 +04:00
/* NOTE(review): dst is written without any bound; the caller must provide
 * room for every argument plus one space each and the terminator. */
static void concat(char *dst, char *args[])
{
	char *out = dst;
	char **arg;

	for (arg = args; *arg; arg++) {
		size_t n = strlen(*arg);

		/* Copy the word and follow it with a single space. */
		memcpy(out, *arg, n);
		out[n] = ' ';
		out += n + 1;
	}
	/* Terminate; this also covers an empty argument list. */
	*out = '\0';
}
2007-07-26 21:41:03 +04:00
/* This is where we actually tell the kernel to initialize the Guest. We saw
* the arguments it expects when we looked at initialize ( ) in lguest_user . c :
2007-10-22 05:03:26 +04:00
* the base of guest " physical " memory , the top physical page to allow , the
2007-10-22 05:03:36 +04:00
* top level pagetable and the entry point for the Guest . */
static int tell_kernel ( unsigned long pgdir , unsigned long start )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:03:31 +04:00
unsigned long args [ ] = { LHREQ_INITIALIZE ,
( unsigned long ) guest_base ,
2007-10-22 05:03:36 +04:00
guest_limit / getpagesize ( ) , pgdir , start } ;
2007-07-19 12:49:29 +04:00
int fd ;
2007-10-22 05:03:26 +04:00
verbose ( " Guest: %p - %p (%#lx) \n " ,
guest_base , guest_base + guest_limit , guest_limit ) ;
2007-07-19 12:49:29 +04:00
fd = open_or_die ( " /dev/lguest " , O_RDWR ) ;
if ( write ( fd , args , sizeof ( args ) ) < 0 )
err ( 1 , " Writing to /dev/lguest " ) ;
2007-07-26 21:41:03 +04:00
/* We return the /dev/lguest file descriptor to control this Guest */
2007-07-19 12:49:29 +04:00
return fd ;
}
2007-07-26 21:41:03 +04:00
/*:*/
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
static void add_device_fd ( int fd )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:24:22 +04:00
FD_SET ( fd , & devices . infds ) ;
if ( fd > devices . max_infd )
devices . max_infd = fd ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:03 +04:00
/*L:200
* The Waker .
*
* With a console and network devices , we can have lots of input which we need
* to process . We could try to tell the kernel what file descriptors to watch ,
* but handing a file descriptor mask through to the kernel is fairly icky .
*
* Instead , we fork off a process which watches the file descriptors and writes
* the LHREQ_BREAK command to the / dev / lguest filedescriptor to tell the Host
* loop to stop running the Guest . This causes it to return from the
* / dev / lguest read with - EAGAIN , where it will write to / dev / lguest to reset
* the LHREQ_BREAK and wake us up again .
*
* This , of course , is merely a different * kind * of icky .
*/
2007-10-22 05:24:22 +04:00
static void wake_parent ( int pipefd , int lguest_fd )
2007-07-19 12:49:29 +04:00
{
2007-07-26 21:41:03 +04:00
/* Add the pipe from the Launcher to the fdset in the device_list, so
* we watch it , too . */
2007-10-22 05:24:22 +04:00
add_device_fd ( pipefd ) ;
2007-07-19 12:49:29 +04:00
for ( ; ; ) {
2007-10-22 05:24:22 +04:00
fd_set rfds = devices . infds ;
2007-10-22 05:03:31 +04:00
unsigned long args [ ] = { LHREQ_BREAK , 1 } ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* Wait until input is ready from one of the devices. */
2007-10-22 05:24:22 +04:00
select ( devices . max_infd + 1 , & rfds , NULL , NULL , NULL ) ;
2007-07-26 21:41:03 +04:00
/* Is it a message from the Launcher? */
2007-07-19 12:49:29 +04:00
if ( FD_ISSET ( pipefd , & rfds ) ) {
int ignorefd ;
2007-07-26 21:41:03 +04:00
/* If read() returns 0, it means the Launcher has
* exited . We silently follow . */
2007-07-19 12:49:29 +04:00
if ( read ( pipefd , & ignorefd , sizeof ( ignorefd ) ) = = 0 )
exit ( 0 ) ;
2007-07-26 21:41:03 +04:00
/* Otherwise it's telling us there's a problem with one
* of the devices , and we should ignore that file
* descriptor from now on . */
2007-10-22 05:24:22 +04:00
FD_CLR ( ignorefd , & devices . infds ) ;
2007-07-26 21:41:03 +04:00
} else /* Send LHREQ_BREAK command. */
2007-07-19 12:49:29 +04:00
write ( lguest_fd , args , sizeof ( args ) ) ;
}
}
2007-07-26 21:41:03 +04:00
/* This routine just sets up a pipe to the Waker process. */
2007-10-22 05:24:22 +04:00
/* Fork the Waker process and connect it to us with a pipe.
 *
 * lguest_fd: the /dev/lguest descriptor the Waker will write to.
 *
 * Returns the write end of the pipe, used to talk to the Waker.  Does not
 * return if the pipe or the fork cannot be created. */
static int setup_waker(int lguest_fd)
{
	int pipefd[2], child;

	/* We create a pipe to talk to the waker, and also so it knows when the
	 * Launcher dies (and closes pipe).  If pipe() fails, pipefd holds
	 * garbage, so we must not carry on. */
	if (pipe(pipefd) != 0)
		err(1, "Creating pipe for Waker");

	child = fork();
	if (child == -1)
		err(1, "forking");

	if (child == 0) {
		/* Close the "writing" end of our copy of the pipe */
		close(pipefd[1]);
		/* wake_parent() loops until it exits; it does not return. */
		wake_parent(pipefd[0], lguest_fd);
	}

	/* Close the reading end of our copy of the pipe. */
	close(pipefd[0]);

	/* Here is the fd used to talk to the waker. */
	return pipefd[1];
}
2007-07-26 21:41:03 +04:00
/*L:210
* Device Handling .
*
* When the Guest sends DMA to us , it sends us an array of addresses and sizes .
* We need to make sure it ' s not trying to reach into the Launcher itself , so
* we have a convenient routine which check it and exits with an error message
* if something funny is going on :
*/
2007-07-19 12:49:29 +04:00
static void * _check_pointer ( unsigned long addr , unsigned int size ,
unsigned int line )
{
2007-07-26 21:41:03 +04:00
/* We have to separately check addr and addr+size, because size could
* be huge and addr + size might wrap around . */
2007-10-22 05:03:26 +04:00
if ( addr > = guest_limit | | addr + size > = guest_limit )
2007-10-22 05:24:22 +04:00
errx ( 1 , " %s:%i: Invalid address %#lx " , __FILE__ , line , addr ) ;
2007-07-26 21:41:03 +04:00
/* We return a pointer for the caller's convenience, now we know it's
* safe to use . */
2007-10-22 05:03:26 +04:00
return from_guest_phys ( addr ) ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:03 +04:00
/* A macro which transparently hands the line number to the real function. */
2007-07-19 12:49:29 +04:00
# define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
2007-10-22 05:24:22 +04:00
/* This simply sets up an iovec array where we can put data to be discarded.
* This happens when the Guest doesn ' t want or can ' t handle the input : we have
* to get rid of it somewhere , and if we bury it in the ceiling space it will
* start to smell after a week . */
static void discard_iovec(struct iovec *iov, unsigned int *num)
{
	/* One shared scratch buffer; whatever lands in it is never read. */
	static char discard_buf[1024];

	iov->iov_len = sizeof(discard_buf);
	iov->iov_base = discard_buf;
	/* The resulting array has exactly one element. */
	*num = 1;
}
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/* This function returns the next descriptor in the chain, or vq->vring.num. */
static unsigned next_desc ( struct virtqueue * vq , unsigned int i )
{
unsigned int next ;
/* If this descriptor says it doesn't chain, we're done. */
if ( ! ( vq - > vring . desc [ i ] . flags & VRING_DESC_F_NEXT ) )
return vq - > vring . num ;
/* Check they're not leading us off end of descriptors. */
next = vq - > vring . desc [ i ] . next ;
/* Make sure compiler knows to grab that: we don't want it changing! */
wmb ( ) ;
if ( next > = vq - > vring . num )
errx ( 1 , " Desc next is %u " , next ) ;
return next ;
}
/* This looks in the virtqueue for the first available buffer, and converts
* it to an iovec for convenient access . Since descriptors consist of some
* number of output then some number of input descriptors , it ' s actually two
* iovecs , but we pack them into one and note how many of each there were .
*
* This function returns the descriptor number found , or vq - > vring . num ( which
* is never a valid descriptor number ) if none was found . */
static unsigned get_vq_desc ( struct virtqueue * vq ,
struct iovec iov [ ] ,
unsigned int * out_num , unsigned int * in_num )
{
unsigned int i , head ;
/* Check it isn't doing very strange things with descriptor numbers. */
if ( ( u16 ) ( vq - > vring . avail - > idx - vq - > last_avail_idx ) > vq - > vring . num )
errx ( 1 , " Guest moved used index from %u to %u " ,
vq - > last_avail_idx , vq - > vring . avail - > idx ) ;
/* If there's nothing new since last we looked, return invalid. */
if ( vq - > vring . avail - > idx = = vq - > last_avail_idx )
return vq - > vring . num ;
/* Grab the next descriptor number they're advertising, and increment
* the index we ' ve seen . */
head = vq - > vring . avail - > ring [ vq - > last_avail_idx + + % vq - > vring . num ] ;
/* If their number is silly, that's a fatal mistake. */
if ( head > = vq - > vring . num )
errx ( 1 , " Guest says index %u is available " , head ) ;
/* When we start there are none of either input nor output. */
* out_num = * in_num = 0 ;
i = head ;
do {
/* Grab the first descriptor, and check it's OK. */
iov [ * out_num + * in_num ] . iov_len = vq - > vring . desc [ i ] . len ;
iov [ * out_num + * in_num ] . iov_base
= check_pointer ( vq - > vring . desc [ i ] . addr ,
vq - > vring . desc [ i ] . len ) ;
/* If this is an input descriptor, increment that count. */
if ( vq - > vring . desc [ i ] . flags & VRING_DESC_F_WRITE )
( * in_num ) + + ;
else {
/* If it's an output descriptor, they're all supposed
* to come before any input descriptors . */
if ( * in_num )
errx ( 1 , " Descriptor has out after in " ) ;
( * out_num ) + + ;
}
/* If we've got too many, that implies a descriptor loop. */
if ( * out_num + * in_num > vq - > vring . num )
errx ( 1 , " Looped descriptor " ) ;
} while ( ( i = next_desc ( vq , i ) ) ! = vq - > vring . num ) ;
2007-07-26 21:41:03 +04:00
2007-10-22 05:24:22 +04:00
return head ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* Once we've used one of their buffers, we tell them about it. We'll then
* want to send them an interrupt , using trigger_irq ( ) . */
static void add_used ( struct virtqueue * vq , unsigned int head , int len )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:24:22 +04:00
struct vring_used_elem * used ;
/* Get a pointer to the next entry in the used ring. */
used = & vq - > vring . used - > ring [ vq - > vring . used - > idx % vq - > vring . num ] ;
used - > id = head ;
used - > len = len ;
/* Make sure buffer is written before we update index. */
wmb ( ) ;
vq - > vring . used - > idx + + ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* This actually sends the interrupt for this virtqueue */
static void trigger_irq ( int fd , struct virtqueue * vq )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:24:22 +04:00
unsigned long buf [ ] = { LHREQ_IRQ , vq - > config . irq } ;
if ( vq - > vring . avail - > flags & VRING_AVAIL_F_NO_INTERRUPT )
return ;
/* Send the Guest an interrupt tell them we used something up. */
2007-07-19 12:49:29 +04:00
if ( write ( fd , buf , sizeof ( buf ) ) ! = 0 )
2007-10-22 05:24:22 +04:00
err ( 1 , " Triggering irq %i " , vq - > config . irq ) ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* Convenience wrapper: publish the used buffer first, then interrupt the
 * Guest so it notices. */
static void add_used_and_trigger(int fd, struct virtqueue *vq,
				 unsigned int head, int len)
{
	add_used(vq, head, len);
	trigger_irq(fd, vq);
}
2007-07-26 21:41:03 +04:00
/* Terminal settings of standard input as they were before we changed them. */
static struct termios orig_term;

/* Put standard input back the way we found it, so the user can see what
 * they type once we are gone. */
static void restore_term(void)
{
	tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
2007-07-26 21:41:03 +04:00
/* Per-console bookkeeping for the "three ^C to exit" hack. */
struct console_abort {
	/* Number of consecutive ^C keypresses seen so far. */
	int count;
	/* Time at which the first ^C of the current run arrived. */
	struct timeval start;
};
2007-07-26 21:41:03 +04:00
/* This is the routine which handles console input (ie. stdin). */
2007-07-19 12:49:29 +04:00
static bool handle_console_input ( int fd , struct device * dev )
{
int len ;
2007-10-22 05:24:22 +04:00
unsigned int head , in_num , out_num ;
struct iovec iov [ dev - > vq - > vring . num ] ;
2007-07-19 12:49:29 +04:00
struct console_abort * abort = dev - > priv ;
2007-10-22 05:24:22 +04:00
/* First we need a console buffer from the Guests's input virtqueue. */
head = get_vq_desc ( dev - > vq , iov , & out_num , & in_num ) ;
if ( head = = dev - > vq - > vring . num ) {
/* If they're not ready for input, we warn and set up to
* discard . */
warnx ( " console: no dma buffer! " ) ;
discard_iovec ( iov , & in_num ) ;
} else if ( out_num )
errx ( 1 , " Output buffers in console in queue? " ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* This is why we convert to iovecs: the readv() call uses them, and so
* it reads straight into the Guest ' s buffer . */
2007-10-22 05:24:22 +04:00
len = readv ( dev - > fd , iov , in_num ) ;
2007-07-19 12:49:29 +04:00
if ( len < = 0 ) {
2007-07-26 21:41:03 +04:00
/* This implies that the console is closed, is /dev/null, or
2007-10-22 05:24:22 +04:00
* something went terribly wrong . */
2007-07-19 12:49:29 +04:00
warnx ( " Failed to get console input, ignoring console. " ) ;
2007-10-22 05:24:22 +04:00
/* Put the input terminal back and return failure (meaning,
* don ' t call us again ) . */
restore_term ( ) ;
return false ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* If we actually read the data into the Guest, tell them about it. */
if ( head ! = dev - > vq - > vring . num )
add_used_and_trigger ( fd , dev - > vq , head , len ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* Three ^C within one second? Exit.
*
* This is such a hack , but works surprisingly well . Each ^ C has to be
* in a buffer by itself , so they can ' t be too fast . But we check that
* we get three within about a second , so they can ' t be too slow . */
2007-07-19 12:49:29 +04:00
if ( len = = 1 & & ( ( char * ) iov [ 0 ] . iov_base ) [ 0 ] = = 3 ) {
if ( ! abort - > count + + )
gettimeofday ( & abort - > start , NULL ) ;
else if ( abort - > count = = 3 ) {
struct timeval now ;
gettimeofday ( & now , NULL ) ;
if ( now . tv_sec < = abort - > start . tv_sec + 1 ) {
2007-10-22 05:03:31 +04:00
unsigned long args [ ] = { LHREQ_BREAK , 0 } ;
2007-07-26 21:41:03 +04:00
/* Close the fd so Waker will know it has to
* exit . */
2007-07-19 12:49:29 +04:00
close ( waker_fd ) ;
2007-07-26 21:41:03 +04:00
/* Just in case waker is blocked in BREAK, send
* unbreak now . */
2007-07-19 12:49:29 +04:00
write ( fd , args , sizeof ( args ) ) ;
exit ( 2 ) ;
}
abort - > count = 0 ;
}
} else
2007-07-26 21:41:03 +04:00
/* Any other key resets the abort counter. */
2007-07-19 12:49:29 +04:00
abort - > count = 0 ;
2007-07-26 21:41:03 +04:00
/* Everything went OK! */
2007-07-19 12:49:29 +04:00
return true ;
}
2007-10-22 05:24:22 +04:00
/* Handling output for console is simple: we just get all the output buffers
* and write them to stdout . */
static void handle_console_output ( int fd , struct virtqueue * vq )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:24:22 +04:00
unsigned int head , out , in ;
int len ;
struct iovec iov [ vq - > vring . num ] ;
/* Keep getting output buffers from the Guest until we run out. */
while ( ( head = get_vq_desc ( vq , iov , & out , & in ) ) ! = vq - > vring . num ) {
if ( in )
errx ( 1 , " Input buffers in output queue? " ) ;
len = writev ( STDOUT_FILENO , iov , out ) ;
add_used_and_trigger ( fd , vq , head , len ) ;
}
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* Handling output for network is also simple: we get all the output buffers
* and write them ( ignoring the first element ) to this device ' s file descriptor
* ( stdout ) . */
static void handle_net_output ( int fd , struct virtqueue * vq )
2007-07-19 12:49:29 +04:00
{
2007-10-22 05:24:22 +04:00
unsigned int head , out , in ;
int len ;
struct iovec iov [ vq - > vring . num ] ;
/* Keep getting output buffers from the Guest until we run out. */
while ( ( head = get_vq_desc ( vq , iov , & out , & in ) ) ! = vq - > vring . num ) {
if ( in )
errx ( 1 , " Input buffers in output queue? " ) ;
/* Check header, but otherwise ignore it (we said we supported
* no features ) . */
( void ) convert ( & iov [ 0 ] , struct virtio_net_hdr ) ;
len = writev ( vq - > dev - > fd , iov + 1 , out - 1 ) ;
add_used_and_trigger ( fd , vq , head , len ) ;
}
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* This is where we handle a packet coming in from the tun device to our
* Guest . */
2007-07-19 12:49:29 +04:00
static bool handle_tun_input ( int fd , struct device * dev )
{
2007-10-22 05:24:22 +04:00
unsigned int head , in_num , out_num ;
2007-07-19 12:49:29 +04:00
int len ;
2007-10-22 05:24:22 +04:00
struct iovec iov [ dev - > vq - > vring . num ] ;
struct virtio_net_hdr * hdr ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/* First we need a network buffer from the Guests's recv virtqueue. */
head = get_vq_desc ( dev - > vq , iov , & out_num , & in_num ) ;
if ( head = = dev - > vq - > vring . num ) {
2007-07-26 21:41:03 +04:00
/* Now, it's expected that if we try to send a packet too
2007-10-22 05:24:22 +04:00
* early , the Guest won ' t be ready yet . Wait until the device
* status says it ' s ready . */
/* FIXME: Actually want DRIVER_ACTIVE here. */
if ( dev - > desc - > status & VIRTIO_CONFIG_S_DRIVER_OK )
2007-07-19 12:49:29 +04:00
warn ( " network: no dma buffer! " ) ;
2007-10-22 05:24:22 +04:00
discard_iovec ( iov , & in_num ) ;
} else if ( out_num )
errx ( 1 , " Output buffers in network recv queue? " ) ;
/* First element is the header: we set it to 0 (no features). */
hdr = convert ( & iov [ 0 ] , struct virtio_net_hdr ) ;
hdr - > flags = 0 ;
hdr - > gso_type = VIRTIO_NET_HDR_GSO_NONE ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* Read the packet from the device directly into the Guest's buffer. */
2007-10-22 05:24:22 +04:00
len = readv ( dev - > fd , iov + 1 , in_num - 1 ) ;
2007-07-19 12:49:29 +04:00
if ( len < = 0 )
err ( 1 , " reading network " ) ;
2007-07-26 21:41:03 +04:00
2007-10-22 05:24:22 +04:00
/* If we actually read the data into the Guest, tell them about it. */
if ( head ! = dev - > vq - > vring . num )
add_used_and_trigger ( fd , dev - > vq , head , sizeof ( * hdr ) + len ) ;
2007-07-19 12:49:29 +04:00
verbose ( " tun input packet len %i [%02x %02x] (%s) \n " , len ,
2007-10-22 05:24:22 +04:00
( ( u8 * ) iov [ 1 ] . iov_base ) [ 0 ] , ( ( u8 * ) iov [ 1 ] . iov_base ) [ 1 ] ,
head ! = dev - > vq - > vring . num ? " sent " : " discarded " ) ;
2007-07-26 21:41:03 +04:00
/* All good. */
2007-07-19 12:49:29 +04:00
return true ;
}
2007-10-22 05:24:22 +04:00
/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
static void handle_output ( int fd , unsigned long addr )
2007-07-19 12:49:29 +04:00
{
struct device * i ;
2007-10-22 05:24:22 +04:00
struct virtqueue * vq ;
/* Check each virtqueue. */
for ( i = devices . dev ; i ; i = i - > next ) {
for ( vq = i - > vq ; vq ; vq = vq - > next ) {
if ( vq - > config . pfn = = addr / getpagesize ( )
& & vq - > handle_output ) {
verbose ( " Output to %s \n " , vq - > dev - > name ) ;
vq - > handle_output ( fd , vq ) ;
return ;
}
2007-07-19 12:49:29 +04:00
}
}
2007-07-26 21:41:03 +04:00
2007-10-22 05:24:22 +04:00
/* Early console write is done using notify on a nul-terminated string
* in Guest memory . */
if ( addr > = guest_limit )
errx ( 1 , " Bad NOTIFY %#lx " , addr ) ;
write ( STDOUT_FILENO , from_guest_phys ( addr ) ,
strnlen ( from_guest_phys ( addr ) , guest_limit - addr ) ) ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:03 +04:00
/* This is called when the waker wakes us up: check for incoming file
* descriptors . */
2007-10-22 05:24:22 +04:00
static void handle_input ( int fd )
2007-07-19 12:49:29 +04:00
{
2007-07-26 21:41:03 +04:00
/* select() wants a zeroed timeval to mean "don't wait". */
2007-07-19 12:49:29 +04:00
struct timeval poll = { . tv_sec = 0 , . tv_usec = 0 } ;
for ( ; ; ) {
struct device * i ;
2007-10-22 05:24:22 +04:00
fd_set fds = devices . infds ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* If nothing is ready, we're done. */
2007-10-22 05:24:22 +04:00
if ( select ( devices . max_infd + 1 , & fds , NULL , NULL , & poll ) = = 0 )
2007-07-19 12:49:29 +04:00
break ;
2007-07-26 21:41:03 +04:00
/* Otherwise, call the device(s) which have readable
* file descriptors and a method of handling them . */
2007-10-22 05:24:22 +04:00
for ( i = devices . dev ; i ; i = i - > next ) {
2007-07-19 12:49:29 +04:00
if ( i - > handle_input & & FD_ISSET ( i - > fd , & fds ) ) {
2007-07-26 21:41:03 +04:00
/* If handle_input() returns false, it means we
* should no longer service it .
* handle_console_input ( ) does this . */
2007-07-19 12:49:29 +04:00
if ( ! i - > handle_input ( fd , i ) ) {
2007-07-26 21:41:03 +04:00
/* Clear it from the set of input file
* descriptors kept at the head of the
* device list . */
2007-10-22 05:24:22 +04:00
FD_CLR ( i - > fd , & devices . infds ) ;
2007-07-19 12:49:29 +04:00
/* Tell waker to ignore it too... */
write ( waker_fd , & i - > fd , sizeof ( i - > fd ) ) ;
}
}
}
}
}
2007-07-26 21:41:03 +04:00
/*L:190
 * Device Setup
 *
 * Every device needs a descriptor so the Guest knows it exists, and a "struct
 * device" so the Launcher can keep track of it.  The helpers below allocate
 * both.
 *
 * This one carves a new "struct lguest_device_desc" out of the descriptor
 * page and returns a pointer to it. */
static struct lguest_device_desc *new_dev_desc(u16 type)
{
	struct lguest_device_desc *desc;

	/* All descriptors have to fit into a single page. */
	if (devices.desc_used + sizeof(*desc) > getpagesize())
		errx(1, " Too many devices ");

	/* Take the next free bytes of the page.  Only the type is filled
	 * in; the other fields are left as they are in the page. */
	desc = (void *)devices.descpage + devices.desc_used;
	devices.desc_used += sizeof(*desc);
	desc->type = type;

	return desc;
}
/* Configuration fields follow each device descriptor in the descriptor page,
 * each laid out as: u8 type, u8 len, then len bytes of data.
 *
 * This appends one such field to dev's descriptor.  It can only be used on
 * the most recently created device, which the assert() enforces. */
static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c)
{
	/* The end of this device's config must be the end of the used area. */
	assert(devices.descpage + devices.desc_used
	       == (u8 *)(dev->desc + 1) + dev->desc->config_len);

	/* Two header bytes plus the payload still have to fit in the page. */
	if (devices.desc_used + 2 + len > getpagesize())
		errx(1, " Too many devices ");

	/* Header first (type, then length), followed by the payload. */
	devices.descpage[devices.desc_used] = type;
	devices.descpage[devices.desc_used + 1] = len;
	memcpy(devices.descpage + devices.desc_used + 2, c, len);

	/* Account for the new bytes in the page and in the descriptor. */
	devices.desc_used += 2 + len;
	dev->desc->config_len += 2 + len;
}
/* This routine adds a virtqueue to a device. We specify how many descriptors
* the virtqueue is to have . */
static void add_virtqueue ( struct device * dev , unsigned int num_descs ,
void ( * handle_output ) ( int fd , struct virtqueue * me ) )
{
unsigned int pages ;
struct virtqueue * * i , * vq = malloc ( sizeof ( * vq ) ) ;
void * p ;
/* First we need some pages for this virtqueue. */
pages = ( vring_size ( num_descs ) + getpagesize ( ) - 1 ) / getpagesize ( ) ;
p = get_pages ( pages ) ;
/* Initialize the configuration. */
vq - > config . num = num_descs ;
vq - > config . irq = devices . next_irq + + ;
vq - > config . pfn = to_guest_phys ( p ) / getpagesize ( ) ;
/* Initialize the vring. */
vring_init ( & vq - > vring , num_descs , p ) ;
/* Add the configuration information to this device's descriptor. */
add_desc_field ( dev , VIRTIO_CONFIG_F_VIRTQUEUE ,
sizeof ( vq - > config ) , & vq - > config ) ;
/* Add to tail of list, so dev->vq is first vq, dev->vq->next is
* second . */
for ( i = & dev - > vq ; * i ; i = & ( * i ) - > next ) ;
* i = vq ;
/* Link virtqueue back to device. */
vq - > dev = dev ;
/* Set up handler. */
vq - > handle_output = handle_output ;
if ( ! handle_output )
vq - > vring . used - > flags = VRING_USED_F_NO_NOTIFY ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/* This routine does all the creation and setup of a new device, including
 * calling new_dev_desc() to allocate the descriptor and device memory.
 *
 * name is stored by pointer (not copied), fd is the device's file
 * descriptor, and handle_input, if non-NULL, is called when fd becomes
 * readable.  Returns the new device, already appended to the device list,
 * with no virtqueues and no private data. */
static struct device *new_device(const char *name, u16 type, int fd,
				 bool (*handle_input)(int, struct device *))
{
	struct device *dev = malloc(sizeof(*dev));

	if (!dev)
		err(1, "Allocating device %s", name);

	/* Append to device list.  Prepending to a single-linked list is
	 * easier, but the user expects the devices to be arranged on the bus
	 * in command-line order. */
	*devices.lastdev = dev;
	dev->next = NULL;
	devices.lastdev = &dev->next;

	/* Now we populate the fields one at a time. */
	dev->fd = fd;
	/* If we have an input handler for this file descriptor, then we add
	 * it to the set of input file descriptors we watch. */
	if (handle_input)
		add_device_fd(dev->fd);
	dev->desc = new_dev_desc(type);
	dev->handle_input = handle_input;
	dev->name = name;
	/* malloc() memory is not zeroed: add_virtqueue() and handle_output()
	 * walk dev->vq until they hit NULL, so it must start out empty. */
	dev->vq = NULL;
	dev->priv = NULL;
	return dev;
}
2007-07-26 21:41:03 +04:00
/* Our first setup routine is the console. It's a fairly simple device, but
* UNIX tty handling makes it uglier than it could be . */
2007-10-22 05:24:22 +04:00
static void setup_console ( void )
2007-07-19 12:49:29 +04:00
{
struct device * dev ;
2007-07-26 21:41:03 +04:00
/* If we can save the initial standard input settings... */
2007-07-19 12:49:29 +04:00
if ( tcgetattr ( STDIN_FILENO , & orig_term ) = = 0 ) {
struct termios term = orig_term ;
2007-07-26 21:41:03 +04:00
/* Then we turn off echo, line buffering and ^C etc. We want a
* raw input stream to the Guest . */
2007-07-19 12:49:29 +04:00
term . c_lflag & = ~ ( ISIG | ICANON | ECHO ) ;
tcsetattr ( STDIN_FILENO , TCSANOW , & term ) ;
2007-07-26 21:41:03 +04:00
/* If we exit gracefully, the original settings will be
* restored so the user can see what they ' re typing . */
2007-07-19 12:49:29 +04:00
atexit ( restore_term ) ;
}
2007-10-22 05:24:22 +04:00
dev = new_device ( " console " , VIRTIO_ID_CONSOLE ,
STDIN_FILENO , handle_console_input ) ;
2007-07-26 21:41:03 +04:00
/* We store the console state in dev->priv, and initialize it. */
2007-07-19 12:49:29 +04:00
dev - > priv = malloc ( sizeof ( struct console_abort ) ) ;
( ( struct console_abort * ) dev - > priv ) - > count = 0 ;
2007-10-22 05:24:22 +04:00
/* The console needs two virtqueues: the input then the output. We
* don ' t care when they refill the input queue , since we don ' t hold
* data waiting for them . That ' s why the input queue ' s callback is
* NULL . */
add_virtqueue ( dev , VIRTQUEUE_NUM , NULL ) ;
add_virtqueue ( dev , VIRTQUEUE_NUM , handle_console_output ) ;
verbose ( " device %u: console \n " , devices . device_num + + ) ;
2007-07-19 12:49:29 +04:00
}
2007-10-22 05:24:22 +04:00
/*:*/
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
* - - sharenet = < name > option which opens or creates a named pipe . This can be
* used to send packets to another guest in a 1 : 1 manner .
2007-07-26 21:41:03 +04:00
*
2007-10-22 05:24:22 +04:00
* More sophisticated is to use one of the tools developed for projects like UML
* to do networking .
2007-07-26 21:41:03 +04:00
*
2007-10-22 05:24:22 +04:00
* Faster is to do virtio bonding in kernel . Doing this 1 : 1 would be
* completely generic ( " here's my vring, attach to your vring " ) and would work
* for any traffic . Of course , namespace and permissions issues need to be
* dealt with . A more sophisticated " multi-channel " virtio_net . c could hide
* multiple inter - guest channels behind one interface , although it would
* require some manner of hotplugging new virtio channels .
*
* Finally , we could implement a virtio network switch in the kernel . : */
2007-07-19 12:49:29 +04:00
/* Convert a dotted-quad string such as "192.168.19.1" into a host-order
 * 32-bit IP address, with the first component in the top byte.
 *
 * Exits with an error message if the string does not contain four decimal
 * components or if any component is larger than 255. */
static u32 str2ip(const char *ipaddr)
{
	unsigned int byte[4];
	unsigned int i;

	/* sscanf() reports how many fields it converted; fewer than four
	 * would leave part of byte[] uninitialised. */
	if (sscanf(ipaddr, " %u.%u.%u.%u ",
		   &byte[0], &byte[1], &byte[2], &byte[3]) != 4)
		errx(1, "Failed to parse IP address '%s'", ipaddr);

	/* Each component must fit in one byte or it would spill into its
	 * neighbour when the address is assembled. */
	for (i = 0; i < 4; i++) {
		if (byte[i] > 255)
			errx(1, "Bad IP address '%s': component out of range",
			     ipaddr);
	}

	return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
}
2007-07-26 21:41:03 +04:00
/* This code is "adapted" from libbridge: it attaches the Host end of the
* network device to the bridge device specified by the command line .
*
* This is yet another James Morris contribution ( I ' m an IP - level guy , so I
* dislike bridging ) , and I just try not to break it . */
2007-07-19 12:49:29 +04:00
static void add_to_bridge ( int fd , const char * if_name , const char * br_name )
{
int ifidx ;
struct ifreq ifr ;
if ( ! * br_name )
errx ( 1 , " must specify bridge name " ) ;
ifidx = if_nametoindex ( if_name ) ;
if ( ! ifidx )
errx ( 1 , " interface %s does not exist! " , if_name ) ;
strncpy ( ifr . ifr_name , br_name , IFNAMSIZ ) ;
ifr . ifr_ifindex = ifidx ;
if ( ioctl ( fd , SIOCBRADDIF , & ifr ) < 0 )
err ( 1 , " can't add %s to bridge %s " , if_name , br_name ) ;
}
2007-07-26 21:41:03 +04:00
/* This sets up the Host end of the network device with an IP address, brings
* it up so packets will flow , the copies the MAC address into the hwaddr
2007-10-22 05:24:22 +04:00
* pointer . */
2007-07-19 12:49:29 +04:00
static void configure_device ( int fd , const char * devname , u32 ipaddr ,
unsigned char hwaddr [ 6 ] )
{
struct ifreq ifr ;
struct sockaddr_in * sin = ( struct sockaddr_in * ) & ifr . ifr_addr ;
2007-07-26 21:41:03 +04:00
/* Don't read these incantations. Just cut & paste them like I did! */
2007-07-19 12:49:29 +04:00
memset ( & ifr , 0 , sizeof ( ifr ) ) ;
strcpy ( ifr . ifr_name , devname ) ;
sin - > sin_family = AF_INET ;
sin - > sin_addr . s_addr = htonl ( ipaddr ) ;
if ( ioctl ( fd , SIOCSIFADDR , & ifr ) ! = 0 )
err ( 1 , " Setting %s interface address " , devname ) ;
ifr . ifr_flags = IFF_UP ;
if ( ioctl ( fd , SIOCSIFFLAGS , & ifr ) ! = 0 )
err ( 1 , " Bringing interface %s up " , devname ) ;
2007-07-26 21:41:03 +04:00
/* SIOC stands for Socket I/O Control. G means Get (vs S for Set
* above ) . IF means Interface , and HWADDR is hardware address .
* Simple ! */
2007-07-19 12:49:29 +04:00
if ( ioctl ( fd , SIOCGIFHWADDR , & ifr ) ! = 0 )
err ( 1 , " getting hw address for %s " , devname ) ;
memcpy ( hwaddr , ifr . ifr_hwaddr . sa_data , 6 ) ;
}
2007-10-22 05:24:22 +04:00
/*L:195 Our network is a Host<->Guest network. This can either use bridging or
* routing , but the principle is the same : it uses the " tun " device to inject
* packets into the Host as if they came in from a normal network card . We
* just shunt packets between the Guest and the tun device . */
static void setup_tun_net ( const char * arg )
2007-07-19 12:49:29 +04:00
{
struct device * dev ;
struct ifreq ifr ;
int netfd , ipfd ;
u32 ip ;
const char * br_name = NULL ;
2007-10-22 05:24:22 +04:00
u8 hwaddr [ 6 ] ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* We open the /dev/net/tun device and tell it we want a tap device. A
* tap device is like a tun device , only somehow different . To tell
* the truth , I completely blundered my way through this code , but it
* works now ! */
2007-07-19 12:49:29 +04:00
netfd = open_or_die ( " /dev/net/tun " , O_RDWR ) ;
memset ( & ifr , 0 , sizeof ( ifr ) ) ;
ifr . ifr_flags = IFF_TAP | IFF_NO_PI ;
strcpy ( ifr . ifr_name , " tap%d " ) ;
if ( ioctl ( netfd , TUNSETIFF , & ifr ) ! = 0 )
err ( 1 , " configuring /dev/net/tun " ) ;
2007-07-26 21:41:03 +04:00
/* We don't need checksums calculated for packets coming in this
* device : trust us ! */
2007-07-19 12:49:29 +04:00
ioctl ( netfd , TUNSETNOCSUM , 1 ) ;
2007-10-22 05:24:22 +04:00
/* First we create a new network device. */
dev = new_device ( " net " , VIRTIO_ID_NET , netfd , handle_tun_input ) ;
2007-07-26 21:41:03 +04:00
2007-10-22 05:24:22 +04:00
/* Network devices need a receive and a send queue. */
add_virtqueue ( dev , VIRTQUEUE_NUM , NULL ) ;
add_virtqueue ( dev , VIRTQUEUE_NUM , handle_net_output ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* We need a socket to perform the magic network ioctls to bring up the
* tap interface , connect to the bridge etc . Any socket will do ! */
2007-07-19 12:49:29 +04:00
ipfd = socket ( PF_INET , SOCK_DGRAM , IPPROTO_IP ) ;
if ( ipfd < 0 )
err ( 1 , " opening IP socket " ) ;
2007-07-26 21:41:03 +04:00
/* If the command line was --tunnet=bridge:<name> do bridging. */
2007-07-19 12:49:29 +04:00
if ( ! strncmp ( BRIDGE_PFX , arg , strlen ( BRIDGE_PFX ) ) ) {
ip = INADDR_ANY ;
br_name = arg + strlen ( BRIDGE_PFX ) ;
add_to_bridge ( ipfd , ifr . ifr_name , br_name ) ;
2007-07-26 21:41:03 +04:00
} else /* It is an IP address to set up the device with */
2007-07-19 12:49:29 +04:00
ip = str2ip ( arg ) ;
2007-10-22 05:24:22 +04:00
/* Set up the tun device, and get the mac address for the interface. */
configure_device ( ipfd , ifr . ifr_name , ip , hwaddr ) ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/* Tell Guest what MAC address to use. */
add_desc_field ( dev , VIRTIO_CONFIG_NET_MAC_F , sizeof ( hwaddr ) , hwaddr ) ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/* We don't seed the socket any more; setup is done. */
2007-07-19 12:49:29 +04:00
close ( ipfd ) ;
2007-10-22 05:24:22 +04:00
verbose ( " device %u: tun net %u.%u.%u.%u \n " ,
devices . device_num + + ,
( u8 ) ( ip > > 24 ) , ( u8 ) ( ip > > 16 ) , ( u8 ) ( ip > > 8 ) , ( u8 ) ip ) ;
2007-07-19 12:49:29 +04:00
if ( br_name )
verbose ( " attached to bridge: %s \n " , br_name ) ;
}
2007-10-22 05:24:22 +04:00
/*
* Block device .
*
* Serving a block device is really easy : the Guest asks for a block number and
* we read or write that position in the file .
*
* Unfortunately , this is amazingly slow : the Guest waits until the read is
* finished before running anything else , even if it could be doing useful
* work . We could use async I / O , except it ' s reputed to suck so hard that
* characters actually go missing from your code when you try to use it .
*
* So we farm the I / O out to thread , and communicate with it via a pipe . */
/* Per-block-device state, reachable through device->priv. */
struct vblk_info {
	/* Size of the backing file in bytes. */
	off64_t len;

	/* File descriptor of the backing file. */
	int fd;

	/* Pipe used to wake the I/O thread: it reads from workpipe[0], the
	 * Launcher writes to workpipe[1]. */
	int workpipe[2];

	/* The I/O thread writes here when a request is finished; the
	 * Launcher then interrupts the Guest. */
	int done_fd;
};
/* This is the core of the I/O thread. It returns true if it did something. */
static bool service_io ( struct device * dev )
{
struct vblk_info * vblk = dev - > priv ;
unsigned int head , out_num , in_num , wlen ;
int ret ;
struct virtio_blk_inhdr * in ;
struct virtio_blk_outhdr * out ;
struct iovec iov [ dev - > vq - > vring . num ] ;
off64_t off ;
head = get_vq_desc ( dev - > vq , iov , & out_num , & in_num ) ;
if ( head = = dev - > vq - > vring . num )
return false ;
if ( out_num = = 0 | | in_num = = 0 )
errx ( 1 , " Bad virtblk cmd %u out=%u in=%u " ,
head , out_num , in_num ) ;
out = convert ( & iov [ 0 ] , struct virtio_blk_outhdr ) ;
in = convert ( & iov [ out_num + in_num - 1 ] , struct virtio_blk_inhdr ) ;
off = out - > sector * 512 ;
/* This is how we implement barriers. Pretty poor, no? */
if ( out - > type & VIRTIO_BLK_T_BARRIER )
fdatasync ( vblk - > fd ) ;
if ( out - > type & VIRTIO_BLK_T_SCSI_CMD ) {
fprintf ( stderr , " Scsi commands unsupported \n " ) ;
in - > status = VIRTIO_BLK_S_UNSUPP ;
wlen = sizeof ( in ) ;
} else if ( out - > type & VIRTIO_BLK_T_OUT ) {
/* Write */
/* Move to the right location in the block file. This can fail
* if they try to write past end . */
if ( lseek64 ( vblk - > fd , off , SEEK_SET ) ! = off )
err ( 1 , " Bad seek to sector %llu " , out - > sector ) ;
ret = writev ( vblk - > fd , iov + 1 , out_num - 1 ) ;
verbose ( " WRITE to sector %llu: %i \n " , out - > sector , ret ) ;
/* Grr... Now we know how long the descriptor they sent was, we
* make sure they didn ' t try to write over the end of the block
* file ( possibly extending it ) . */
if ( ret > 0 & & off + ret > vblk - > len ) {
/* Trim it back to the correct length */
ftruncate64 ( vblk - > fd , vblk - > len ) ;
/* Die, bad Guest, die. */
errx ( 1 , " Write past end %llu+%u " , off , ret ) ;
}
wlen = sizeof ( in ) ;
in - > status = ( ret > = 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR ) ;
} else {
/* Read */
/* Move to the right location in the block file. This can fail
* if they try to read past end . */
if ( lseek64 ( vblk - > fd , off , SEEK_SET ) ! = off )
err ( 1 , " Bad seek to sector %llu " , out - > sector ) ;
ret = readv ( vblk - > fd , iov + 1 , in_num - 1 ) ;
verbose ( " READ from sector %llu: %i \n " , out - > sector , ret ) ;
if ( ret > = 0 ) {
wlen = sizeof ( in ) + ret ;
in - > status = VIRTIO_BLK_S_OK ;
} else {
wlen = sizeof ( in ) ;
in - > status = VIRTIO_BLK_S_IOERR ;
}
}
/* We can't trigger an IRQ, because we're not the Launcher. It does
* that when we tell it we ' re done . */
add_used ( dev - > vq , head , wlen ) ;
return true ;
}
/* This is the thread which actually services the I/O. */
static int io_thread ( void * _dev )
{
struct device * dev = _dev ;
struct vblk_info * vblk = dev - > priv ;
char c ;
/* Close other side of workpipe so we get 0 read when main dies. */
close ( vblk - > workpipe [ 1 ] ) ;
/* Close the other side of the done_fd pipe. */
close ( dev - > fd ) ;
/* When this read fails, it means Launcher died, so we follow. */
while ( read ( vblk - > workpipe [ 0 ] , & c , 1 ) = = 1 ) {
/* We acknowledge each request immediately, to reduce latency,
* rather than waiting until we ' ve done them all . I haven ' t
* measured to see if it makes any difference . */
while ( service_io ( dev ) )
write ( vblk - > done_fd , & c , 1 ) ;
}
return 0 ;
}
/* When the thread says some I/O is done, we interrupt the Guest. */
static bool handle_io_finish ( int fd , struct device * dev )
{
char c ;
/* If child died, presumably it printed message. */
if ( read ( dev - > fd , & c , 1 ) ! = 1 )
exit ( 1 ) ;
/* It did some work, so trigger the irq. */
trigger_irq ( fd , dev - > vq ) ;
return true ;
}
/* When the Guest submits some I/O, we wake the I/O thread. */
static void handle_virtblk_output ( int fd , struct virtqueue * vq )
{
struct vblk_info * vblk = vq - > dev - > priv ;
char c = 0 ;
/* Wake up I/O thread and tell it to go to work! */
if ( write ( vblk - > workpipe [ 1 ] , & c , 1 ) ! = 1 )
/* Presumably it indicated why it died. */
exit ( 1 ) ;
}
/* This creates a virtual block device. */
static void setup_block_file ( const char * filename )
{
int p [ 2 ] ;
struct device * dev ;
struct vblk_info * vblk ;
void * stack ;
u64 cap ;
unsigned int val ;
/* This is the pipe the I/O thread will use to tell us I/O is done. */
pipe ( p ) ;
/* The device responds to return from I/O thread. */
dev = new_device ( " block " , VIRTIO_ID_BLOCK , p [ 0 ] , handle_io_finish ) ;
/* The device has a virtqueue. */
add_virtqueue ( dev , VIRTQUEUE_NUM , handle_virtblk_output ) ;
/* Allocate the room for our own bookkeeping */
vblk = dev - > priv = malloc ( sizeof ( * vblk ) ) ;
/* First we open the file and store the length. */
vblk - > fd = open_or_die ( filename , O_RDWR | O_LARGEFILE ) ;
vblk - > len = lseek64 ( vblk - > fd , 0 , SEEK_END ) ;
/* Tell Guest how many sectors this device has. */
cap = cpu_to_le64 ( vblk - > len / 512 ) ;
add_desc_field ( dev , VIRTIO_CONFIG_BLK_F_CAPACITY , sizeof ( cap ) , & cap ) ;
/* Tell Guest not to put in too many descriptors at once: two are used
* for the in and out elements . */
val = cpu_to_le32 ( VIRTQUEUE_NUM - 2 ) ;
add_desc_field ( dev , VIRTIO_CONFIG_BLK_F_SEG_MAX , sizeof ( val ) , & val ) ;
/* The I/O thread writes to this end of the pipe when done. */
vblk - > done_fd = p [ 1 ] ;
/* This is how we tell the I/O thread about more work. */
pipe ( vblk - > workpipe ) ;
/* Create stack for thread and run it */
stack = malloc ( 32768 ) ;
if ( clone ( io_thread , stack + 32768 , CLONE_VM , dev ) = = - 1 )
err ( 1 , " Creating clone " ) ;
/* We don't need to keep the I/O thread's end of the pipes open. */
close ( vblk - > done_fd ) ;
close ( vblk - > workpipe [ 0 ] ) ;
verbose ( " device %u: virtblock %llu sectors \n " ,
devices . device_num , cap ) ;
}
2007-07-26 21:41:03 +04:00
/* That's the end of device setup. */
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
* its input and output , and finally , lays it to rest . */
2007-10-22 05:24:22 +04:00
static void __attribute__ ( ( noreturn ) ) run_guest ( int lguest_fd )
2007-07-19 12:49:29 +04:00
{
for ( ; ; ) {
2007-10-22 05:03:31 +04:00
unsigned long args [ ] = { LHREQ_BREAK , 0 } ;
2007-10-22 05:24:22 +04:00
unsigned long notify_addr ;
2007-07-19 12:49:29 +04:00
int readval ;
/* We read from the /dev/lguest device to run the Guest. */
2007-10-22 05:24:22 +04:00
readval = read ( lguest_fd , & notify_addr , sizeof ( notify_addr ) ) ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:24:22 +04:00
/* One unsigned long means the Guest did HCALL_NOTIFY */
if ( readval = = sizeof ( notify_addr ) ) {
verbose ( " Notify on address %#lx \n " , notify_addr ) ;
handle_output ( lguest_fd , notify_addr ) ;
2007-07-19 12:49:29 +04:00
continue ;
2007-07-26 21:41:03 +04:00
/* ENOENT means the Guest died. Reading tells us why. */
2007-07-19 12:49:29 +04:00
} else if ( errno = = ENOENT ) {
char reason [ 1024 ] = { 0 } ;
read ( lguest_fd , reason , sizeof ( reason ) - 1 ) ;
errx ( 1 , " %s " , reason ) ;
2007-07-26 21:41:03 +04:00
/* EAGAIN means the waker wanted us to look at some input.
* Anything else means a bug or incompatible change . */
2007-07-19 12:49:29 +04:00
} else if ( errno ! = EAGAIN )
err ( 1 , " Running guest failed " ) ;
2007-07-26 21:41:03 +04:00
/* Service input, then unset the BREAK which releases
* the Waker . */
2007-10-22 05:24:22 +04:00
handle_input ( lguest_fd ) ;
2007-07-19 12:49:29 +04:00
if ( write ( lguest_fd , args , sizeof ( args ) ) < 0 )
err ( 1 , " Resetting break " ) ;
}
}
2007-07-26 21:41:03 +04:00
/*
 * This is the end of the Launcher.
 *
 * But wait!  We've seen I/O from the Launcher, and we've seen I/O from the
 * Drivers.  If we were to see the Host kernel I/O code, our understanding
 * would be complete... :*/
2007-07-19 12:49:29 +04:00
/* Long options understood by the Launcher.  Each maps to the short option
 * character that main()'s switch acts on; the table ends with an all-zero
 * entry as getopt_long() requires. */
static struct option opts[] = {
	{ .name = "verbose", .has_arg = no_argument,       .flag = NULL, .val = 'v' },
	{ .name = "tunnet",  .has_arg = required_argument, .flag = NULL, .val = 't' },
	{ .name = "block",   .has_arg = required_argument, .flag = NULL, .val = 'b' },
	{ .name = "initrd",  .has_arg = required_argument, .flag = NULL, .val = 'i' },
	{ .name = NULL,      .has_arg = 0,                 .flag = NULL, .val = 0 },
};
/* Print the command-line synopsis on stderr and exit with status 1 (errx()
 * does not return). */
static void usage(void)
{
	/* The synopsis contains no '%' conversions, so handing it to errx()
	 * through "%s" prints exactly the same text. */
	static const char synopsis[] =
		"Usage: lguest [--verbose] "
		"[--tunnet=(<ipaddr>|bridge:<bridgename>)\n"
		"|--block=<filename>|--initrd=<filename>]...\n"
		"<mem-in-mb> vmlinux [args...]";

	errx(1, "%s", synopsis);
}
2007-10-22 05:03:26 +04:00
/*L:105 The main routine is where the real work begins: */
2007-07-19 12:49:29 +04:00
int main ( int argc , char * argv [ ] )
{
2007-10-22 05:03:36 +04:00
/* Memory, top-level pagetable, code startpoint and size of the
* ( optional ) initrd . */
unsigned long mem = 0 , pgdir , start , initrd_size = 0 ;
2007-07-26 21:41:03 +04:00
/* A temporary and the /dev/lguest file descriptor. */
2007-07-24 05:43:56 +04:00
int i , c , lguest_fd ;
2007-10-22 05:03:26 +04:00
/* The boot information for the Guest. */
void * boot ;
2007-07-26 21:41:03 +04:00
/* If they specify an initrd file to load. */
2007-07-19 12:49:29 +04:00
const char * initrd_name = NULL ;
2007-07-26 21:41:03 +04:00
/* First we initialize the device list. Since console and network
* device receive input from a file descriptor , we keep an fdset
* ( infds ) and the maximum fd number ( max_infd ) with the head of the
* list . We also keep a pointer to the last device , for easy appending
2007-10-22 05:24:22 +04:00
* to the list . Finally , we keep the next interrupt number to hand out
* ( 1 : remember that 0 is used by the timer ) . */
FD_ZERO ( & devices . infds ) ;
devices . max_infd = - 1 ;
devices . lastdev = & devices . dev ;
devices . next_irq = 1 ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* We need to know how much memory so we can set up the device
* descriptor and memory pages for the devices as we parse the command
* line . So we quickly look through the arguments to find the amount
* of memory now . */
2007-07-24 05:43:56 +04:00
for ( i = 1 ; i < argc ; i + + ) {
if ( argv [ i ] [ 0 ] ! = ' - ' ) {
2007-10-22 05:03:26 +04:00
mem = atoi ( argv [ i ] ) * 1024 * 1024 ;
/* We start by mapping anonymous pages over all of
* guest - physical memory range . This fills it with 0 ,
* and ensures that the Guest won ' t be killed when it
* tries to access it . */
guest_base = map_zeroed_pages ( mem / getpagesize ( )
+ DEVICE_PAGES ) ;
guest_limit = mem ;
guest_max = mem + DEVICE_PAGES * getpagesize ( ) ;
2007-10-22 05:24:22 +04:00
devices . descpage = get_pages ( 1 ) ;
2007-07-24 05:43:56 +04:00
break ;
}
}
2007-07-26 21:41:03 +04:00
/* The options are fairly straight-forward */
2007-07-19 12:49:29 +04:00
while ( ( c = getopt_long ( argc , argv , " v " , opts , NULL ) ) ! = EOF ) {
switch ( c ) {
case ' v ' :
verbose = true ;
break ;
case ' t ' :
2007-10-22 05:24:22 +04:00
setup_tun_net ( optarg ) ;
2007-07-19 12:49:29 +04:00
break ;
case ' b ' :
2007-10-22 05:24:22 +04:00
setup_block_file ( optarg ) ;
2007-07-19 12:49:29 +04:00
break ;
case ' i ' :
initrd_name = optarg ;
break ;
default :
warnx ( " Unknown argument %s " , argv [ optind ] ) ;
usage ( ) ;
}
}
2007-07-26 21:41:03 +04:00
/* After the other arguments we expect memory and kernel image name,
* followed by command line arguments for the kernel . */
2007-07-19 12:49:29 +04:00
if ( optind + 2 > argc )
usage ( ) ;
2007-10-22 05:03:26 +04:00
verbose ( " Guest base is at %p \n " , guest_base ) ;
2007-07-26 21:41:03 +04:00
/* We always have a console device */
2007-10-22 05:24:22 +04:00
setup_console ( ) ;
2007-07-19 12:49:29 +04:00
/* Now we load the kernel */
2007-10-22 05:03:36 +04:00
start = load_kernel ( open_or_die ( argv [ optind + 1 ] , O_RDONLY ) ) ;
2007-07-19 12:49:29 +04:00
2007-10-22 05:03:26 +04:00
/* Boot information is stashed at physical address 0 */
boot = from_guest_phys ( 0 ) ;
2007-07-26 21:41:03 +04:00
/* Map the initrd image if requested (at top of physical memory) */
2007-07-19 12:49:29 +04:00
if ( initrd_name ) {
initrd_size = load_initrd ( initrd_name , mem ) ;
2007-07-26 21:41:03 +04:00
/* These are the location in the Linux boot header where the
* start and size of the initrd are expected to be found . */
2007-07-19 12:49:29 +04:00
* ( unsigned long * ) ( boot + 0x218 ) = mem - initrd_size ;
* ( unsigned long * ) ( boot + 0x21c ) = initrd_size ;
2007-07-26 21:41:03 +04:00
/* The bootloader type 0xFF means "unknown"; that's OK. */
2007-07-19 12:49:29 +04:00
* ( unsigned char * ) ( boot + 0x210 ) = 0xFF ;
}
2007-07-26 21:41:03 +04:00
/* Set up the initial linear pagetables, starting below the initrd. */
2007-10-22 05:03:36 +04:00
pgdir = setup_pagetables ( mem , initrd_size ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* The Linux boot header contains an "E820" memory map: ours is a
* simple , single region . */
2007-07-19 12:49:29 +04:00
* ( char * ) ( boot + E820NR ) = 1 ;
* ( ( struct e820entry * ) ( boot + E820MAP ) )
= ( ( struct e820entry ) { 0 , mem , E820_RAM } ) ;
2007-07-26 21:41:03 +04:00
/* The boot header contains a command line pointer: we put the command
* line after the boot header ( at address 4096 ) */
2007-10-22 05:03:26 +04:00
* ( u32 * ) ( boot + 0x228 ) = 4096 ;
2007-07-19 12:49:29 +04:00
concat ( boot + 4096 , argv + optind + 2 ) ;
2007-07-26 21:41:03 +04:00
/* The guest type value of "1" tells the Guest it's under lguest. */
2007-07-19 12:49:29 +04:00
* ( int * ) ( boot + 0x23c ) = 1 ;
2007-07-26 21:41:03 +04:00
/* We tell the kernel to initialize the Guest: this returns the open
* / dev / lguest file descriptor . */
2007-10-22 05:03:36 +04:00
lguest_fd = tell_kernel ( pgdir , start ) ;
2007-07-26 21:41:03 +04:00
/* We fork off a child process, which wakes the Launcher whenever one
* of the input file descriptors needs attention . Otherwise we would
* run the Guest until it tries to output something . */
2007-10-22 05:24:22 +04:00
waker_fd = setup_waker ( lguest_fd ) ;
2007-07-19 12:49:29 +04:00
2007-07-26 21:41:03 +04:00
/* Finally, run the Guest. This doesn't return. */
2007-10-22 05:24:22 +04:00
run_guest ( lguest_fd ) ;
2007-07-19 12:49:29 +04:00
}
2007-07-26 21:41:05 +04:00
/*:*/
/*M:999
 * Mastery is done: you now know everything I do.
 *
 * But surely you have seen code, features and bugs in your wanderings which
 * you now yearn to attack?  That is the real game, and I look forward to you
 * patching and forking lguest into the Your-Name-Here-visor.
 *
 * Farewell, and good coding!
 * Rusty Russell.
 */