/* Based on kernel version 2.6.39.1. Page generated on 2011-06-03 13:47 EST. */
1 /*P:100 2 * This is the Launcher code, a simple program which lays out the "physical" 3 * memory for the new Guest by mapping the kernel image and the virtual 4 * devices, then opens /dev/lguest to tell the kernel about the Guest and 5 * control it. 6 :*/ 7 #define _LARGEFILE64_SOURCE 8 #define _GNU_SOURCE 9 #include <stdio.h> 10 #include <string.h> 11 #include <unistd.h> 12 #include <err.h> 13 #include <stdint.h> 14 #include <stdlib.h> 15 #include <elf.h> 16 #include <sys/mman.h> 17 #include <sys/param.h> 18 #include <sys/types.h> 19 #include <sys/stat.h> 20 #include <sys/wait.h> 21 #include <sys/eventfd.h> 22 #include <fcntl.h> 23 #include <stdbool.h> 24 #include <errno.h> 25 #include <ctype.h> 26 #include <sys/socket.h> 27 #include <sys/ioctl.h> 28 #include <sys/time.h> 29 #include <time.h> 30 #include <netinet/in.h> 31 #include <net/if.h> 32 #include <linux/sockios.h> 33 #include <linux/if_tun.h> 34 #include <sys/uio.h> 35 #include <termios.h> 36 #include <getopt.h> 37 #include <assert.h> 38 #include <sched.h> 39 #include <limits.h> 40 #include <stddef.h> 41 #include <signal.h> 42 #include <pwd.h> 43 #include <grp.h> 44 45 #include <linux/virtio_config.h> 46 #include <linux/virtio_net.h> 47 #include <linux/virtio_blk.h> 48 #include <linux/virtio_console.h> 49 #include <linux/virtio_rng.h> 50 #include <linux/virtio_ring.h> 51 #include <asm/bootparam.h> 52 #include "../../include/linux/lguest_launcher.h" 53 /*L:110 54 * We can ignore the 42 include files we need for this program, but I do want 55 * to draw attention to the use of kernel-style types. 56 * 57 * As Linus said, "C is a Spartan language, and so should your naming be." I 58 * like these abbreviations, so we define them here. Note that u64 is always 59 * unsigned long long, which works on all Linux systems: this means that we can 60 * use %llu in printf for any u64. 
61 */ 62 typedef unsigned long long u64; 63 typedef uint32_t u32; 64 typedef uint16_t u16; 65 typedef uint8_t u8; 66 /*:*/ 67 68 #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 69 #define BRIDGE_PFX "bridge:" 70 #ifndef SIOCBRADDIF 71 #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 72 #endif 73 /* We can have up to 256 pages for devices. */ 74 #define DEVICE_PAGES 256 75 /* This will occupy 3 pages: it must be a power of 2. */ 76 #define VIRTQUEUE_NUM 256 77 78 /*L:120 79 * verbose is both a global flag and a macro. The C preprocessor allows 80 * this, and although I wouldn't recommend it, it works quite nicely here. 81 */ 82 static bool verbose; 83 #define verbose(args...) \ 84 do { if (verbose) printf(args); } while(0) 85 /*:*/ 86 87 /* The pointer to the start of guest memory. */ 88 static void *guest_base; 89 /* The maximum guest physical address allowed, and maximum possible. */ 90 static unsigned long guest_limit, guest_max; 91 /* The /dev/lguest file descriptor. */ 92 static int lguest_fd; 93 94 /* a per-cpu variable indicating whose vcpu is currently running */ 95 static unsigned int __thread cpu_id; 96 97 /* This is our list of devices. */ 98 struct device_list { 99 /* Counter to assign interrupt numbers. */ 100 unsigned int next_irq; 101 102 /* Counter to print out convenient device numbers. */ 103 unsigned int device_num; 104 105 /* The descriptor page for the devices. */ 106 u8 *descpage; 107 108 /* A single linked list of devices. */ 109 struct device *dev; 110 /* And a pointer to the last device for easy append. */ 111 struct device *lastdev; 112 }; 113 114 /* The list of Guest devices, based on command line arguments. */ 115 static struct device_list devices; 116 117 /* The device structure describes a single device. */ 118 struct device { 119 /* The linked-list pointer. */ 120 struct device *next; 121 122 /* The device's descriptor, as mapped into the Guest. 
*/ 123 struct lguest_device_desc *desc; 124 125 /* We can't trust desc values once Guest has booted: we use these. */ 126 unsigned int feature_len; 127 unsigned int num_vq; 128 129 /* The name of this device, for --verbose. */ 130 const char *name; 131 132 /* Any queues attached to this device */ 133 struct virtqueue *vq; 134 135 /* Is it operational */ 136 bool running; 137 138 /* Does Guest want an intrrupt on empty? */ 139 bool irq_on_empty; 140 141 /* Device-specific data. */ 142 void *priv; 143 }; 144 145 /* The virtqueue structure describes a queue attached to a device. */ 146 struct virtqueue { 147 struct virtqueue *next; 148 149 /* Which device owns me. */ 150 struct device *dev; 151 152 /* The configuration for this queue. */ 153 struct lguest_vqconfig config; 154 155 /* The actual ring of buffers. */ 156 struct vring vring; 157 158 /* Last available index we saw. */ 159 u16 last_avail_idx; 160 161 /* How many are used since we sent last irq? */ 162 unsigned int pending_used; 163 164 /* Eventfd where Guest notifications arrive. */ 165 int eventfd; 166 167 /* Function for the thread which is servicing this virtqueue. */ 168 void (*service)(struct virtqueue *vq); 169 pid_t thread; 170 }; 171 172 /* Remember the arguments to the program so we can "reboot" */ 173 static char **main_args; 174 175 /* The original tty settings to restore on exit. */ 176 static struct termios orig_term; 177 178 /* 179 * We have to be careful with barriers: our devices are all run in separate 180 * threads and so we need to make sure that changes visible to the Guest happen 181 * in precise order. 182 */ 183 #define wmb() __asm__ __volatile__("" : : : "memory") 184 #define mb() __asm__ __volatile__("" : : : "memory") 185 186 /* 187 * Convert an iovec element to the given type. 188 * 189 * This is a fairly ugly trick: we need to know the size of the type and 190 * alignment requirement to check the pointer is kosher. 
It's also nice to 191 * have the name of the type in case we report failure. 192 * 193 * Typing those three things all the time is cumbersome and error prone, so we 194 * have a macro which sets them all up and passes to the real function. 195 */ 196 #define convert(iov, type) \ 197 ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) 198 199 static void *_convert(struct iovec *iov, size_t size, size_t align, 200 const char *name) 201 { 202 if (iov->iov_len != size) 203 errx(1, "Bad iovec size %zu for %s", iov->iov_len, name); 204 if ((unsigned long)iov->iov_base % align != 0) 205 errx(1, "Bad alignment %p for %s", iov->iov_base, name); 206 return iov->iov_base; 207 } 208 209 /* Wrapper for the last available index. Makes it easier to change. */ 210 #define lg_last_avail(vq) ((vq)->last_avail_idx) 211 212 /* 213 * The virtio configuration space is defined to be little-endian. x86 is 214 * little-endian too, but it's nice to be explicit so we have these helpers. 215 */ 216 #define cpu_to_le16(v16) (v16) 217 #define cpu_to_le32(v32) (v32) 218 #define cpu_to_le64(v64) (v64) 219 #define le16_to_cpu(v16) (v16) 220 #define le32_to_cpu(v32) (v32) 221 #define le64_to_cpu(v64) (v64) 222 223 /* Is this iovec empty? */ 224 static bool iov_empty(const struct iovec iov[], unsigned int num_iov) 225 { 226 unsigned int i; 227 228 for (i = 0; i < num_iov; i++) 229 if (iov[i].iov_len) 230 return false; 231 return true; 232 } 233 234 /* Take len bytes from the front of this iovec. */ 235 static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) 236 { 237 unsigned int i; 238 239 for (i = 0; i < num_iov; i++) { 240 unsigned int used; 241 242 used = iov[i].iov_len < len ? iov[i].iov_len : len; 243 iov[i].iov_base += used; 244 iov[i].iov_len -= used; 245 len -= used; 246 } 247 assert(len == 0); 248 } 249 250 /* The device virtqueue descriptors are followed by feature bitmasks. 
*/ 251 static u8 *get_feature_bits(struct device *dev) 252 { 253 return (u8 *)(dev->desc + 1) 254 + dev->num_vq * sizeof(struct lguest_vqconfig); 255 } 256 257 /*L:100 258 * The Launcher code itself takes us out into userspace, that scary place where 259 * pointers run wild and free! Unfortunately, like most userspace programs, 260 * it's quite boring (which is why everyone likes to hack on the kernel!). 261 * Perhaps if you make up an Lguest Drinking Game at this point, it will get 262 * you through this section. Or, maybe not. 263 * 264 * The Launcher sets up a big chunk of memory to be the Guest's "physical" 265 * memory and stores it in "guest_base". In other words, Guest physical == 266 * Launcher virtual with an offset. 267 * 268 * This can be tough to get your head around, but usually it just means that we 269 * use these trivial conversion functions when the Guest gives us its 270 * "physical" addresses: 271 */ 272 static void *from_guest_phys(unsigned long addr) 273 { 274 return guest_base + addr; 275 } 276 277 static unsigned long to_guest_phys(const void *addr) 278 { 279 return (addr - guest_base); 280 } 281 282 /*L:130 283 * Loading the Kernel. 284 * 285 * We start with couple of simple helper routines. open_or_die() avoids 286 * error-checking code cluttering the callers: 287 */ 288 static int open_or_die(const char *name, int flags) 289 { 290 int fd = open(name, flags); 291 if (fd < 0) 292 err(1, "Failed to open %s", name); 293 return fd; 294 } 295 296 /* map_zeroed_pages() takes a number of pages. */ 297 static void *map_zeroed_pages(unsigned int num) 298 { 299 int fd = open_or_die("/dev/zero", O_RDONLY); 300 void *addr; 301 302 /* 303 * We use a private mapping (ie. if we write to the page, it will be 304 * copied). We allocate an extra two pages PROT_NONE to act as guard 305 * pages against read/write attempts that exceed allocated space. 
306 */ 307 addr = mmap(NULL, getpagesize() * (num+2), 308 PROT_NONE, MAP_PRIVATE, fd, 0); 309 310 if (addr == MAP_FAILED) 311 err(1, "Mmapping %u pages of /dev/zero", num); 312 313 if (mprotect(addr + getpagesize(), getpagesize() * num, 314 PROT_READ|PROT_WRITE) == -1) 315 err(1, "mprotect rw %u pages failed", num); 316 317 /* 318 * One neat mmap feature is that you can close the fd, and it 319 * stays mapped. 320 */ 321 close(fd); 322 323 /* Return address after PROT_NONE page */ 324 return addr + getpagesize(); 325 } 326 327 /* Get some more pages for a device. */ 328 static void *get_pages(unsigned int num) 329 { 330 void *addr = from_guest_phys(guest_limit); 331 332 guest_limit += num * getpagesize(); 333 if (guest_limit > guest_max) 334 errx(1, "Not enough memory for devices"); 335 return addr; 336 } 337 338 /* 339 * This routine is used to load the kernel or initrd. It tries mmap, but if 340 * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), 341 * it falls back to reading the memory in. 342 */ 343 static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) 344 { 345 ssize_t r; 346 347 /* 348 * We map writable even though for some segments are marked read-only. 349 * The kernel really wants to be writable: it patches its own 350 * instructions. 351 * 352 * MAP_PRIVATE means that the page won't be copied until a write is 353 * done to it. This allows us to share untouched memory between 354 * Guests. 355 */ 356 if (mmap(addr, len, PROT_READ|PROT_WRITE, 357 MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) 358 return; 359 360 /* pread does a seek and a read in one shot: saves a few lines. */ 361 r = pread(fd, addr, len, offset); 362 if (r != len) 363 err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); 364 } 365 366 /* 367 * This routine takes an open vmlinux image, which is in ELF, and maps it into 368 * the Guest memory. 
ELF = Embedded Linking Format, which is the format used 369 * by all modern binaries on Linux including the kernel. 370 * 371 * The ELF headers give *two* addresses: a physical address, and a virtual 372 * address. We use the physical address; the Guest will map itself to the 373 * virtual address. 374 * 375 * We return the starting address. 376 */ 377 static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) 378 { 379 Elf32_Phdr phdr[ehdr->e_phnum]; 380 unsigned int i; 381 382 /* 383 * Sanity checks on the main ELF header: an x86 executable with a 384 * reasonable number of correctly-sized program headers. 385 */ 386 if (ehdr->e_type != ET_EXEC 387 || ehdr->e_machine != EM_386 388 || ehdr->e_phentsize != sizeof(Elf32_Phdr) 389 || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) 390 errx(1, "Malformed elf header"); 391 392 /* 393 * An ELF executable contains an ELF header and a number of "program" 394 * headers which indicate which parts ("segments") of the program to 395 * load where. 396 */ 397 398 /* We read in all the program headers at once: */ 399 if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) 400 err(1, "Seeking to program headers"); 401 if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) 402 err(1, "Reading program headers"); 403 404 /* 405 * Try all the headers: there are usually only three. A read-only one, 406 * a read-write one, and a "note" section which we don't load. 407 */ 408 for (i = 0; i < ehdr->e_phnum; i++) { 409 /* If this isn't a loadable segment, we ignore it */ 410 if (phdr[i].p_type != PT_LOAD) 411 continue; 412 413 verbose("Section %i: size %i addr %p\n", 414 i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); 415 416 /* We map this section of the file at its physical address. */ 417 map_at(elf_fd, from_guest_phys(phdr[i].p_paddr), 418 phdr[i].p_offset, phdr[i].p_filesz); 419 } 420 421 /* The entry point is given in the ELF header. 
*/ 422 return ehdr->e_entry; 423 } 424 425 /*L:150 426 * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed 427 * to jump into it and it will unpack itself. We used to have to perform some 428 * hairy magic because the unpacking code scared me. 429 * 430 * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote 431 * a small patch to jump over the tricky bits in the Guest, so now we just read 432 * the funky header so we know where in the file to load, and away we go! 433 */ 434 static unsigned long load_bzimage(int fd) 435 { 436 struct boot_params boot; 437 int r; 438 /* Modern bzImages get loaded at 1M. */ 439 void *p = from_guest_phys(0x100000); 440 441 /* 442 * Go back to the start of the file and read the header. It should be 443 * a Linux boot header (see Documentation/x86/i386/boot.txt) 444 */ 445 lseek(fd, 0, SEEK_SET); 446 read(fd, &boot, sizeof(boot)); 447 448 /* Inside the setup_hdr, we expect the magic "HdrS" */ 449 if (memcmp(&boot.hdr.header, "HdrS", 4) != 0) 450 errx(1, "This doesn't look like a bzImage to me"); 451 452 /* Skip over the extra sectors of the header. */ 453 lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET); 454 455 /* Now read everything into memory. in nice big chunks. */ 456 while ((r = read(fd, p, 65536)) > 0) 457 p += r; 458 459 /* Finally, code32_start tells us where to enter the kernel. */ 460 return boot.hdr.code32_start; 461 } 462 463 /*L:140 464 * Loading the kernel is easy when it's a "vmlinux", but most kernels 465 * come wrapped up in the self-decompressing "bzImage" format. With a little 466 * work, we can load those, too. 467 */ 468 static unsigned long load_kernel(int fd) 469 { 470 Elf32_Ehdr hdr; 471 472 /* Read in the first few bytes. 
*/ 473 if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) 474 err(1, "Reading kernel"); 475 476 /* If it's an ELF file, it starts with "\177ELF" */ 477 if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) 478 return map_elf(fd, &hdr); 479 480 /* Otherwise we assume it's a bzImage, and try to load it. */ 481 return load_bzimage(fd); 482 } 483 484 /* 485 * This is a trivial little helper to align pages. Andi Kleen hated it because 486 * it calls getpagesize() twice: "it's dumb code." 487 * 488 * Kernel guys get really het up about optimization, even when it's not 489 * necessary. I leave this code as a reaction against that. 490 */ 491 static inline unsigned long page_align(unsigned long addr) 492 { 493 /* Add upwards and truncate downwards. */ 494 return ((addr + getpagesize()-1) & ~(getpagesize()-1)); 495 } 496 497 /*L:180 498 * An "initial ram disk" is a disk image loaded into memory along with the 499 * kernel which the kernel can use to boot from without needing any drivers. 500 * Most distributions now use this as standard: the initrd contains the code to 501 * load the appropriate driver modules for the current machine. 502 * 503 * Importantly, James Morris works for RedHat, and Fedora uses initrds for its 504 * kernels. He sent me this (and tells me when I break it). 505 */ 506 static unsigned long load_initrd(const char *name, unsigned long mem) 507 { 508 int ifd; 509 struct stat st; 510 unsigned long len; 511 512 ifd = open_or_die(name, O_RDONLY); 513 /* fstat() is needed to get the file size. */ 514 if (fstat(ifd, &st) < 0) 515 err(1, "fstat() on initrd '%s'", name); 516 517 /* 518 * We map the initrd at the top of memory, but mmap wants it to be 519 * page-aligned, so we round the size up for that. 520 */ 521 len = page_align(st.st_size); 522 map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); 523 /* 524 * Once a file is mapped, you can close the file descriptor. It's a 525 * little odd, but quite useful. 
526 */ 527 close(ifd); 528 verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); 529 530 /* We return the initrd size. */ 531 return len; 532 } 533 /*:*/ 534 535 /* 536 * Simple routine to roll all the commandline arguments together with spaces 537 * between them. 538 */ 539 static void concat(char *dst, char *args[]) 540 { 541 unsigned int i, len = 0; 542 543 for (i = 0; args[i]; i++) { 544 if (i) { 545 strcat(dst+len, " "); 546 len++; 547 } 548 strcpy(dst+len, args[i]); 549 len += strlen(args[i]); 550 } 551 /* In case it's empty. */ 552 dst[len] = '\0'; 553 } 554 555 /*L:185 556 * This is where we actually tell the kernel to initialize the Guest. We 557 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 558 * the base of Guest "physical" memory, the top physical page to allow and the 559 * entry point for the Guest. 560 */ 561 static void tell_kernel(unsigned long start) 562 { 563 unsigned long args[] = { LHREQ_INITIALIZE, 564 (unsigned long)guest_base, 565 guest_limit / getpagesize(), start }; 566 verbose("Guest: %p - %p (%#lx)\n", 567 guest_base, guest_base + guest_limit, guest_limit); 568 lguest_fd = open_or_die("/dev/lguest", O_RDWR); 569 if (write(lguest_fd, args, sizeof(args)) < 0) 570 err(1, "Writing to /dev/lguest"); 571 } 572 /*:*/ 573 574 /*L:200 575 * Device Handling. 576 * 577 * When the Guest gives us a buffer, it sends an array of addresses and sizes. 578 * We need to make sure it's not trying to reach into the Launcher itself, so 579 * we have a convenient routine which checks it and exits with an error message 580 * if something funny is going on: 581 */ 582 static void *_check_pointer(unsigned long addr, unsigned int size, 583 unsigned int line) 584 { 585 /* 586 * Check if the requested address and size exceeds the allocated memory, 587 * or addr + size wraps around. 
588 */ 589 if ((addr + size) > guest_limit || (addr + size) < addr) 590 errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); 591 /* 592 * We return a pointer for the caller's convenience, now we know it's 593 * safe to use. 594 */ 595 return from_guest_phys(addr); 596 } 597 /* A macro which transparently hands the line number to the real function. */ 598 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 599 600 /* 601 * Each buffer in the virtqueues is actually a chain of descriptors. This 602 * function returns the next descriptor in the chain, or vq->vring.num if we're 603 * at the end. 604 */ 605 static unsigned next_desc(struct vring_desc *desc, 606 unsigned int i, unsigned int max) 607 { 608 unsigned int next; 609 610 /* If this descriptor says it doesn't chain, we're done. */ 611 if (!(desc[i].flags & VRING_DESC_F_NEXT)) 612 return max; 613 614 /* Check they're not leading us off end of descriptors. */ 615 next = desc[i].next; 616 /* Make sure compiler knows to grab that: we don't want it changing! */ 617 wmb(); 618 619 if (next >= max) 620 errx(1, "Desc next is %u", next); 621 622 return next; 623 } 624 625 /* 626 * This actually sends the interrupt for this virtqueue, if we've used a 627 * buffer. 628 */ 629 static void trigger_irq(struct virtqueue *vq) 630 { 631 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 632 633 /* Don't inform them if nothing used. */ 634 if (!vq->pending_used) 635 return; 636 vq->pending_used = 0; 637 638 /* If they don't want an interrupt, don't send one... */ 639 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { 640 /* ... unless they've asked us to force one on empty. */ 641 if (!vq->dev->irq_on_empty 642 || lg_last_avail(vq) != vq->vring.avail->idx) 643 return; 644 } 645 646 /* Send the Guest an interrupt tell them we used something up. 
*/ 647 if (write(lguest_fd, buf, sizeof(buf)) != 0) 648 err(1, "Triggering irq %i", vq->config.irq); 649 } 650 651 /* 652 * This looks in the virtqueue for the first available buffer, and converts 653 * it to an iovec for convenient access. Since descriptors consist of some 654 * number of output then some number of input descriptors, it's actually two 655 * iovecs, but we pack them into one and note how many of each there were. 656 * 657 * This function waits if necessary, and returns the descriptor number found. 658 */ 659 static unsigned wait_for_vq_desc(struct virtqueue *vq, 660 struct iovec iov[], 661 unsigned int *out_num, unsigned int *in_num) 662 { 663 unsigned int i, head, max; 664 struct vring_desc *desc; 665 u16 last_avail = lg_last_avail(vq); 666 667 /* There's nothing available? */ 668 while (last_avail == vq->vring.avail->idx) { 669 u64 event; 670 671 /* 672 * Since we're about to sleep, now is a good time to tell the 673 * Guest about what we've used up to now. 674 */ 675 trigger_irq(vq); 676 677 /* OK, now we need to know about added descriptors. */ 678 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; 679 680 /* 681 * They could have slipped one in as we were doing that: make 682 * sure it's written, then check again. 683 */ 684 mb(); 685 if (last_avail != vq->vring.avail->idx) { 686 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 687 break; 688 } 689 690 /* Nothing new? Wait for eventfd to tell us they refilled. */ 691 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) 692 errx(1, "Event read failed?"); 693 694 /* We don't need to be notified again. */ 695 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 696 } 697 698 /* Check it isn't doing very strange things with descriptor numbers. 
*/ 699 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 700 errx(1, "Guest moved used index from %u to %u", 701 last_avail, vq->vring.avail->idx); 702 703 /* 704 * Grab the next descriptor number they're advertising, and increment 705 * the index we've seen. 706 */ 707 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 708 lg_last_avail(vq)++; 709 710 /* If their number is silly, that's a fatal mistake. */ 711 if (head >= vq->vring.num) 712 errx(1, "Guest says index %u is available", head); 713 714 /* When we start there are none of either input nor output. */ 715 *out_num = *in_num = 0; 716 717 max = vq->vring.num; 718 desc = vq->vring.desc; 719 i = head; 720 721 /* 722 * If this is an indirect entry, then this buffer contains a descriptor 723 * table which we handle as if it's any normal descriptor chain. 724 */ 725 if (desc[i].flags & VRING_DESC_F_INDIRECT) { 726 if (desc[i].len % sizeof(struct vring_desc)) 727 errx(1, "Invalid size for indirect buffer table"); 728 729 max = desc[i].len / sizeof(struct vring_desc); 730 desc = check_pointer(desc[i].addr, desc[i].len); 731 i = 0; 732 } 733 734 do { 735 /* Grab the first descriptor, and check it's OK. */ 736 iov[*out_num + *in_num].iov_len = desc[i].len; 737 iov[*out_num + *in_num].iov_base 738 = check_pointer(desc[i].addr, desc[i].len); 739 /* If this is an input descriptor, increment that count. */ 740 if (desc[i].flags & VRING_DESC_F_WRITE) 741 (*in_num)++; 742 else { 743 /* 744 * If it's an output descriptor, they're all supposed 745 * to come before any input descriptors. 746 */ 747 if (*in_num) 748 errx(1, "Descriptor has out after in"); 749 (*out_num)++; 750 } 751 752 /* If we've got too many, that implies a descriptor loop. */ 753 if (*out_num + *in_num > max) 754 errx(1, "Looped descriptor"); 755 } while ((i = next_desc(desc, i, max)) != max); 756 757 return head; 758 } 759 760 /* 761 * After we've used one of their buffers, we tell the Guest about it. 
Sometime 762 * later we'll want to send them an interrupt using trigger_irq(); note that 763 * wait_for_vq_desc() does that for us if it has to wait. 764 */ 765 static void add_used(struct virtqueue *vq, unsigned int head, int len) 766 { 767 struct vring_used_elem *used; 768 769 /* 770 * The virtqueue contains a ring of used buffers. Get a pointer to the 771 * next entry in that used ring. 772 */ 773 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 774 used->id = head; 775 used->len = len; 776 /* Make sure buffer is written before we update index. */ 777 wmb(); 778 vq->vring.used->idx++; 779 vq->pending_used++; 780 } 781 782 /* And here's the combo meal deal. Supersize me! */ 783 static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) 784 { 785 add_used(vq, head, len); 786 trigger_irq(vq); 787 } 788 789 /* 790 * The Console 791 * 792 * We associate some data with the console for our exit hack. 793 */ 794 struct console_abort { 795 /* How many times have they hit ^C? */ 796 int count; 797 /* When did they start? */ 798 struct timeval start; 799 }; 800 801 /* This is the routine which handles console input (ie. stdin). */ 802 static void console_input(struct virtqueue *vq) 803 { 804 int len; 805 unsigned int head, in_num, out_num; 806 struct console_abort *abort = vq->dev->priv; 807 struct iovec iov[vq->vring.num]; 808 809 /* Make sure there's a descriptor available. */ 810 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 811 if (out_num) 812 errx(1, "Output buffers in console in queue?"); 813 814 /* Read into it. This is where we usually wait. */ 815 len = readv(STDIN_FILENO, iov, in_num); 816 if (len <= 0) { 817 /* Ran out of input? */ 818 warnx("Failed to get console input, ignoring console."); 819 /* 820 * For simplicity, dying threads kill the whole Launcher. So 821 * just nap here. 822 */ 823 for (;;) 824 pause(); 825 } 826 827 /* Tell the Guest we used a buffer. 
*/ 828 add_used_and_trigger(vq, head, len); 829 830 /* 831 * Three ^C within one second? Exit. 832 * 833 * This is such a hack, but works surprisingly well. Each ^C has to 834 * be in a buffer by itself, so they can't be too fast. But we check 835 * that we get three within about a second, so they can't be too 836 * slow. 837 */ 838 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { 839 abort->count = 0; 840 return; 841 } 842 843 abort->count++; 844 if (abort->count == 1) 845 gettimeofday(&abort->start, NULL); 846 else if (abort->count == 3) { 847 struct timeval now; 848 gettimeofday(&now, NULL); 849 /* Kill all Launcher processes with SIGINT, like normal ^C */ 850 if (now.tv_sec <= abort->start.tv_sec+1) 851 kill(0, SIGINT); 852 abort->count = 0; 853 } 854 } 855 856 /* This is the routine which handles console output (ie. stdout). */ 857 static void console_output(struct virtqueue *vq) 858 { 859 unsigned int head, out, in; 860 struct iovec iov[vq->vring.num]; 861 862 /* We usually wait in here, for the Guest to give us something. */ 863 head = wait_for_vq_desc(vq, iov, &out, &in); 864 if (in) 865 errx(1, "Input buffers in console output queue?"); 866 867 /* writev can return a partial write, so we loop here. */ 868 while (!iov_empty(iov, out)) { 869 int len = writev(STDOUT_FILENO, iov, out); 870 if (len <= 0) 871 err(1, "Write to stdout gave %i", len); 872 iov_consume(iov, out, len); 873 } 874 875 /* 876 * We're finished with that buffer: if we're going to sleep, 877 * wait_for_vq_desc() will prod the Guest with an interrupt. 878 */ 879 add_used(vq, head, 0); 880 } 881 882 /* 883 * The Network 884 * 885 * Handling output for network is also simple: we get all the output buffers 886 * and write them to /dev/net/tun. 
887 */ 888 struct net_info { 889 int tunfd; 890 }; 891 892 static void net_output(struct virtqueue *vq) 893 { 894 struct net_info *net_info = vq->dev->priv; 895 unsigned int head, out, in; 896 struct iovec iov[vq->vring.num]; 897 898 /* We usually wait in here for the Guest to give us a packet. */ 899 head = wait_for_vq_desc(vq, iov, &out, &in); 900 if (in) 901 errx(1, "Input buffers in net output queue?"); 902 /* 903 * Send the whole thing through to /dev/net/tun. It expects the exact 904 * same format: what a coincidence! 905 */ 906 if (writev(net_info->tunfd, iov, out) < 0) 907 errx(1, "Write to tun failed?"); 908 909 /* 910 * Done with that one; wait_for_vq_desc() will send the interrupt if 911 * all packets are processed. 912 */ 913 add_used(vq, head, 0); 914 } 915 916 /* 917 * Handling network input is a bit trickier, because I've tried to optimize it. 918 * 919 * First we have a helper routine which tells is if from this file descriptor 920 * (ie. the /dev/net/tun device) will block: 921 */ 922 static bool will_block(int fd) 923 { 924 fd_set fdset; 925 struct timeval zero = { 0, 0 }; 926 FD_ZERO(&fdset); 927 FD_SET(fd, &fdset); 928 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 929 } 930 931 /* 932 * This handles packets coming in from the tun device to our Guest. Like all 933 * service routines, it gets called again as soon as it returns, so you don't 934 * see a while(1) loop here. 935 */ 936 static void net_input(struct virtqueue *vq) 937 { 938 int len; 939 unsigned int head, out, in; 940 struct iovec iov[vq->vring.num]; 941 struct net_info *net_info = vq->dev->priv; 942 943 /* 944 * Get a descriptor to write an incoming packet into. This will also 945 * send an interrupt if they're out of descriptors. 946 */ 947 head = wait_for_vq_desc(vq, iov, &out, &in); 948 if (out) 949 errx(1, "Output buffers in net input queue?"); 950 951 /* 952 * If it looks like we'll block reading from the tun device, send them 953 * an interrupt. 
954 */ 955 if (vq->pending_used && will_block(net_info->tunfd)) 956 trigger_irq(vq); 957 958 /* 959 * Read in the packet. This is where we normally wait (when there's no 960 * incoming network traffic). 961 */ 962 len = readv(net_info->tunfd, iov, in); 963 if (len <= 0) 964 err(1, "Failed to read from tun."); 965 966 /* 967 * Mark that packet buffer as used, but don't interrupt here. We want 968 * to wait until we've done as much work as we can. 969 */ 970 add_used(vq, head, len); 971 } 972 /*:*/ 973 974 /* This is the helper to create threads: run the service routine in a loop. */ 975 static int do_thread(void *_vq) 976 { 977 struct virtqueue *vq = _vq; 978 979 for (;;) 980 vq->service(vq); 981 return 0; 982 } 983 984 /* 985 * When a child dies, we kill our entire process group with SIGTERM. This 986 * also has the side effect that the shell restores the console for us! 987 */ 988 static void kill_launcher(int signal) 989 { 990 kill(0, SIGTERM); 991 } 992 993 static void reset_device(struct device *dev) 994 { 995 struct virtqueue *vq; 996 997 verbose("Resetting device %s\n", dev->name); 998 999 /* Clear any features they've acked. */ 1000 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); 1001 1002 /* We're going to be explicitly killing threads, so ignore them. */ 1003 signal(SIGCHLD, SIG_IGN); 1004 1005 /* Zero out the virtqueues, get rid of their threads */ 1006 for (vq = dev->vq; vq; vq = vq->next) { 1007 if (vq->thread != (pid_t)-1) { 1008 kill(vq->thread, SIGTERM); 1009 waitpid(vq->thread, NULL, 0); 1010 vq->thread = (pid_t)-1; 1011 } 1012 memset(vq->vring.desc, 0, 1013 vring_size(vq->config.num, LGUEST_VRING_ALIGN)); 1014 lg_last_avail(vq) = 0; 1015 } 1016 dev->running = false; 1017 1018 /* Now we care if threads die. */ 1019 signal(SIGCHLD, (void *)kill_launcher); 1020 } 1021 1022 /*L:216 1023 * This actually creates the thread which services the virtqueue for a device. 
1024 */ 1025 static void create_thread(struct virtqueue *vq) 1026 { 1027 /* 1028 * Create stack for thread. Since the stack grows upwards, we point 1029 * the stack pointer to the end of this region. 1030 */ 1031 char *stack = malloc(32768); 1032 unsigned long args[] = { LHREQ_EVENTFD, 1033 vq->config.pfn*getpagesize(), 0 }; 1034 1035 /* Create a zero-initialized eventfd. */ 1036 vq->eventfd = eventfd(0, 0); 1037 if (vq->eventfd < 0) 1038 err(1, "Creating eventfd"); 1039 args[2] = vq->eventfd; 1040 1041 /* 1042 * Attach an eventfd to this virtqueue: it will go off when the Guest 1043 * does an LHCALL_NOTIFY for this vq. 1044 */ 1045 if (write(lguest_fd, &args, sizeof(args)) != 0) 1046 err(1, "Attaching eventfd"); 1047 1048 /* 1049 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so 1050 * we get a signal if it dies. 1051 */ 1052 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1053 if (vq->thread == (pid_t)-1) 1054 err(1, "Creating clone"); 1055 1056 /* We close our local copy now the child has it. 
*/ 1057 close(vq->eventfd); 1058 } 1059 1060 static bool accepted_feature(struct device *dev, unsigned int bit) 1061 { 1062 const u8 *features = get_feature_bits(dev) + dev->feature_len; 1063 1064 if (dev->feature_len < bit / CHAR_BIT) 1065 return false; 1066 return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); 1067 } 1068 1069 static void start_device(struct device *dev) 1070 { 1071 unsigned int i; 1072 struct virtqueue *vq; 1073 1074 verbose("Device %s OK: offered", dev->name); 1075 for (i = 0; i < dev->feature_len; i++) 1076 verbose(" %02x", get_feature_bits(dev)[i]); 1077 verbose(", accepted"); 1078 for (i = 0; i < dev->feature_len; i++) 1079 verbose(" %02x", get_feature_bits(dev) 1080 [dev->feature_len+i]); 1081 1082 dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); 1083 1084 for (vq = dev->vq; vq; vq = vq->next) { 1085 if (vq->service) 1086 create_thread(vq); 1087 } 1088 dev->running = true; 1089 } 1090 1091 static void cleanup_devices(void) 1092 { 1093 struct device *dev; 1094 1095 for (dev = devices.dev; dev; dev = dev->next) 1096 reset_device(dev); 1097 1098 /* If we saved off the original terminal settings, restore them now. */ 1099 if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) 1100 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); 1101 } 1102 1103 /* When the Guest tells us they updated the status field, we handle it. */ 1104 static void update_device_status(struct device *dev) 1105 { 1106 /* A zero status is a reset, otherwise it's a set of flags. */ 1107 if (dev->desc->status == 0) 1108 reset_device(dev); 1109 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 1110 warnx("Device %s configuration FAILED", dev->name); 1111 if (dev->running) 1112 reset_device(dev); 1113 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 1114 if (!dev->running) 1115 start_device(dev); 1116 } 1117 } 1118 1119 /*L:215 1120 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. 
In 1121 * particular, it's used to notify us of device status changes during boot. 1122 */ 1123 static void handle_output(unsigned long addr) 1124 { 1125 struct device *i; 1126 1127 /* Check each device. */ 1128 for (i = devices.dev; i; i = i->next) { 1129 struct virtqueue *vq; 1130 1131 /* 1132 * Notifications to device descriptors mean they updated the 1133 * device status. 1134 */ 1135 if (from_guest_phys(addr) == i->desc) { 1136 update_device_status(i); 1137 return; 1138 } 1139 1140 /* 1141 * Devices *can* be used before status is set to DRIVER_OK. 1142 * The original plan was that they would never do this: they 1143 * would always finish setting up their status bits before 1144 * actually touching the virtqueues. In practice, we allowed 1145 * them to, and they do (eg. the disk probes for partition 1146 * tables as part of initialization). 1147 * 1148 * If we see this, we start the device: once it's running, we 1149 * expect the device to catch all the notifications. 1150 */ 1151 for (vq = i->vq; vq; vq = vq->next) { 1152 if (addr != vq->config.pfn*getpagesize()) 1153 continue; 1154 if (i->running) 1155 errx(1, "Notification on running %s", i->name); 1156 /* This just calls create_thread() for each virtqueue */ 1157 start_device(i); 1158 return; 1159 } 1160 } 1161 1162 /* 1163 * Early console write is done using notify on a nul-terminated string 1164 * in Guest memory. It's also great for hacking debugging messages 1165 * into a Guest. 1166 */ 1167 if (addr >= guest_limit) 1168 errx(1, "Bad NOTIFY %#lx", addr); 1169 1170 write(STDOUT_FILENO, from_guest_phys(addr), 1171 strnlen(from_guest_phys(addr), guest_limit - addr)); 1172 } 1173 1174 /*L:190 1175 * Device Setup 1176 * 1177 * All devices need a descriptor so the Guest knows it exists, and a "struct 1178 * device" so the Launcher can keep track of it. We have common helper 1179 * routines to allocate and manage them. 
1180 */ 1181 1182 /* 1183 * The layout of the device page is a "struct lguest_device_desc" followed by a 1184 * number of virtqueue descriptors, then two sets of feature bits, then an 1185 * array of configuration bytes. This routine returns the configuration 1186 * pointer. 1187 */ 1188 static u8 *device_config(const struct device *dev) 1189 { 1190 return (void *)(dev->desc + 1) 1191 + dev->num_vq * sizeof(struct lguest_vqconfig) 1192 + dev->feature_len * 2; 1193 } 1194 1195 /* 1196 * This routine allocates a new "struct lguest_device_desc" from descriptor 1197 * table page just above the Guest's normal memory. It returns a pointer to 1198 * that descriptor. 1199 */ 1200 static struct lguest_device_desc *new_dev_desc(u16 type) 1201 { 1202 struct lguest_device_desc d = { .type = type }; 1203 void *p; 1204 1205 /* Figure out where the next device config is, based on the last one. */ 1206 if (devices.lastdev) 1207 p = device_config(devices.lastdev) 1208 + devices.lastdev->desc->config_len; 1209 else 1210 p = devices.descpage; 1211 1212 /* We only have one page for all the descriptors. */ 1213 if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) 1214 errx(1, "Too many devices"); 1215 1216 /* p might not be aligned, so we memcpy in. */ 1217 return memcpy(p, &d, sizeof(d)); 1218 } 1219 1220 /* 1221 * Each device descriptor is followed by the description of its virtqueues. We 1222 * specify how many descriptors the virtqueue is to have. 1223 */ 1224 static void add_virtqueue(struct device *dev, unsigned int num_descs, 1225 void (*service)(struct virtqueue *)) 1226 { 1227 unsigned int pages; 1228 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1229 void *p; 1230 1231 /* First we need some memory for this virtqueue. 
*/ 1232 pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1) 1233 / getpagesize(); 1234 p = get_pages(pages); 1235 1236 /* Initialize the virtqueue */ 1237 vq->next = NULL; 1238 vq->last_avail_idx = 0; 1239 vq->dev = dev; 1240 1241 /* 1242 * This is the routine the service thread will run, and its Process ID 1243 * once it's running. 1244 */ 1245 vq->service = service; 1246 vq->thread = (pid_t)-1; 1247 1248 /* Initialize the configuration. */ 1249 vq->config.num = num_descs; 1250 vq->config.irq = devices.next_irq++; 1251 vq->config.pfn = to_guest_phys(p) / getpagesize(); 1252 1253 /* Initialize the vring. */ 1254 vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); 1255 1256 /* 1257 * Append virtqueue to this device's descriptor. We use 1258 * device_config() to get the end of the device's current virtqueues; 1259 * we check that we haven't added any config or feature information 1260 * yet, otherwise we'd be overwriting them. 1261 */ 1262 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1263 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1264 dev->num_vq++; 1265 dev->desc->num_vq++; 1266 1267 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1268 1269 /* 1270 * Add to tail of list, so dev->vq is first vq, dev->vq->next is 1271 * second. 1272 */ 1273 for (i = &dev->vq; *i; i = &(*i)->next); 1274 *i = vq; 1275 } 1276 1277 /* 1278 * The first half of the feature bitmask is for us to advertise features. The 1279 * second half is for the Guest to accept features. 
1280 */ 1281 static void add_feature(struct device *dev, unsigned bit) 1282 { 1283 u8 *features = get_feature_bits(dev); 1284 1285 /* We can't extend the feature bits once we've added config bytes */ 1286 if (dev->desc->feature_len <= bit / CHAR_BIT) { 1287 assert(dev->desc->config_len == 0); 1288 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; 1289 } 1290 1291 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1292 } 1293 1294 /* 1295 * This routine sets the configuration fields for an existing device's 1296 * descriptor. It only works for the last device, but that's OK because that's 1297 * how we use it. 1298 */ 1299 static void set_config(struct device *dev, unsigned len, const void *conf) 1300 { 1301 /* Check we haven't overflowed our single page. */ 1302 if (device_config(dev) + len > devices.descpage + getpagesize()) 1303 errx(1, "Too many devices"); 1304 1305 /* Copy in the config information, and store the length. */ 1306 memcpy(device_config(dev), conf, len); 1307 dev->desc->config_len = len; 1308 1309 /* Size must fit in config_len field (8 bits)! */ 1310 assert(dev->desc->config_len == len); 1311 } 1312 1313 /* 1314 * This routine does all the creation and setup of a new device, including 1315 * calling new_dev_desc() to allocate the descriptor and device memory. We 1316 * don't actually start the service threads until later. 1317 * 1318 * See what I mean about userspace being boring? 1319 */ 1320 static struct device *new_device(const char *name, u16 type) 1321 { 1322 struct device *dev = malloc(sizeof(*dev)); 1323 1324 /* Now we populate the fields one at a time. */ 1325 dev->desc = new_dev_desc(type); 1326 dev->name = name; 1327 dev->vq = NULL; 1328 dev->feature_len = 0; 1329 dev->num_vq = 0; 1330 dev->running = false; 1331 1332 /* 1333 * Append to device list. Prepending to a single-linked list is 1334 * easier, but the user expects the devices to be arranged on the bus 1335 * in command-line order. 
The first network device on the command line 1336 * is eth0, the first block device /dev/vda, etc. 1337 */ 1338 if (devices.lastdev) 1339 devices.lastdev->next = dev; 1340 else 1341 devices.dev = dev; 1342 devices.lastdev = dev; 1343 1344 return dev; 1345 } 1346 1347 /* 1348 * Our first setup routine is the console. It's a fairly simple device, but 1349 * UNIX tty handling makes it uglier than it could be. 1350 */ 1351 static void setup_console(void) 1352 { 1353 struct device *dev; 1354 1355 /* If we can save the initial standard input settings... */ 1356 if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { 1357 struct termios term = orig_term; 1358 /* 1359 * Then we turn off echo, line buffering and ^C etc: We want a 1360 * raw input stream to the Guest. 1361 */ 1362 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1363 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1364 } 1365 1366 dev = new_device("console", VIRTIO_ID_CONSOLE); 1367 1368 /* We store the console state in dev->priv, and initialize it. */ 1369 dev->priv = malloc(sizeof(struct console_abort)); 1370 ((struct console_abort *)dev->priv)->count = 0; 1371 1372 /* 1373 * The console needs two virtqueues: the input then the output. When 1374 * they put something the input queue, we make sure we're listening to 1375 * stdin. When they put something in the output queue, we write it to 1376 * stdout. 1377 */ 1378 add_virtqueue(dev, VIRTQUEUE_NUM, console_input); 1379 add_virtqueue(dev, VIRTQUEUE_NUM, console_output); 1380 1381 verbose("device %u: console\n", ++devices.device_num); 1382 } 1383 /*:*/ 1384 1385 /*M:010 1386 * Inter-guest networking is an interesting area. Simplest is to have a 1387 * --sharenet=<name> option which opens or creates a named pipe. This can be 1388 * used to send packets to another guest in a 1:1 manner. 1389 * 1390 * More sopisticated is to use one of the tools developed for project like UML 1391 * to do networking. 1392 * 1393 * Faster is to do virtio bonding in kernel. 
Doing this 1:1 would be 1394 * completely generic ("here's my vring, attach to your vring") and would work 1395 * for any traffic. Of course, namespace and permissions issues need to be 1396 * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide 1397 * multiple inter-guest channels behind one interface, although it would 1398 * require some manner of hotplugging new virtio channels. 1399 * 1400 * Finally, we could implement a virtio network switch in the kernel. 1401 :*/ 1402 1403 static u32 str2ip(const char *ipaddr) 1404 { 1405 unsigned int b[4]; 1406 1407 if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) 1408 errx(1, "Failed to parse IP address '%s'", ipaddr); 1409 return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; 1410 } 1411 1412 static void str2mac(const char *macaddr, unsigned char mac[6]) 1413 { 1414 unsigned int m[6]; 1415 if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", 1416 &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) 1417 errx(1, "Failed to parse mac address '%s'", macaddr); 1418 mac[0] = m[0]; 1419 mac[1] = m[1]; 1420 mac[2] = m[2]; 1421 mac[3] = m[3]; 1422 mac[4] = m[4]; 1423 mac[5] = m[5]; 1424 } 1425 1426 /* 1427 * This code is "adapted" from libbridge: it attaches the Host end of the 1428 * network device to the bridge device specified by the command line. 1429 * 1430 * This is yet another James Morris contribution (I'm an IP-level guy, so I 1431 * dislike bridging), and I just try not to break it. 
1432 */ 1433 static void add_to_bridge(int fd, const char *if_name, const char *br_name) 1434 { 1435 int ifidx; 1436 struct ifreq ifr; 1437 1438 if (!*br_name) 1439 errx(1, "must specify bridge name"); 1440 1441 ifidx = if_nametoindex(if_name); 1442 if (!ifidx) 1443 errx(1, "interface %s does not exist!", if_name); 1444 1445 strncpy(ifr.ifr_name, br_name, IFNAMSIZ); 1446 ifr.ifr_name[IFNAMSIZ-1] = '\0'; 1447 ifr.ifr_ifindex = ifidx; 1448 if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) 1449 err(1, "can't add %s to bridge %s", if_name, br_name); 1450 } 1451 1452 /* 1453 * This sets up the Host end of the network device with an IP address, brings 1454 * it up so packets will flow, the copies the MAC address into the hwaddr 1455 * pointer. 1456 */ 1457 static void configure_device(int fd, const char *tapif, u32 ipaddr) 1458 { 1459 struct ifreq ifr; 1460 struct sockaddr_in sin; 1461 1462 memset(&ifr, 0, sizeof(ifr)); 1463 strcpy(ifr.ifr_name, tapif); 1464 1465 /* Don't read these incantations. Just cut & paste them like I did! */ 1466 sin.sin_family = AF_INET; 1467 sin.sin_addr.s_addr = htonl(ipaddr); 1468 memcpy(&ifr.ifr_addr, &sin, sizeof(sin)); 1469 if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) 1470 err(1, "Setting %s interface address", tapif); 1471 ifr.ifr_flags = IFF_UP; 1472 if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) 1473 err(1, "Bringing interface %s up", tapif); 1474 } 1475 1476 static int get_tun_device(char tapif[IFNAMSIZ]) 1477 { 1478 struct ifreq ifr; 1479 int netfd; 1480 1481 /* Start with this zeroed. Messy but sure. */ 1482 memset(&ifr, 0, sizeof(ifr)); 1483 1484 /* 1485 * We open the /dev/net/tun device and tell it we want a tap device. A 1486 * tap device is like a tun device, only somehow different. To tell 1487 * the truth, I completely blundered my way through this code, but it 1488 * works now! 
1489 */ 1490 netfd = open_or_die("/dev/net/tun", O_RDWR); 1491 ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; 1492 strcpy(ifr.ifr_name, "tap%d"); 1493 if (ioctl(netfd, TUNSETIFF, &ifr) != 0) 1494 err(1, "configuring /dev/net/tun"); 1495 1496 if (ioctl(netfd, TUNSETOFFLOAD, 1497 TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) 1498 err(1, "Could not set features for tun device"); 1499 1500 /* 1501 * We don't need checksums calculated for packets coming in this 1502 * device: trust us! 1503 */ 1504 ioctl(netfd, TUNSETNOCSUM, 1); 1505 1506 memcpy(tapif, ifr.ifr_name, IFNAMSIZ); 1507 return netfd; 1508 } 1509 1510 /*L:195 1511 * Our network is a Host<->Guest network. This can either use bridging or 1512 * routing, but the principle is the same: it uses the "tun" device to inject 1513 * packets into the Host as if they came in from a normal network card. We 1514 * just shunt packets between the Guest and the tun device. 1515 */ 1516 static void setup_tun_net(char *arg) 1517 { 1518 struct device *dev; 1519 struct net_info *net_info = malloc(sizeof(*net_info)); 1520 int ipfd; 1521 u32 ip = INADDR_ANY; 1522 bool bridging = false; 1523 char tapif[IFNAMSIZ], *p; 1524 struct virtio_net_config conf; 1525 1526 net_info->tunfd = get_tun_device(tapif); 1527 1528 /* First we create a new network device. */ 1529 dev = new_device("net", VIRTIO_ID_NET); 1530 dev->priv = net_info; 1531 1532 /* Network devices need a recv and a send queue, just like console. */ 1533 add_virtqueue(dev, VIRTQUEUE_NUM, net_input); 1534 add_virtqueue(dev, VIRTQUEUE_NUM, net_output); 1535 1536 /* 1537 * We need a socket to perform the magic network ioctls to bring up the 1538 * tap interface, connect to the bridge etc. Any socket will do! 1539 */ 1540 ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); 1541 if (ipfd < 0) 1542 err(1, "opening IP socket"); 1543 1544 /* If the command line was --tunnet=bridge:<name> do bridging. 
*/ 1545 if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { 1546 arg += strlen(BRIDGE_PFX); 1547 bridging = true; 1548 } 1549 1550 /* A mac address may follow the bridge name or IP address */ 1551 p = strchr(arg, ':'); 1552 if (p) { 1553 str2mac(p+1, conf.mac); 1554 add_feature(dev, VIRTIO_NET_F_MAC); 1555 *p = '\0'; 1556 } 1557 1558 /* arg is now either an IP address or a bridge name */ 1559 if (bridging) 1560 add_to_bridge(ipfd, tapif, arg); 1561 else 1562 ip = str2ip(arg); 1563 1564 /* Set up the tun device. */ 1565 configure_device(ipfd, tapif, ip); 1566 1567 add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); 1568 /* Expect Guest to handle everything except UFO */ 1569 add_feature(dev, VIRTIO_NET_F_CSUM); 1570 add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); 1571 add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); 1572 add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); 1573 add_feature(dev, VIRTIO_NET_F_GUEST_ECN); 1574 add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 1575 add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 1576 add_feature(dev, VIRTIO_NET_F_HOST_ECN); 1577 /* We handle indirect ring entries */ 1578 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); 1579 set_config(dev, sizeof(conf), &conf); 1580 1581 /* We don't need the socket any more; setup is done. */ 1582 close(ipfd); 1583 1584 devices.device_num++; 1585 1586 if (bridging) 1587 verbose("device %u: tun %s attached to bridge: %s\n", 1588 devices.device_num, tapif, arg); 1589 else 1590 verbose("device %u: tun %s: %s\n", 1591 devices.device_num, tapif, arg); 1592 } 1593 /*:*/ 1594 1595 /* This hangs off device->priv. */ 1596 struct vblk_info { 1597 /* The size of the file. */ 1598 off64_t len; 1599 1600 /* The file descriptor for the file. */ 1601 int fd; 1602 1603 }; 1604 1605 /*L:210 1606 * The Disk 1607 * 1608 * The disk only has one virtqueue, so it only has one thread. It is really 1609 * simple: the Guest asks for a block number and we read or write that position 1610 * in the file. 
1611 * 1612 * Before we serviced each virtqueue in a separate thread, that was unacceptably 1613 * slow: the Guest waits until the read is finished before running anything 1614 * else, even if it could have been doing useful work. 1615 * 1616 * We could have used async I/O, except it's reputed to suck so hard that 1617 * characters actually go missing from your code when you try to use it. 1618 */ 1619 static void blk_request(struct virtqueue *vq) 1620 { 1621 struct vblk_info *vblk = vq->dev->priv; 1622 unsigned int head, out_num, in_num, wlen; 1623 int ret; 1624 u8 *in; 1625 struct virtio_blk_outhdr *out; 1626 struct iovec iov[vq->vring.num]; 1627 off64_t off; 1628 1629 /* 1630 * Get the next request, where we normally wait. It triggers the 1631 * interrupt to acknowledge previously serviced requests (if any). 1632 */ 1633 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1634 1635 /* 1636 * Every block request should contain at least one output buffer 1637 * (detailing the location on disk and the type of request) and one 1638 * input buffer (to hold the result). 1639 */ 1640 if (out_num == 0 || in_num == 0) 1641 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1642 head, out_num, in_num); 1643 1644 out = convert(&iov[0], struct virtio_blk_outhdr); 1645 in = convert(&iov[out_num+in_num-1], u8); 1646 /* 1647 * For historical reasons, block operations are expressed in 512 byte 1648 * "sectors". 1649 */ 1650 off = out->sector * 512; 1651 1652 /* 1653 * In general the virtio block driver is allowed to try SCSI commands. 1654 * It'd be nice if we supported eject, for example, but we don't. 1655 */ 1656 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1657 fprintf(stderr, "Scsi commands unsupported\n"); 1658 *in = VIRTIO_BLK_S_UNSUPP; 1659 wlen = sizeof(*in); 1660 } else if (out->type & VIRTIO_BLK_T_OUT) { 1661 /* 1662 * Write 1663 * 1664 * Move to the right location in the block file. This can fail 1665 * if they try to write past end. 
1666 */ 1667 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1668 err(1, "Bad seek to sector %llu", out->sector); 1669 1670 ret = writev(vblk->fd, iov+1, out_num-1); 1671 verbose("WRITE to sector %llu: %i\n", out->sector, ret); 1672 1673 /* 1674 * Grr... Now we know how long the descriptor they sent was, we 1675 * make sure they didn't try to write over the end of the block 1676 * file (possibly extending it). 1677 */ 1678 if (ret > 0 && off + ret > vblk->len) { 1679 /* Trim it back to the correct length */ 1680 ftruncate64(vblk->fd, vblk->len); 1681 /* Die, bad Guest, die. */ 1682 errx(1, "Write past end %llu+%u", off, ret); 1683 } 1684 1685 wlen = sizeof(*in); 1686 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1687 } else if (out->type & VIRTIO_BLK_T_FLUSH) { 1688 /* Flush */ 1689 ret = fdatasync(vblk->fd); 1690 verbose("FLUSH fdatasync: %i\n", ret); 1691 wlen = sizeof(*in); 1692 *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); 1693 } else { 1694 /* 1695 * Read 1696 * 1697 * Move to the right location in the block file. This can fail 1698 * if they try to read past end. 1699 */ 1700 if (lseek64(vblk->fd, off, SEEK_SET) != off) 1701 err(1, "Bad seek to sector %llu", out->sector); 1702 1703 ret = readv(vblk->fd, iov+1, in_num-1); 1704 verbose("READ from sector %llu: %i\n", out->sector, ret); 1705 if (ret >= 0) { 1706 wlen = sizeof(*in) + ret; 1707 *in = VIRTIO_BLK_S_OK; 1708 } else { 1709 wlen = sizeof(*in); 1710 *in = VIRTIO_BLK_S_IOERR; 1711 } 1712 } 1713 1714 /* Finished that request. */ 1715 add_used(vq, head, wlen); 1716 } 1717 1718 /*L:198 This actually sets up a virtual block device. */ 1719 static void setup_block_file(const char *filename) 1720 { 1721 struct device *dev; 1722 struct vblk_info *vblk; 1723 struct virtio_blk_config conf; 1724 1725 /* Creat the device. */ 1726 dev = new_device("block", VIRTIO_ID_BLOCK); 1727 1728 /* The device has one virtqueue, where the Guest places requests. 
*/ 1729 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); 1730 1731 /* Allocate the room for our own bookkeeping */ 1732 vblk = dev->priv = malloc(sizeof(*vblk)); 1733 1734 /* First we open the file and store the length. */ 1735 vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); 1736 vblk->len = lseek64(vblk->fd, 0, SEEK_END); 1737 1738 /* We support FLUSH. */ 1739 add_feature(dev, VIRTIO_BLK_F_FLUSH); 1740 1741 /* Tell Guest how many sectors this device has. */ 1742 conf.capacity = cpu_to_le64(vblk->len / 512); 1743 1744 /* 1745 * Tell Guest not to put in too many descriptors at once: two are used 1746 * for the in and out elements. 1747 */ 1748 add_feature(dev, VIRTIO_BLK_F_SEG_MAX); 1749 conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); 1750 1751 /* Don't try to put whole struct: we have 8 bit limit. */ 1752 set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); 1753 1754 verbose("device %u: virtblock %llu sectors\n", 1755 ++devices.device_num, le64_to_cpu(conf.capacity)); 1756 } 1757 1758 /*L:211 1759 * Our random number generator device reads from /dev/random into the Guest's 1760 * input buffers. The usual case is that the Guest doesn't want random numbers 1761 * and so has no buffers although /dev/random is still readable, whereas 1762 * console is the reverse. 1763 * 1764 * The same logic applies, however. 1765 */ 1766 struct rng_info { 1767 int rfd; 1768 }; 1769 1770 static void rng_input(struct virtqueue *vq) 1771 { 1772 int len; 1773 unsigned int head, in_num, out_num, totlen = 0; 1774 struct rng_info *rng_info = vq->dev->priv; 1775 struct iovec iov[vq->vring.num]; 1776 1777 /* First we need a buffer from the Guests's virtqueue. */ 1778 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1779 if (out_num) 1780 errx(1, "Output buffers in rng?"); 1781 1782 /* 1783 * Just like the console write, we loop to cover the whole iovec. 1784 * In this case, short reads actually happen quite a bit. 
1785 */ 1786 while (!iov_empty(iov, in_num)) { 1787 len = readv(rng_info->rfd, iov, in_num); 1788 if (len <= 0) 1789 err(1, "Read from /dev/random gave %i", len); 1790 iov_consume(iov, in_num, len); 1791 totlen += len; 1792 } 1793 1794 /* Tell the Guest about the new input. */ 1795 add_used(vq, head, totlen); 1796 } 1797 1798 /*L:199 1799 * This creates a "hardware" random number device for the Guest. 1800 */ 1801 static void setup_rng(void) 1802 { 1803 struct device *dev; 1804 struct rng_info *rng_info = malloc(sizeof(*rng_info)); 1805 1806 /* Our device's privat info simply contains the /dev/random fd. */ 1807 rng_info->rfd = open_or_die("/dev/random", O_RDONLY); 1808 1809 /* Create the new device. */ 1810 dev = new_device("rng", VIRTIO_ID_RNG); 1811 dev->priv = rng_info; 1812 1813 /* The device has one virtqueue, where the Guest places inbufs. */ 1814 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); 1815 1816 verbose("device %u: rng\n", devices.device_num++); 1817 } 1818 /* That's the end of device setup. */ 1819 1820 /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ 1821 static void __attribute__((noreturn)) restart_guest(void) 1822 { 1823 unsigned int i; 1824 1825 /* 1826 * Since we don't track all open fds, we simply close everything beyond 1827 * stderr. 1828 */ 1829 for (i = 3; i < FD_SETSIZE; i++) 1830 close(i); 1831 1832 /* Reset all the devices (kills all threads). */ 1833 cleanup_devices(); 1834 1835 execv(main_args[0], main_args); 1836 err(1, "Could not exec %s", main_args[0]); 1837 } 1838 1839 /*L:220 1840 * Finally we reach the core of the Launcher which runs the Guest, serves 1841 * its input and output, and finally, lays it to rest. 1842 */ 1843 static void __attribute__((noreturn)) run_guest(void) 1844 { 1845 for (;;) { 1846 unsigned long notify_addr; 1847 int readval; 1848 1849 /* We read from the /dev/lguest device to run the Guest. 
*/ 1850 readval = pread(lguest_fd, ¬ify_addr, 1851 sizeof(notify_addr), cpu_id); 1852 1853 /* One unsigned long means the Guest did HCALL_NOTIFY */ 1854 if (readval == sizeof(notify_addr)) { 1855 verbose("Notify on address %#lx\n", notify_addr); 1856 handle_output(notify_addr); 1857 /* ENOENT means the Guest died. Reading tells us why. */ 1858 } else if (errno == ENOENT) { 1859 char reason[1024] = { 0 }; 1860 pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); 1861 errx(1, "%s", reason); 1862 /* ERESTART means that we need to reboot the guest */ 1863 } else if (errno == ERESTART) { 1864 restart_guest(); 1865 /* Anything else means a bug or incompatible change. */ 1866 } else 1867 err(1, "Running guest failed"); 1868 } 1869 } 1870 /*L:240 1871 * This is the end of the Launcher. The good news: we are over halfway 1872 * through! The bad news: the most fiendish part of the code still lies ahead 1873 * of us. 1874 * 1875 * Are you ready? Take a deep breath and join me in the core of the Host, in 1876 * "make Host". 1877 :*/ 1878 1879 static struct option opts[] = { 1880 { "verbose", 0, NULL, 'v' }, 1881 { "tunnet", 1, NULL, 't' }, 1882 { "block", 1, NULL, 'b' }, 1883 { "rng", 0, NULL, 'r' }, 1884 { "initrd", 1, NULL, 'i' }, 1885 { "username", 1, NULL, 'u' }, 1886 { "chroot", 1, NULL, 'c' }, 1887 { NULL }, 1888 }; 1889 static void usage(void) 1890 { 1891 errx(1, "Usage: lguest [--verbose] " 1892 "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" 1893 "|--block=<filename>|--initrd=<filename>]...\n" 1894 "<mem-in-mb> vmlinux [args...]"); 1895 } 1896 1897 /*L:105 The main routine is where the real work begins: */ 1898 int main(int argc, char *argv[]) 1899 { 1900 /* Memory, code startpoint and size of the (optional) initrd. */ 1901 unsigned long mem = 0, start, initrd_size = 0; 1902 /* Two temporaries. */ 1903 int i, c; 1904 /* The boot information for the Guest. */ 1905 struct boot_params *boot; 1906 /* If they specify an initrd file to load. 
*/ 1907 const char *initrd_name = NULL; 1908 1909 /* Password structure for initgroups/setres[gu]id */ 1910 struct passwd *user_details = NULL; 1911 1912 /* Directory to chroot to */ 1913 char *chroot_path = NULL; 1914 1915 /* Save the args: we "reboot" by execing ourselves again. */ 1916 main_args = argv; 1917 1918 /* 1919 * First we initialize the device list. We keep a pointer to the last 1920 * device, and the next interrupt number to use for devices (1: 1921 * remember that 0 is used by the timer). 1922 */ 1923 devices.lastdev = NULL; 1924 devices.next_irq = 1; 1925 1926 /* We're CPU 0. In fact, that's the only CPU possible right now. */ 1927 cpu_id = 0; 1928 1929 /* 1930 * We need to know how much memory so we can set up the device 1931 * descriptor and memory pages for the devices as we parse the command 1932 * line. So we quickly look through the arguments to find the amount 1933 * of memory now. 1934 */ 1935 for (i = 1; i < argc; i++) { 1936 if (argv[i][0] != '-') { 1937 mem = atoi(argv[i]) * 1024 * 1024; 1938 /* 1939 * We start by mapping anonymous pages over all of 1940 * guest-physical memory range. This fills it with 0, 1941 * and ensures that the Guest won't be killed when it 1942 * tries to access it. 
1943 */ 1944 guest_base = map_zeroed_pages(mem / getpagesize() 1945 + DEVICE_PAGES); 1946 guest_limit = mem; 1947 guest_max = mem + DEVICE_PAGES*getpagesize(); 1948 devices.descpage = get_pages(1); 1949 break; 1950 } 1951 } 1952 1953 /* The options are fairly straight-forward */ 1954 while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) { 1955 switch (c) { 1956 case 'v': 1957 verbose = true; 1958 break; 1959 case 't': 1960 setup_tun_net(optarg); 1961 break; 1962 case 'b': 1963 setup_block_file(optarg); 1964 break; 1965 case 'r': 1966 setup_rng(); 1967 break; 1968 case 'i': 1969 initrd_name = optarg; 1970 break; 1971 case 'u': 1972 user_details = getpwnam(optarg); 1973 if (!user_details) 1974 err(1, "getpwnam failed, incorrect username?"); 1975 break; 1976 case 'c': 1977 chroot_path = optarg; 1978 break; 1979 default: 1980 warnx("Unknown argument %s", argv[optind]); 1981 usage(); 1982 } 1983 } 1984 /* 1985 * After the other arguments we expect memory and kernel image name, 1986 * followed by command line arguments for the kernel. 1987 */ 1988 if (optind + 2 > argc) 1989 usage(); 1990 1991 verbose("Guest base is at %p\n", guest_base); 1992 1993 /* We always have a console device */ 1994 setup_console(); 1995 1996 /* Now we load the kernel */ 1997 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1998 1999 /* Boot information is stashed at physical address 0 */ 2000 boot = from_guest_phys(0); 2001 2002 /* Map the initrd image if requested (at top of physical memory) */ 2003 if (initrd_name) { 2004 initrd_size = load_initrd(initrd_name, mem); 2005 /* 2006 * These are the location in the Linux boot header where the 2007 * start and size of the initrd are expected to be found. 2008 */ 2009 boot->hdr.ramdisk_image = mem - initrd_size; 2010 boot->hdr.ramdisk_size = initrd_size; 2011 /* The bootloader type 0xFF means "unknown"; that's OK. 
*/ 2012 boot->hdr.type_of_loader = 0xFF; 2013 } 2014 2015 /* 2016 * The Linux boot header contains an "E820" memory map: ours is a 2017 * simple, single region. 2018 */ 2019 boot->e820_entries = 1; 2020 boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); 2021 /* 2022 * The boot header contains a command line pointer: we put the command 2023 * line after the boot header. 2024 */ 2025 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 2026 /* We use a simple helper to copy the arguments separated by spaces. */ 2027 concat((char *)(boot + 1), argv+optind+2); 2028 2029 /* Boot protocol version: 2.07 supports the fields for lguest. */ 2030 boot->hdr.version = 0x207; 2031 2032 /* The hardware_subarch value of "1" tells the Guest it's an lguest. */ 2033 boot->hdr.hardware_subarch = 1; 2034 2035 /* Tell the entry path not to try to reload segment registers. */ 2036 boot->hdr.loadflags |= KEEP_SEGMENTS; 2037 2038 /* 2039 * We tell the kernel to initialize the Guest: this returns the open 2040 * /dev/lguest file descriptor. 2041 */ 2042 tell_kernel(start); 2043 2044 /* Ensure that we terminate if a device-servicing child dies. */ 2045 signal(SIGCHLD, kill_launcher); 2046 2047 /* If we exit via err(), this kills all the threads, restores tty. 
*/ 2048 atexit(cleanup_devices); 2049 2050 /* If requested, chroot to a directory */ 2051 if (chroot_path) { 2052 if (chroot(chroot_path) != 0) 2053 err(1, "chroot(\"%s\") failed", chroot_path); 2054 2055 if (chdir("/") != 0) 2056 err(1, "chdir(\"/\") failed"); 2057 2058 verbose("chroot done\n"); 2059 } 2060 2061 /* If requested, drop privileges */ 2062 if (user_details) { 2063 uid_t u; 2064 gid_t g; 2065 2066 u = user_details->pw_uid; 2067 g = user_details->pw_gid; 2068 2069 if (initgroups(user_details->pw_name, g) != 0) 2070 err(1, "initgroups failed"); 2071 2072 if (setresgid(g, g, g) != 0) 2073 err(1, "setresgid failed"); 2074 2075 if (setresuid(u, u, u) != 0) 2076 err(1, "setresuid failed"); 2077 2078 verbose("Dropping privileges completed\n"); 2079 } 2080 2081 /* Finally, run the Guest. This doesn't return. */ 2082 run_guest(); 2083 } 2084 /*:*/ 2085 2086 /*M:999 2087 * Mastery is done: you now know everything I do. 2088 * 2089 * But surely you have seen code, features and bugs in your wanderings which 2090 * you now yearn to attack? That is the real game, and I look forward to you 2091 * patching and forking lguest into the Your-Name-Here-visor. 2092 * 2093 * Farewell, and good coding! 2094 * Rusty Russell. 2095 */