diff --git a/Makefile b/Makefile
index 6d89f421076fb50f2d663cb538d902d08f67b837..5943a7ee15dc4cd12b2e446abbc1459ca5259789 100644
--- a/Makefile
+++ b/Makefile
@@ -23,6 +23,7 @@ OBJS = \
 	timer.o\
 	trapasm.o\
 	trap.o\
+	uart.o\
 	vectors.o\
 
 # Cross-compiling (e.g., on Mac OS X)
@@ -139,6 +140,9 @@ bochs : fs.img xv6.img
 qemu: fs.img xv6.img
 	qemu -parallel stdio -hdb fs.img xv6.img
 
+qemutty: fs.img xv6.img
+	qemu -nographic -smp 2 -hdb fs.img xv6.img
+
 # CUT HERE
 # prepare dist for students
 # after running make dist, probably want to
diff --git a/defs.h b/defs.h
index c3543715bb55016ce6dd20de6e05b73e2a98f9fd..fe863cd0f8e374677db5196bac82334c2caa0cfe 100644
--- a/defs.h
+++ b/defs.h
@@ -73,6 +73,7 @@ extern volatile uint*    lapic;
 void            lapiceoi(void);
 void            lapicinit(int);
 void            lapicstartap(uchar, uint);
+void            microdelay(int);
 
 // mp.c
 extern int      ismp;
@@ -92,14 +93,14 @@ int             pipewrite(struct pipe*, char*, int);
 
 // proc.c
 struct proc*    copyproc(struct proc*);
-struct proc*    curproc(void);
 void            exit(void);
 int             growproc(int);
 int             kill(int);
 void            pinit(void);
 void            procdump(void);
 void            scheduler(void) __attribute__((noreturn));
-void            setupsegs(struct proc*);
+void            ksegment(void);
+void            usegment(void);
 void            sleep(void*, struct spinlock*);
 void            userinit(void);
 int             wait(void);
@@ -144,6 +145,12 @@ extern int      ticks;
 void            tvinit(void);
 extern struct spinlock tickslock;
 
+// uart.c
+void		uartinit(void);
+void		uartintr(void);
+void		uartputc(int);
+
+
 // number of elements in fixed-size array
 #define NELEM(x) (sizeof(x)/sizeof((x)[0]))
 
diff --git a/exec.c b/exec.c
index e5d6fffa1bb92d907ce52f41fba68e7d540bef53..98c5d4ceaaca6616ab6ecec3120ffb591a06fdbb 100644
--- a/exec.c
+++ b/exec.c
@@ -104,7 +104,7 @@ exec(char *path, char **argv)
   cp->sz = sz;
   cp->tf->eip = elf.entry;  // main
   cp->tf->esp = sp;
-  setupsegs(cp);
+  usegment();
   return 0;
 
  bad:
diff --git a/lapic.c b/lapic.c
index 915765a08d690527df472095f9bd75716ce06604..f64f54b8cccb49c90947e2567bd0c39d7151d31a 100644
--- a/lapic.c
+++ b/lapic.c
@@ -121,7 +121,7 @@ lapiceoi(void)
 
 // Spin for a given number of microseconds.
 // On real hardware would want to tune this dynamically.
-static void
+void
 microdelay(int us)
 {
   volatile int j = 0;
diff --git a/main.c b/main.c
index f4914dbf1596a17647191db9fd2321090cfa1fa8..18e17900016784882126bb43c0417fef9de3cf88 100644
--- a/main.c
+++ b/main.c
@@ -5,6 +5,9 @@
 #include "proc.h"
 #include "x86.h"
 
+__thread struct cpu *c;       // This cpu; assigned in ksegment().
+__thread struct proc *cp;     // Current process on this cpu; 0 when none (set by ksegment/scheduler).
+
 static void bootothers(void);
 static void mpmain(void) __attribute__((noreturn));
 
@@ -14,20 +17,22 @@ main(void)
 {
   mpinit(); // collect info about this machine
   lapicinit(mpbcpu());
+  ksegment();
+  picinit();       // interrupt controller
+  ioapicinit();    // another interrupt controller
+  consoleinit();   // I/O devices & their interrupts
+  uartinit();      // serial port
   cprintf("\ncpu%d: starting xv6\n\n", cpu());
 
-  pinit();         // process table
-  binit();         // buffer cache
-  picinit();      // interrupt controller
-  ioapicinit();   // another interrupt controller
   kinit();         // physical memory allocator
+  pinit();         // process table
   tvinit();        // trap vectors
+  binit();         // buffer cache
   fileinit();      // file table
   iinit();         // inode cache
-  consoleinit();  // I/O devices & their interrupts
-  ideinit();      // disk
+  ideinit();       // disk
   if(!ismp)
-    timerinit();  // uniprocessor timer
+    timerinit();   // uniprocessor timer
   userinit();      // first user process
   bootothers();    // start other processors
 
@@ -40,12 +45,12 @@ main(void)
 static void
 mpmain(void)
 {
-  cprintf("cpu%d: mpmain\n", cpu());
-  idtinit();
   if(cpu() != mpbcpu())
     lapicinit(cpu());
-  setupsegs(0);
-  xchg(&cpus[cpu()].booted, 1);
+  ksegment();
+  cprintf("cpu%d: mpmain\n", cpu());
+  idtinit();
+  xchg(&c->booted, 1);
 
   cprintf("cpu%d: scheduling\n", cpu());
   scheduler();
diff --git a/proc.c b/proc.c
index 28365a23591379c848eb297091549526aeb4e345..20f4be978c70c133155d971c0ec814410dbb8a18 100644
--- a/proc.c
+++ b/proc.c
@@ -36,16 +36,31 @@ allocproc(void)
     if(p->state == UNUSED){
       p->state = EMBRYO;
       p->pid = nextpid++;
-      release(&proc_table_lock);
-      return p;
+      goto found;
     }
   }
   release(&proc_table_lock);
   return 0;
+
+found:
+  release(&proc_table_lock);
+
+  // Allocate kernel stack; on failure, mark the slot UNUSED again and return 0.
+  if((p->kstack = kalloc(KSTACKSIZE)) == 0){
+    p->state = UNUSED;
+    return 0;
+  }
+  p->tf = (struct trapframe*)(p->kstack + KSTACKSIZE) - 1;
+
+  // Set up new context to start executing at forkret (see below).
+  p->context = (struct context *)p->tf - 1;
+  memset(p->context, 0, sizeof(*p->context));
+  p->context->eip = (uint)forkret;
+  return p;
 }
 
 // Grow current process's memory by n bytes.
-// Return old size on success, -1 on failure.
+// Return 0 on success, -1 on failure.
 int
 growproc(int n)
 {
@@ -59,37 +74,53 @@ growproc(int n)
   kfree(cp->mem, cp->sz);
   cp->mem = newmem;
   cp->sz += n;
-  setupsegs(cp);
-  return cp->sz - n;
+  usegment();
+  return 0;
 }
 
-// Set up CPU's segment descriptors and task state for a given process.
-// If p==0, set up for "idle" state for when scheduler() is running.
+// Set up this CPU's kernel segment descriptors, load its GDT, and initialize the cpu-local variables c and cp.
 void
-setupsegs(struct proc *p)
+ksegment(void)
 {
-  struct cpu *c;
+  struct cpu *c1;
+
+  c1 = &cpus[cpu()];
+  c1->gdt[0] = SEG_NULL;
+  c1->gdt[SEG_KCODE] = SEG(STA_X|STA_R, 0, 0x100000 + 64*1024-1, 0);
+  c1->gdt[SEG_KDATA] = SEG(STA_W, 0, 0xffffffff, 0);
+  c1->gdt[SEG_KCPU] = SEG(STA_W, (uint)&c1->tls+sizeof(c1->tls), 0xffffffff, 0);  // base = end of c1->tls
+  c1->gdt[SEG_UCODE] = SEG_NULL;
+  c1->gdt[SEG_UDATA] = SEG_NULL;
+  c1->gdt[SEG_TSS] = SEG_NULL;
+  lgdt(c1->gdt, sizeof(c1->gdt));
   
+  // Load %gs first: c and cp are __thread variables that gcc accesses through the gs segment.
+  setgs(SEG_KCPU << 3);
+  c = c1;
+  cp = 0;
+}
+
+// Set up CPU's segment descriptors and task state for the current process.
+// If cp==0, set up for "idle" state for when scheduler() is running.
+void
+usegment(void)
+{
   pushcli();
-  c = &cpus[cpu()];
   c->ts.ss0 = SEG_KDATA << 3;
-  if(p)
-    c->ts.esp0 = (uint)(p->kstack + KSTACKSIZE);
+  if(cp)
+    c->ts.esp0 = (uint)(cp->kstack + KSTACKSIZE);
   else
     c->ts.esp0 = 0xffffffff;
 
-  c->gdt[0] = SEG_NULL;
-  c->gdt[SEG_KCODE] = SEG(STA_X|STA_R, 0, 0x100000 + 64*1024-1, 0);
-  c->gdt[SEG_KDATA] = SEG(STA_W, 0, 0xffffffff, 0);
-  c->gdt[SEG_TSS] = SEG16(STS_T32A, (uint)&c->ts, sizeof(c->ts)-1, 0);
-  c->gdt[SEG_TSS].s = 0;
-  if(p){
-    c->gdt[SEG_UCODE] = SEG(STA_X|STA_R, (uint)p->mem, p->sz-1, DPL_USER);
-    c->gdt[SEG_UDATA] = SEG(STA_W, (uint)p->mem, p->sz-1, DPL_USER);
+  if(cp){
+    c->gdt[SEG_UCODE] = SEG(STA_X|STA_R, (uint)cp->mem, cp->sz-1, DPL_USER);
+    c->gdt[SEG_UDATA] = SEG(STA_W, (uint)cp->mem, cp->sz-1, DPL_USER);
   } else {
     c->gdt[SEG_UCODE] = SEG_NULL;
     c->gdt[SEG_UDATA] = SEG_NULL;
   }
+  c->gdt[SEG_TSS] = SEG16(STS_T32A, (uint)&c->ts, sizeof(c->ts)-1, 0);
+  c->gdt[SEG_TSS].s = 0;
 
   lgdt(c->gdt, sizeof(c->gdt));
   ltr(SEG_TSS << 3);
@@ -109,40 +140,23 @@ copyproc(struct proc *p)
   if((np = allocproc()) == 0)
     return 0;
 
-  // Allocate kernel stack.
-  if((np->kstack = kalloc(KSTACKSIZE)) == 0){
+  // Copy process state from p.
+  np->sz = p->sz;
+  if((np->mem = kalloc(np->sz)) == 0){
+    kfree(np->kstack, KSTACKSIZE);
+    np->kstack = 0;
     np->state = UNUSED;
     return 0;
   }
-  np->tf = (struct trapframe*)(np->kstack + KSTACKSIZE) - 1;
+  memmove(np->mem, p->mem, np->sz);
+  np->parent = p;
+  *np->tf = *p->tf;
 
-  if(p){  // Copy process state from p.
-    np->parent = p;
-    memmove(np->tf, p->tf, sizeof(*np->tf));
-  
-    np->sz = p->sz;
-    if((np->mem = kalloc(np->sz)) == 0){
-      kfree(np->kstack, KSTACKSIZE);
-      np->kstack = 0;
-      np->state = UNUSED;
-      np->parent = 0;
-      return 0;
-    }
-    memmove(np->mem, p->mem, np->sz);
+  for(i = 0; i < NOFILE; i++)
+    if(p->ofile[i])
+      np->ofile[i] = filedup(p->ofile[i]);
+  np->cwd = idup(p->cwd);
 
-    for(i = 0; i < NOFILE; i++)
-      if(p->ofile[i])
-        np->ofile[i] = filedup(p->ofile[i]);
-    np->cwd = idup(p->cwd);
-  }
-
-  // Set up new context to start executing at forkret (see below).
-  np->context = (struct context *)np->tf - 1;
-  memset(np->context, 0, sizeof(*np->context));
-  np->context->eip = (uint)forkret;
-
-  // Clear %eax so that fork system call returns 0 in child.
-  np->tf->eax = 0;
   return np;
 }
 
@@ -153,10 +167,14 @@ userinit(void)
   struct proc *p;
   extern uchar _binary_initcode_start[], _binary_initcode_size[];
   
-  p = copyproc(0);
+  p = allocproc();
+  initproc = p;
+
+  // Initialize memory from initcode.S
   p->sz = PAGE;
   p->mem = kalloc(p->sz);
-  p->cwd = namei("/");
+  memmove(p->mem, _binary_initcode_start, (int)_binary_initcode_size);
+
   memset(p->tf, 0, sizeof(*p->tf));
   p->tf->cs = (SEG_UCODE << 3) | DPL_USER;
   p->tf->ds = (SEG_UDATA << 3) | DPL_USER;
@@ -164,30 +182,12 @@ userinit(void)
   p->tf->ss = p->tf->ds;
   p->tf->eflags = FL_IF;
   p->tf->esp = p->sz;
-  
-  // Make return address readable; needed for some gcc.
-  p->tf->esp -= 4;
-  *(uint*)(p->mem + p->tf->esp) = 0xefefefef;
+  p->tf->eip = 0;  // beginning of initcode.S
 
-  // On entry to user space, start executing at beginning of initcode.S.
-  p->tf->eip = 0;
-  memmove(p->mem, _binary_initcode_start, (int)_binary_initcode_size);
   safestrcpy(p->name, "initcode", sizeof(p->name));
-  p->state = RUNNABLE;
-  
-  initproc = p;
-}
-
-// Return currently running process.
-struct proc*
-curproc(void)
-{
-  struct proc *p;
+  p->cwd = namei("/");
 
-  pushcli();
-  p = cpus[cpu()].curproc;
-  popcli();
-  return p;
+  p->state = RUNNABLE;
 }
 
 //PAGEBREAK: 42
@@ -202,10 +202,8 @@ void
 scheduler(void)
 {
   struct proc *p;
-  struct cpu *c;
   int i;
 
-  c = &cpus[cpu()];
   for(;;){
     // Enable interrupts on this processor, in lieu of saving intena.
     sti();
@@ -220,15 +218,15 @@ scheduler(void)
       // Switch to chosen process.  It is the process's job
       // to release proc_table_lock and then reacquire it
       // before jumping back to us.
-      c->curproc = p;
-      setupsegs(p);
+      cp = p;
+      usegment();
       p->state = RUNNING;
       swtch(&c->context, &p->context);
 
       // Process is done running for now.
       // It should have changed its p->state before coming back.
-      c->curproc = 0;
-      setupsegs(0);
+      cp = 0;
+      usegment();
     }
     release(&proc_table_lock);
 
@@ -236,7 +234,7 @@ scheduler(void)
 }
 
 // Enter scheduler.  Must already hold proc_table_lock
-// and have changed curproc[cpu()]->state.
+// and have changed cp->state.
 void
 sched(void)
 {
@@ -248,12 +246,12 @@ sched(void)
     panic("sched running");
   if(!holding(&proc_table_lock))
     panic("sched proc_table_lock");
-  if(cpus[cpu()].ncli != 1)
+  if(c->ncli != 1)
     panic("sched locks");
 
-  intena = cpus[cpu()].intena;
-  swtch(&cp->context, &cpus[cpu()].context);
-  cpus[cpu()].intena = intena;
+  intena = c->intena;
+  swtch(&cp->context, &c->context);
+  c->intena = intena;
 }
 
 // Give up the CPU for one scheduling round.
@@ -421,6 +419,7 @@ wait(void)
       if(p->state == UNUSED)
         continue;
       if(p->parent == cp){
+        havekids = 1;
         if(p->state == ZOMBIE){
           // Found one.
           kfree(p->mem, p->sz);
@@ -433,7 +432,6 @@ wait(void)
           release(&proc_table_lock);
           return pid;
         }
-        havekids = 1;
       }
     }
 
diff --git a/proc.h b/proc.h
index 7269b66ef04472e3edf80b879c9e9bc509de5632..eebfa236772f1a2ba799daa1ce0c77cb4cebf5d9 100644
--- a/proc.h
+++ b/proc.h
@@ -1,17 +1,21 @@
-// Segments in proc->gdt
+// Segments in cpu->gdt.
+// Also known to bootasm.S and trapasm.S
 #define SEG_KCODE 1  // kernel code
 #define SEG_KDATA 2  // kernel data+stack
-#define SEG_UCODE 3
-#define SEG_UDATA 4
-#define SEG_TSS   5  // this process's task state
-#define NSEGS     6
+#define SEG_KCPU  3  // kernel per-cpu data
+#define SEG_UCODE 4
+#define SEG_UDATA 5
+#define SEG_TSS   6  // this process's task state
+#define NSEGS     7
 
 // Saved registers for kernel context switches.
 // Don't need to save all the segment registers (%cs, etc),
 // because they are constant across kernel contexts.
-// Stack pointer is encoded in the address of context,
-// which must be placed at the bottom of the stack.
-// The layout of context must match code in swtch.S.
+// Don't need to save %eax, %ecx, %edx, because the
+// x86 convention is that the caller has saved them.
+// Contexts are stored at the bottom of the stack they
+// describe; the stack pointer is the address of the context.
+// The layout of the context must match the code in swtch.S.
 struct context {
   uint edi;
   uint esi;
@@ -30,12 +34,12 @@ struct proc {
   enum proc_state state;       // Process state
   int pid;                     // Process ID
   struct proc *parent;         // Parent process
+  struct trapframe *tf;        // Trap frame for current syscall
+  struct context *context;     // Switch here to run process
   void *chan;                  // If non-zero, sleeping on chan
   int killed;                  // If non-zero, have been killed
   struct file *ofile[NOFILE];  // Open files
   struct inode *cwd;           // Current directory
-  struct context *context;     // Switch here to run process
-  struct trapframe *tf;        // Trap frame for current syscall
   char name[16];               // Process name (debugging)
 };
 
@@ -48,18 +52,23 @@ struct proc {
 // Per-CPU state
 struct cpu {
   uchar apicid;                // Local APIC ID
-  struct proc *curproc;        // Process currently running.
   struct context *context;     // Switch here to enter scheduler
   struct taskstate ts;         // Used by x86 to find stack for interrupt
   struct segdesc gdt[NSEGS];   // x86 global descriptor table
   volatile uint booted;        // Has the CPU started?
   int ncli;                    // Depth of pushcli nesting.
-  int intena;                  // Were interrupts enabled before pushcli? 
+  int intena;                  // Were interrupts enabled before pushcli?
+  void *tls[2];                // Per-cpu storage; ksegment() bases SEG_KCPU at its end (presumably backs c, cp)
 };
 
 extern struct cpu cpus[NCPU];
 extern int ncpu;
 
-// "cp" is a short alias for curproc().
-// It gets used enough to make this worthwhile.
-#define cp curproc()
+// Per-CPU variables, holding pointers to the
+// current cpu and to the current process.
+// The __thread prefix tells gcc to refer to them in the segment
+// pointed at by gs; the name __thread derives from the use
+// of the same mechanism to provide per-thread storage in
+// multithreaded user programs.
+extern __thread struct cpu *c;       // This cpu.
+extern __thread struct proc *cp;     // Current process on this cpu.
diff --git a/spinlock.c b/spinlock.c
index 3784b2433e56b78346e2d4b2290d291945798fe4..d6e952d29ff6c9c065112f48066fdbda0ee050a9 100644
--- a/spinlock.c
+++ b/spinlock.c
@@ -102,8 +102,8 @@ pushcli(void)
   
   eflags = readeflags();
   cli();
-  if(cpus[cpu()].ncli++ == 0)
-    cpus[cpu()].intena = eflags & FL_IF;
+  if(c->ncli++ == 0)
+    c->intena = eflags & FL_IF;
 }
 
 void
@@ -111,9 +111,9 @@ popcli(void)
 {
   if(readeflags()&FL_IF)
     panic("popcli - interruptible");
-  if(--cpus[cpu()].ncli < 0)
+  if(--c->ncli < 0)
     panic("popcli");
-  if(cpus[cpu()].ncli == 0 && cpus[cpu()].intena)
+  if(c->ncli == 0 && c->intena)
     sti();
 }
 
diff --git a/x86.h b/x86.h
index 80d7487c578ae5d82a4400c723bcfa5d98d060b1..ecb5d2a8fce5821c93fc10ba220ce4c0316881c1 100644
--- a/x86.h
+++ b/x86.h
@@ -103,6 +103,12 @@ xchg(volatile uint *addr, uint newval)
   return result;
 }
 
+static inline void
+setgs(ushort gs)
+{
+  asm volatile("movw %0, %%gs" : : "r" (gs));  // Load selector gs into the %gs segment register.
+}
+
 static inline void
 cli(void)
 {