xv6

port of xv6 to x86-64
git clone http://frotz.net/git/xv6.git
Log | Files | Refs | README | LICENSE

README.CLS (4738B)


      1 
      2 CPU LOCAL STORAGE
      3 
      4 Basically a similar concept to thread local storage, but in Xv6's case
      5 these are per-cpu, not per-thread.  
      6 
      7 GLOBAL REGISTER VARIABLES
      8 -------------------------
      9 
     10 Xv6:32 uses this gcc trick to generate GS: relative access to a few
     11 globals for cpu-local-storage:
     12 
     13   extern struct cpu *cpu asm("%gs:0");
     14 
     15 Sadly this does not work on x86-64: gcc instead generates a pc-relative
     16 load, and various unhappiness results.  In this case and the other
     17 options I explored, I took a look at a chunk of code generated by
     18 a common expression using a structure from cpu local storage:
     19  
     20   if (proc->killed) ...
     21 
     22 with asm("%gs:4") on i386
     23 
     24 :       65 a1 04 00 00 00       mov    %gs:0x4,%eax
     25 :       8b 40 24                mov    0x24(%eax),%eax
     26 :       85 c0                   test   %eax,%eax
     27 
     28 with asm("%gs:8") on x86-64
     29 
     30 :       65 48 8b 05 04 00 00    mov    %gs:0x8(%rip),%rax
     31 :       00 
     32 :       8b 40 50                mov    0x50(%rax),%eax
     33 :       85 c0                   test   %eax,%eax
     34 
     35 This results in rax = [ gs + rip + 8 ] which is never what we want...
     36 
     37 With -O1, in both cases the mov and test are combined into something like
     38 
     39 :       65 a1 04 00 00 00       mov    %gs:0x4,%eax
     40 :       83 78 24 00             cmpl   $0x0,0x24(%eax)
     41 
     42 
     43 __THREAD MODIFIER
     44 -----------------
     45 
     46 gcc supports a construct for thread-local variables:
     47 
     48   extern __thread struct cpu *cpu;
     49 
     50 with __thread and -mtls-direct-seg-refs on x86-64
     51 
     52 :       48 c7 c0 f8 ff ff ff    mov    $0xfffffffffffffff8,%rax
     53 :       64 48 8b 00             mov    %fs:(%rax),%rax
     54 :       8b 40 50                mov    0x50(%rax),%eax
     55 :       85 c0                   test   %eax,%eax
     56 
     57 with __thread and -mtls-direct-seg-refs on i386
     58 
     59 :       b8 fc ff ff ff          mov    $0xfffffffc,%eax
     60 :       65 8b 00                mov    %gs:(%eax),%eax
     61 :       8b 40 24                mov    0x24(%eax),%eax
     62 :       85 c0                   test   %eax,%eax
     63 
     64 The choice of segment (fs or gs) seems to be baked into gcc and
     65 is chosen based on 32bit or 64bit compilation mode.
     66 
     67 The linker generates a TLS section:
     68 
     69   Type           Offset             VirtAddr           PhysAddr
     70                  FileSiz            MemSiz              Flags  Align
     71   LOAD           0x0000000000001000 0xffffffff80100000 0x0000000000100000
     72                  0x000000000000da00 0x0000000000016778  RWE    1000
     73   TLS            0x000000000000ea00 0xffffffff8010da00 0x000000000010da00
     74                  0x0000000000000000 0x0000000000000010  R      8
     75 and TLS symbols:
     76 
     77    168: 0000000000000008     8 TLS     GLOBAL DEFAULT    5 proc
     78    233: 0000000000000000     8 TLS     GLOBAL DEFAULT    5 cpu
     79 
     80 In this model I just point fs (or gs) at the top of a page of local
     81 storage space I allocate (since I only have a handful of local items
     82 to track).
     83 
     84 These are a bit less convenient because of the negative indexing and
     85 the fact that you're at the compiler and linker's whim for where things
     86 end up.  Also they require longer (and probably slower) instruction
     87 sequences to allow the local index to be patched up by the linker.
     88 
     89 Lack of control over which segment register is used is a further
     90 downside.
     91 
     92 
     93 MACROS AND INLINE ASSEMBLY
     94 --------------------------
     95 
     96 #define __local_get(n) ({ \
     97   uint64 res; \
     98   asm ("mov %%gs:" #n ",%0" : "=r" (res)); \
     99   res; \
    100 })
    101 
    102 #define __local_put(n, v) ({ \
    103   uint64 val = v; \
    104   asm ("mov %0, %%gs:" #n : : "r" (val)); \
    105 })
    106 
    107 #define __proc() ((struct proc*) __local_get(4))
    108 
    109   if (__proc()->killed) ...
    110 
    111 x86-64 without optimization:
    112 
    113 :       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
    114 :       00 00 
    115 :       48 89 45 d0             mov    %rax,-0x30(%rbp)
    116 :       48 8b 45 d0             mov    -0x30(%rbp),%rax
    117 :       8b 40 50                mov    0x50(%rax),%eax
    118 :       85 c0                   test   %eax,%eax
    119 
    120 x86-64 with -O1:
    121 
    122 :       65 48 8b 04 25 08 00    mov    %gs:0x8,%rax
    123 :       00 00 
    124 :       83 78 50 00             cmpl   $0x0,0x50(%rax)
    125 
    126 i386 without optimization:
    127 
    128 :       65 8b 1d 04 00 00 00    mov    %gs:0x4,%ebx
    129 :       89 5d f4                mov    %ebx,-0xc(%ebp)
    130 :       8b 45 f4                mov    -0xc(%ebp),%eax
    131 :       8b 40 24                mov    0x24(%eax),%eax
    132 :       85 c0                   test   %eax,%eax
    133 
    134 i386 with -O1:
    135 
    136 :       65 a1 04 00 00 00       mov    %gs:0x4,%eax
    137 :       83 78 24 00             cmpl   $0x0,0x24(%eax)
    138 
    139 These are less efficient than the others when compiling unoptimized
    140 (though that's an unusual state), but they cost no more than the
    141 global register variable trick originally used and have the benefit
    142 of generating correct code for both 32 and 64 bit modes.
    143 
    144 They do have the downside that you can't use one construct
    145 for both setting and getting the contents of a local storage
    146 variable.