commit 916fd4062da5cd25d5fe2b8922895de54714e84d
parent 87e4d9e6ab49713018d74732bb248f99078fbfb1
Author: Brian Swetland <swetland@frotz.net>
Date: Mon, 30 Dec 2013 06:02:33 -0800
Notes on the adventures of cpu-local-storage on x86-64
Diffstat:
 M README.64BIT |   2 ++
 A README.CLS   | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 0 deletions(-)
diff --git a/README.64BIT b/README.64BIT
@@ -61,6 +61,8 @@ LESSONS LEARNED / UNRESOLVED
* Xv6:32 creates kernel mappings (separate copies) in every process's
page table, roughly 70k pages worth, consuming about 280MB of ram
for page tables.
+ * Xv6:32's trick for cpu-local storage does not work with gcc on x86-64
+ - see README.CLS for an exploration of alternate options
IMPLEMENTATION NOTES
* use "kernel" addressing mode for kernel compilation
diff --git a/README.CLS b/README.CLS
@@ -0,0 +1,146 @@
+
+CPU LOCAL STORAGE
+
+Basically a similar concept to thread local storage, but in Xv6's case
+these are per-cpu, not per-thread.
+
+GLOBAL REGISTER VARIABLES
+-------------------------
+
+Xv6:32 uses this gcc trick to generate GS: relative access to a few
+globals for cpu-local-storage:
+
+ extern struct cpu *cpu asm("%gs:0");
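+
+Xv6:32 pairs this with a second declaration for the current process
+pointer, which is the one dereferenced in the examples below:
+
+ extern struct proc *proc asm("%gs:4");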
+
+Sadly this does not work on x86-64: gcc instead generates a
+pc-relative load and various unhappiness results. For this and each
+of the other options I explored, I looked at the code generated for
+a common expression that dereferences a structure held in cpu local
+storage:
+
+ if (proc->killed) ...
+
+with asm("%gs:4") on i386
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+with asm("%gs:8") on x86-64
+
+: 65 48 8b 05 08 00 00  mov %gs:0x8(%rip),%rax
+: 00
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+This results in rax = [ gs_base + rip + 8 ] (the segment override is
+applied on top of the pc-relative address, not instead of it), which
+is never what we want...
+
+With -O1, in both cases the mov and test are combined into something like
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 83 78 24 00 cmpl $0x0,0x24(%eax)
+
+
+__THREAD MODIFIER
+-----------------
+
+gcc supports a construct for thread-local variables:
+
+ extern __thread struct cpu *cpu;
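+
+and likewise for the proc pointer used in the examples (both symbols
+show up in the TLS symbol table below):
+
+ extern __thread struct proc *proc;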
+
+with __thread and -mtls-direct-seg-refs on x86-64
+
+: 48 c7 c0 f8 ff ff ff mov $0xfffffffffffffff8,%rax
+: 64 48 8b 00 mov %fs:(%rax),%rax
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+with __thread and -mtls-direct-seg-refs on i386
+
+: b8 fc ff ff ff mov $0xfffffffc,%eax
+: 65 8b 00 mov %gs:(%eax),%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+The choice of segment register appears to be baked into gcc: it uses
+%gs in 32bit compilation mode and %fs in 64bit mode.
+
+The linker generates a TLS program header:
+
+ Type Offset VirtAddr PhysAddr
+ FileSiz MemSiz Flags Align
+ LOAD 0x0000000000001000 0xffffffff80100000 0x0000000000100000
+ 0x000000000000da00 0x0000000000016778 RWE 1000
+ TLS 0x000000000000ea00 0xffffffff8010da00 0x000000000010da00
+ 0x0000000000000000 0x0000000000000010 R 8
+
+and TLS symbols:
+
+ 168: 0000000000000008 8 TLS GLOBAL DEFAULT 5 proc
+ 233: 0000000000000000 8 TLS GLOBAL DEFAULT 5 cpu
+
+In this model I just point fs (or gs) at the top of a page of local
+storage space I allocate (since I only have a handful of local items
+to track).
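+
+As a sketch of that setup on x86-64 (cls_init, PGSIZE, and the uint
+typedefs here are illustrative, not from the Xv6 source): the base
+of %fs or %gs is set by writing the IA32_FS_BASE (0xC0000100) or
+IA32_GS_BASE (0xC0000101) MSR, and since the TLS offsets are negative
+it has to point at the end of the block of storage:
+
+ #define MSR_FS_BASE 0xC0000100
+
+ static inline void wrmsr(uint32 msr, uint64 val) {
+     // wrmsr takes the msr index in ecx and the value in edx:eax
+     asm volatile("wrmsr" : : "c" (msr),
+                  "a" ((uint32) val), "d" ((uint32) (val >> 32)));
+ }
+
+ void cls_init(void *page) {
+     // negative TLS offsets index backwards from the segment base,
+     // so point it just past the page of cpu-local storage
+     wrmsr(MSR_FS_BASE, (uint64) page + PGSIZE);
+ }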
+
+The __thread variables are a bit less convenient because of the
+negative indexing (each one lands at its offset within the TLS
+section minus the section size, hence the -8 and -4 above) and the
+fact that you're at the compiler and linker's whim for where things
+end up. Also they require longer (and probably slower) instruction
+sequences so the local offset can be patched up by the linker.
+
+Lack of control over which segment register is used is a further
+downside.
+
+
+MACROS AND INLINE ASSEMBLY
+--------------------------
+
+/* read the cpu-local slot at byte offset n (n must be a literal) */
+#define __local_get(n) ({ \
+ uint64 res; \
+ asm ("mov %%gs:" #n ",%0" : "=r" (res)); \
+ res; \
+})
+
+/* write the cpu-local slot at byte offset n */
+#define __local_put(n, v) ({ \
+ uint64 val = v; \
+ asm ("mov %0, %%gs:" #n : : "r" (val)); \
+})
+
+(uint64 matches the 64bit build shown below; a 32bit build wants a
+pointer-sized 32bit type here.)
+
+#define __proc() ((struct proc*) __local_get(8)) /* slot 4 on i386 */
+
+ if (__proc()->killed) ...
+
+x86-64 without optimization:
+
+: 65 48 8b 04 25 08 00  mov %gs:0x8,%rax
+: 00 00
+: 48 89 45 d0 mov %rax,-0x30(%rbp)
+: 48 8b 45 d0 mov -0x30(%rbp),%rax
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+x86-64 with -O1:
+
+: 65 48 8b 04 25 08 00  mov %gs:0x8,%rax
+: 00 00
+: 83 78 50 00 cmpl $0x0,0x50(%rax)
+
+i386 without optimization:
+
+: 65 8b 1d 04 00 00 00 mov %gs:0x4,%ebx
+: 89 5d f4 mov %ebx,-0xc(%ebp)
+: 8b 45 f4 mov -0xc(%ebp),%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+i386 with -O1:
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 83 78 24 00 cmpl $0x0,0x24(%eax)
+
+These are less efficient than the others when compiling unoptimized
+(though that's an unusual state), but they cost no more than the
+global register variable trick originally used and have the benefit
+of generating correct code for both 32 and 64 bit modes.
+
+They do have the downside that you can't use one construct
+for both setting and getting the contents of a local storage
+variable.
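+
+One way to paper over that asymmetry (a sketch; __set_proc is my
+name, not something from the Xv6 source) is a getter/setter pair
+per variable:
+
+#define __set_proc(p) __local_put(8, (uint64) (p))
+
+ __set_proc(p);
+ if (__proc()->killed) ...
+
+versus the single name that the global register variable trick let
+you use for both reads and writes.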