commit 916fd4062da5cd25d5fe2b8922895de54714e84d
parent 87e4d9e6ab49713018d74732bb248f99078fbfb1
Author: Brian Swetland <swetland@frotz.net>
Date: Mon, 30 Dec 2013 06:02:33 -0800
Notes on the adventures of cpu-local-storage on x86-64
Diffstat:
 M README.64BIT |   2 ++
 A README.CLS   | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 148 insertions(+), 0 deletions(-)
diff --git a/README.64BIT b/README.64BIT
@@ -61,6 +61,8 @@ LESSONS LEARNED / UNRESOLVED
* Xv6:32 creates kernel mappings (separate copies) in every process's
page table, roughly 70k pages worth, consuming about 280MB of ram
for page tables.
+ * Xv6:32's trick for cpu-local storage does not work with gcc on x86-64
+ - see README.CLS for an exploration of alternate options
IMPLEMENTATION NOTES
* use "kernel" addressing mode for kernel compilation
diff --git a/README.CLS b/README.CLS
@@ -0,0 +1,146 @@
+
+CPU LOCAL STORAGE
+
+Basically a similar concept to thread local storage, but in Xv6's case
+these are per-cpu, not per-thread.
+
+GLOBAL REGISTER VARIABLES
+-------------------------
+
+Xv6:32 uses this gcc trick to generate GS: relative access to a few
+globals for cpu-local-storage:
+
+ extern struct cpu *cpu asm("%gs:0");
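+
+Xv6:32 pairs this with a second declaration for the current process
+pointer, which is the one dereferenced in the examples below:
+
+ extern struct proc *proc asm("%gs:4");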
+
+Sadly this does not work on x86-64: gcc instead generates a
+pc-relative load and various unhappiness results. For this and each
+of the other options I explored, I looked at the code generated for
+a common expression that dereferences a structure held in cpu local
+storage:
+
+ if (proc->killed) ...
+
+with asm("%gs:4") on i386
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+with asm("%gs:8") on x86-64
+
+: 65 48 8b 05 08 00 00  mov %gs:0x8(%rip),%rax
+: 00
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+This results in rax = [ gs_base + rip + 8 ] (the segment override is
+applied on top of the pc-relative address, not instead of it), which
+is never what we want...
+
+With -O1, in both cases the mov and test are combined into something like
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 83 78 24 00 cmpl $0x0,0x24(%eax)
+
+
+__THREAD MODIFIER
+-----------------
+
+gcc supports a construct for thread-local variables:
+
+ extern __thread struct cpu *cpu;
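+
+and likewise for the proc pointer used in the examples (both symbols
+show up in the TLS symbol table below):
+
+ extern __thread struct proc *proc;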
+
+with __thread and -mtls-direct-seg-refs on x86-64
+
+: 48 c7 c0 f8 ff ff ff mov $0xfffffffffffffff8,%rax
+: 64 48 8b 00 mov %fs:(%rax),%rax
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+with __thread and -mtls-direct-seg-refs on i386
+
+: b8 fc ff ff ff mov $0xfffffffc,%eax
+: 65 8b 00 mov %gs:(%eax),%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+The choice of segment register appears to be baked into gcc: it uses
+%gs in 32bit compilation mode and %fs in 64bit mode.
+
+The linker generates a TLS program header:
+
+ Type Offset VirtAddr PhysAddr
+ FileSiz MemSiz Flags Align
+ LOAD 0x0000000000001000 0xffffffff80100000 0x0000000000100000
+ 0x000000000000da00 0x0000000000016778 RWE 1000
+ TLS 0x000000000000ea00 0xffffffff8010da00 0x000000000010da00
+ 0x0000000000000000 0x0000000000000010 R 8
+
+and TLS symbols:
+
+ 168: 0000000000000008 8 TLS GLOBAL DEFAULT 5 proc
+ 233: 0000000000000000 8 TLS GLOBAL DEFAULT 5 cpu
+
+In this model I just point fs (or gs) at the top of a page of local
+storage space I allocate (since I only have a handful of local items
+to track).
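+
+As a sketch of that setup on x86-64 (cls_init, PGSIZE, and the uint
+typedefs here are illustrative, not from the Xv6 source): the base
+of %fs or %gs is set by writing the IA32_FS_BASE (0xC0000100) or
+IA32_GS_BASE (0xC0000101) MSR, and since the TLS offsets are negative
+it has to point at the end of the block of storage:
+
+ #define MSR_FS_BASE 0xC0000100
+
+ static inline void wrmsr(uint32 msr, uint64 val) {
+     // wrmsr takes the msr index in ecx and the value in edx:eax
+     asm volatile("wrmsr" : : "c" (msr),
+                  "a" ((uint32) val), "d" ((uint32) (val >> 32)));
+ }
+
+ void cls_init(void *page) {
+     // negative TLS offsets index backwards from the segment base,
+     // so point it just past the page of cpu-local storage
+     wrmsr(MSR_FS_BASE, (uint64) page + PGSIZE);
+ }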
+
+The __thread variables are a bit less convenient because of the
+negative indexing (each one lands at its offset within the TLS
+section minus the section size, hence the -8 and -4 above) and the
+fact that you're at the compiler and linker's whim for where things
+end up. Also they require longer (and probably slower) instruction
+sequences so the local offset can be patched up by the linker.
+
+Lack of control over which segment register is used is a further
+downside.
+
+
+MACROS AND INLINE ASSEMBLY
+--------------------------
+
+/* read the cpu-local slot at byte offset n (n must be a literal) */
+#define __local_get(n) ({ \
+ uint64 res; \
+ asm ("mov %%gs:" #n ",%0" : "=r" (res)); \
+ res; \
+})
+
+/* write the cpu-local slot at byte offset n */
+#define __local_put(n, v) ({ \
+ uint64 val = v; \
+ asm ("mov %0, %%gs:" #n : : "r" (val)); \
+})
+
+(uint64 matches the 64bit build shown below; a 32bit build wants a
+pointer-sized 32bit type here.)
+
+#define __proc() ((struct proc*) __local_get(8)) /* slot 4 on i386 */
+
+ if (__proc()->killed) ...
+
+x86-64 without optimization:
+
+: 65 48 8b 04 25 08 00  mov %gs:0x8,%rax
+: 00 00
+: 48 89 45 d0 mov %rax,-0x30(%rbp)
+: 48 8b 45 d0 mov -0x30(%rbp),%rax
+: 8b 40 50 mov 0x50(%rax),%eax
+: 85 c0 test %eax,%eax
+
+x86-64 with -O1:
+
+: 65 48 8b 04 25 08 00  mov %gs:0x8,%rax
+: 00 00
+: 83 78 50 00 cmpl $0x0,0x50(%rax)
+
+i386 without optimization:
+
+: 65 8b 1d 04 00 00 00 mov %gs:0x4,%ebx
+: 89 5d f4 mov %ebx,-0xc(%ebp)
+: 8b 45 f4 mov -0xc(%ebp),%eax
+: 8b 40 24 mov 0x24(%eax),%eax
+: 85 c0 test %eax,%eax
+
+i386 with -O1:
+
+: 65 a1 04 00 00 00 mov %gs:0x4,%eax
+: 83 78 24 00 cmpl $0x0,0x24(%eax)
+
+These are less efficient than the others when compiling unoptimized
+(though that's an unusual state), but they cost no more than the
+global register variable trick originally used and have the benefit
+of generating correct code for both 32 and 64 bit modes.
+
+They do have the downside that you can't use one construct
+for both setting and getting the contents of a local storage
+variable.
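+
+One way to paper over that asymmetry (a sketch; __set_proc is my
+name, not something from the Xv6 source) is a getter/setter pair
+per variable:
+
+#define __set_proc(p) __local_put(8, (uint64) (p))
+
+ __set_proc(p);
+ if (__proc()->killed) ...
+
+versus the single name that the global register variable trick let
+you use for both reads and writes.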