# HG changeset patch
# User Tassilo Philipp
# Date 1574513495 -3600
# Node ID 74c056b597b73d4df21c93cfd4a6fb8541db2e07
# Parent  4e6f63b7020e1c02c82bbeb2c55200e10b885954
- disassembly example annotations
- callconv appendix in doc:
  * ppc64 chapter
  * some cleanups for consistency

diff -r 4e6f63b7020e -r 74c056b597b7 doc/disas_examples/ppc.darwin.disas
--- a/doc/disas_examples/ppc.darwin.disas	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/disas_examples/ppc.darwin.disas	Sat Nov 23 13:51:35 2019 +0100
@@ -50,7 +50,7 @@
       5c:       91 1e 00 7c     stw 8, 124(30)  ; |
       60:       91 3e 00 80     stw 9, 128(30)  ; |
       64:       91 5e 00 84     stw 10, 132(30) ; |
-      68:       80 01 00 00     lwz 0, 0(1)     ; fetch sp saved on stack of top by prolog -> gpr0, and ...
+      68:       80 01 00 00     lwz 0, 0(1)     ; fetch back-chain ptr (parent frame's sp), stored at top of stack by prolog, -> gpr0, and ...
       6c:       94 01 ff 10     stwu 0, -240(1) ; ... update it further up the stack for alloca(220) - with padding to guarantee alignment
       70:       38 41 00 40     addi 2, 1, 64   ; |
       74:       38 02 00 0f     addi 0, 2, 15   ; | start of alloca()'d memory -> gpr2, by ...
@@ -152,7 +152,7 @@
       60:       91 1e 00 8c     stw 8, 140(30)  ; |
       64:       91 3e 00 90     stw 9, 144(30)  ; |
       68:       91 5e 00 94     stw 10, 148(30) ; |
-      6c:       80 01 00 00     lwz 0, 0(1)     ; fetch sp saved on stack of top by prolog -> gpr0, and ...
+      6c:       80 01 00 00     lwz 0, 0(1)     ; fetch back-chain ptr (parent frame's sp), stored at top of stack by prolog, -> gpr0, and ...
       70:       94 01 ff 10     stwu 0, -240(1) ; ... update it further up the stack for alloca(220) - with padding to guarantee alignment
       74:       38 41 00 50     addi 2, 1, 80   ; |
       78:       38 02 00 0f     addi 0, 2, 15   ; | start of alloca()'d memory -> gpr2, by ...
@@ -335,7 +335,7 @@
      150:       c8 09 00 00     lfd 0, 0(9)     ; |
      154:       fc 00 00 18     frsp 0, 0       ; |
      158:       d0 1e 00 70     stfs 0, 112(30) ; /
-     15c:       80 01 00 00     lwz 0, 0(1)     ; fetch sp saved on stack of top by prolog -> g
+     15c:       80 01 00 00     lwz 0, 0(1)     ; fetch back-chain ptr (parent frame's sp), stored at top of stack by prolog, -> gpr0, and ...
      160:       94 01 ff 10     stwu 0, -240(1) ; ... update it further up the stack for alloca
      164:       38 41 00 50     addi 2, 1, 80   ; |
      168:       38 02 00 0f     addi 0, 2, 15   ; | start of alloca()'d memory -> gpr2, by ...
diff -r 4e6f63b7020e -r 74c056b597b7 doc/disas_examples/ppc64.elfabi.disas
--- a/doc/disas_examples/ppc64.elfabi.disas	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/disas_examples/ppc64.elfabi.disas	Sat Nov 23 13:51:35 2019 +0100
@@ -42,89 +42,232 @@
   48:	80 01 00 01 	lwz     r0,1(r1)
 
 000000000000004c <.nonleaf_call>:
-  4c:	7c 08 02 a6 	mflr    r0
-  50:	fb e1 ff f8 	std     r31,-8(r1)
-  54:	f8 01 00 10 	std     r0,16(r1)
-  58:	f8 21 ff 71 	stdu    r1,-144(r1)
-  5c:	7c 3f 0b 78 	mr      r31,r1
-  60:	7c 60 1b 78 	mr      r0,r3
-  64:	7c 8b 23 78 	mr      r11,r4
-  68:	90 1f 00 c0 	stw     r0,192(r31)
-  6c:	91 7f 00 c8 	stw     r11,200(r31)
-  70:	90 bf 00 d0 	stw     r5,208(r31)
-  74:	90 df 00 d8 	stw     r6,216(r31)
-  78:	90 ff 00 e0 	stw     r7,224(r31)
-  7c:	91 1f 00 e8 	stw     r8,232(r31)
-  80:	91 3f 00 f0 	stw     r9,240(r31)
-  84:	91 5f 00 f8 	stw     r10,248(r31)
-  88:	e8 01 00 00 	ld      r0,0(r1)
-  8c:	f8 01 ff 11 	stdu    r0,-240(r1)
-  90:	39 21 00 70 	addi    r9,r1,112
-  94:	f9 3f 00 70 	std     r9,112(r31)
-  98:	e9 3f 00 70 	ld      r9,112(r31)
-  9c:	38 09 00 0f 	addi    r0,r9,15
-  a0:	78 00 e1 02 	rldicl  r0,r0,60,4
-  a4:	78 00 26 e4 	rldicr  r0,r0,4,59
-  a8:	f8 1f 00 70 	std     r0,112(r31)
-  ac:	e9 3f 00 70 	ld      r9,112(r31)
-  b0:	38 00 00 4c 	li      r0,76
-  b4:	98 09 00 00 	stb     r0,0(r9)
-  b8:	80 1f 00 c8 	lwz     r0,200(r31)
-  bc:	7c 08 07 b4 	extsw   r8,r0
-  c0:	80 1f 00 d0 	lwz     r0,208(r31)
-  c4:	7c 07 07 b4 	extsw   r7,r0
-  c8:	80 1f 00 d8 	lwz     r0,216(r31)
-  cc:	7c 06 07 b4 	extsw   r6,r0
-  d0:	80 1f 00 e0 	lwz     r0,224(r31)
-  d4:	7c 09 07 b4 	extsw   r9,r0
-  d8:	80 1f 00 e8 	lwz     r0,232(r31)
-  dc:	7c 0b 07 b4 	extsw   r11,r0
-  e0:	80 1f 00 f0 	lwz     r0,240(r31)
-  e4:	7c 0a 07 b4 	extsw   r10,r0
-  e8:	80 1f 00 f8 	lwz     r0,248(r31)
-  ec:	7c 00 07 b4 	extsw   r0,r0
-  f0:	7d 03 43 78 	mr      r3,r8
-  f4:	7c e4 3b 78 	mr      r4,r7
-  f8:	7c c5 33 78 	mr      r5,r6
-  fc:	7d 26 4b 78 	mr      r6,r9
- 100:	7d 67 5b 78 	mr      r7,r11
- 104:	7d 48 53 78 	mr      r8,r10
- 108:	7c 09 03 78 	mr      r9,r0
- 10c:	48 00 00 01 	bl      10c <.nonleaf_call+0xc0>
- 110:	e8 21 00 00 	ld      r1,0(r1)
- 114:	e8 01 00 10 	ld      r0,16(r1)
- 118:	7c 08 03 a6 	mtlr    r0
- 11c:	eb e1 ff f8 	ld      r31,-8(r1)
- 120:	4e 80 00 20 	blr
- 124:	00 00 00 00 	.long 0x0
- 128:	00 00 00 01 	.long 0x1
- 12c:	80 01 00 01 	lwz     r0,1(r1)
+  4c:	7c 08 02 a6 	mflr    r0                       ; |
+  50:	fb e1 ff f8 	std     r31,-8(r1)               ; |
+  54:	f8 01 00 10 	std     r0,16(r1)                ; | prolog
+  58:	f8 21 ff 71 	stdu    r1,-144(r1)              ; |
+  5c:	7c 3f 0b 78 	mr      r31,r1                   ; use gpr31 as sort of frame pointer, below
+  60:	7c 60 1b 78 	mr      r0,r3                    ; in arg 0 -> gpr0
+  64:	7c 8b 23 78 	mr      r11,r4                   ; in arg 1 -> gpr11
+  68:	90 1f 00 c0 	stw     r0,192(r31)              ; |
+  6c:	91 7f 00 c8 	stw     r11,200(r31)             ; |
+  70:	90 bf 00 d0 	stw     r5,208(r31)              ; |
+  74:	90 df 00 d8 	stw     r6,216(r31)              ; |
+  78:	90 ff 00 e0 	stw     r7,224(r31)              ; | all in args -> spill area in prev frame (jump over own frame (144) + linkage area of prev frame (48) = 192)
+  7c:	91 1f 00 e8 	stw     r8,232(r31)              ; |
+  80:	91 3f 00 f0 	stw     r9,240(r31)              ; |
+  84:	91 5f 00 f8 	stw     r10,248(r31)             ; |
+  88:	e8 01 00 00 	ld      r0,0(r1)                 ; fetch back-chain ptr (parent frame's sp), stored at top of stack by prolog, -> gpr0, and ...
+  8c:	f8 01 ff 11 	stdu    r0,-240(r1)              ; ... update it further up the stack for alloca(220) - with padding to guarantee alignment
+  90:	39 21 00 70 	addi    r9,r1,112                ; |
+  94:	f9 3f 00 70 	std     r9,112(r31)              ; |
+  98:	e9 3f 00 70 	ld      r9,112(r31)              ; |
+  9c:	38 09 00 0f 	addi    r0,r9,15                 ; | start of alloca()'d memory -> gpr9, by ...
+  a0:	78 00 e1 02 	rldicl  r0,r0,60,4               ; | ... using gpr0 as helper to align to 16b, leaving at least 112b at top of stack
+  a4:	78 00 26 e4 	rldicr  r0,r0,4,59               ; |
+  a8:	f8 1f 00 70 	std     r0,112(r31)              ; |
+  ac:	e9 3f 00 70 	ld      r9,112(r31)              ; |
+  b0:	38 00 00 4c 	li      r0,76                    ; 'L' -> gpr0, and ...
+  b4:	98 09 00 00 	stb     r0,0(r9)                 ; ... store in local area (of alloca()'d space)
+  b8:	80 1f 00 c8 	lwz     r0,200(r31)              ; prep arg 0 (from prev frame's spill area), ...
+  bc:	7c 08 07 b4 	extsw   r8,r0                    ; ... -> gpr8 (w/ sign extension b/c int param in 64bit reg)
+  c0:	80 1f 00 d0 	lwz     r0,208(r31)              ; prep arg 1 (from prev frame's spill area), ...
+  c4:	7c 07 07 b4 	extsw   r7,r0                    ; ... -> gpr7
+  c8:	80 1f 00 d8 	lwz     r0,216(r31)              ; prep arg 2 (from prev frame's spill area), ...
+  cc:	7c 06 07 b4 	extsw   r6,r0                    ; ... -> gpr6
+  d0:	80 1f 00 e0 	lwz     r0,224(r31)              ; prep arg 3 (from prev frame's spill area), ...
+  d4:	7c 09 07 b4 	extsw   r9,r0                    ; ... -> gpr9
+  d8:	80 1f 00 e8 	lwz     r0,232(r31)              ; prep arg 4 (from prev frame's spill area), ...
+  dc:	7c 0b 07 b4 	extsw   r11,r0                   ; ... -> gpr11
+  e0:	80 1f 00 f0 	lwz     r0,240(r31)              ; prep arg 5 (from prev frame's spill area), ...
+  e4:	7c 0a 07 b4 	extsw   r10,r0                   ; ... -> gpr10
+  e8:	80 1f 00 f8 	lwz     r0,248(r31)              ; prep arg 6 (from prev frame's spill area), ...
+  ec:	7c 00 07 b4 	extsw   r0,r0                    ; ... -> gpr0
+  f0:	7d 03 43 78 	mr      r3,r8                    ; arg 0
+  f4:	7c e4 3b 78 	mr      r4,r7                    ; arg 1
+  f8:	7c c5 33 78 	mr      r5,r6                    ; arg 2
+  fc:	7d 26 4b 78 	mr      r6,r9                    ; arg 3
+ 100:	7d 67 5b 78 	mr      r7,r11                   ; arg 4
+ 104:	7d 48 53 78 	mr      r8,r10                   ; arg 5
+ 108:	7c 09 03 78 	mr      r9,r0                    ; arg 6
+ 10c:	48 00 00 01 	bl      10c <.nonleaf_call+0xc0> ; call and put return address -> lr
+ 110:	e8 21 00 00 	ld      r1,0(r1)                 ; |
+ 114:	e8 01 00 10 	ld      r0,16(r1)                ; |
+ 118:	7c 08 03 a6 	mtlr    r0                       ; | epilog
+ 11c:	eb e1 ff f8 	ld      r31,-8(r1)               ; |
+ 120:	4e 80 00 20 	blr                              ; |
+ 124:	00 00 00 00 	.long 0x0                        ; data
+ 128:	00 00 00 01 	.long 0x1                        ; data
+ 12c:	80 01 00 01 	lwz     r0,1(r1)                 ; unsure@@@. data?
 
 0000000000000130 <.main>:
- 130:	7c 08 02 a6 	mflr    r0
- 134:	fb e1 ff f8 	std     r31,-8(r1)
- 138:	f8 01 00 10 	std     r0,16(r1)
- 13c:	f8 21 ff 81 	stdu    r1,-128(r1)
- 140:	7c 3f 0b 78 	mr      r31,r1
- 144:	38 60 00 00 	li      r3,0
- 148:	38 80 00 01 	li      r4,1
- 14c:	38 a0 00 02 	li      r5,2
- 150:	38 c0 00 03 	li      r6,3
- 154:	38 e0 00 04 	li      r7,4
- 158:	39 00 00 05 	li      r8,5
- 15c:	39 20 00 06 	li      r9,6
- 160:	39 40 00 07 	li      r10,7
- 164:	48 00 00 01 	bl      164 <.main+0x34>
- 168:	38 00 00 00 	li      r0,0
- 16c:	7c 03 03 78 	mr      r3,r0
- 170:	e8 21 00 00 	ld      r1,0(r1)
- 174:	e8 01 00 10 	ld      r0,16(r1)
- 178:	7c 08 03 a6 	mtlr    r0
- 17c:	eb e1 ff f8 	ld      r31,-8(r1)
- 180:	4e 80 00 20 	blr
- 184:	00 00 00 00 	.long 0x0
- 188:	00 00 00 01 	.long 0x1
- 18c:	80 01 00 01 	lwz     r0,1(r1)
+ 130:	7c 08 02 a6 	mflr    r0                       ; |             lr -> gpr0
+ 134:	fb e1 ff f8 	std     r31,-8(r1)               ; |             preserve gpr31 (as used in func as helper addr)
+ 138:	f8 01 00 10 	std     r0,16(r1)                ; | prolog      store lr
+ 13c:	f8 21 ff 81 	stdu    r1,-128(r1)              ; |             open frame
+ 140:	7c 3f 0b 78 	mr      r31,r1                   ; use gpr31 as sort of frame pointer, below
+ 144:	38 60 00 00 	li      r3,0                     ; arg 0
+ 148:	38 80 00 01 	li      r4,1                     ; arg 1
+ 14c:	38 a0 00 02 	li      r5,2                     ; arg 2
+ 150:	38 c0 00 03 	li      r6,3                     ; arg 3
+ 154:	38 e0 00 04 	li      r7,4                     ; arg 4
+ 158:	39 00 00 05 	li      r8,5                     ; arg 5
+ 15c:	39 20 00 06 	li      r9,6                     ; arg 6
+ 160:	39 40 00 07 	li      r10,7                    ; arg 7
+ 164:	48 00 00 01 	bl      164 <.main+0x34>         ; call and put return address -> lr
+ 168:	38 00 00 00 	li      r0,0                     ; return value ...
+ 16c:	7c 03 03 78 	mr      r3,r0                    ; ... in gpr3
+ 170:	e8 21 00 00 	ld      r1,0(r1)                 ; |
+ 174:	e8 01 00 10 	ld      r0,16(r1)                ; |
+ 178:	7c 08 03 a6 	mtlr    r0                       ; | epilog
+ 17c:	eb e1 ff f8 	ld      r31,-8(r1)               ; |
+ 180:	4e 80 00 20 	blr                              ; |
+ 184:	00 00 00 00 	.long 0x0                        ; data
+ 188:	00 00 00 01 	.long 0x1                        ; data
+ 18c:	80 01 00 01 	lwz     r0,1(r1)                 ; unsure@@@. data?
+
+
+
+; ------------- ints and floats, var args, struct return value (meaning implicit first param), more than 8 params (11, with implicit return value ptr) ----------->
+
+; #include <stdlib.h>
+; #include <stdarg.h>
+; 
+; void leaf_call(int b, float c, int d, float e, int f, float g, float h, int i, float j)
+; {
+; }
+; 
+; struct aggr { int x; int y; int z; };
+; 
+; struct aggr nonleaf_call(int a, int b, float c, int d, float e, int f, ...)
+; {
+;     va_list v;
+;     int i;
+;     float g, h, j;
+;     struct aggr st = { b, d, f };
+;     va_start(v, f);
+;     g = va_arg(v, float);
+;     h = va_arg(v, float);
+;     i = va_arg(v, int);
+;     j = va_arg(v, float);
+;     /* use some local data */
+;     *(char*)alloca(220) = 'L';
+;     leaf_call(b, c, d, e, f, g, h, i, j);
+; 
+;     return st;
+; }
+; 
+; int main()
+; {
+;     struct aggr st = nonleaf_call(0, 1, 2.f, 3, 4.f, 5, 6.f, 7.f, 8, 9.f);
+;     return 0;
+; }
+
+
+
+; output from freebsd-11.0-ppc64 w/ gcc 4.2.1
+
+0000000000000000 <.leaf_call>:
+   0:	fb e1 ff f8 	std     r31,-8(r1)
+   4:	f8 21 ff c1 	stdu    r1,-64(r1)
+   8:	7c 3f 0b 78 	mr      r31,r1
+   c:	7c 69 1b 78 	mr      r9,r3
+  10:	d0 3f 00 78 	stfs    f1,120(r31)
+  14:	7c ab 2b 78 	mr      r11,r5
+  18:	d0 5f 00 88 	stfs    f2,136(r31)
+  1c:	7c e8 3b 78 	mr      r8,r7
+  20:	d0 7f 00 98 	stfs    f3,152(r31)
+  24:	d0 9f 00 a0 	stfs    f4,160(r31)
+  28:	7d 40 53 78 	mr      r0,r10
+  2c:	d0 bf 00 b0 	stfs    f5,176(r31)
+  30:	91 3f 00 70 	stw     r9,112(r31)
+  34:	91 7f 00 80 	stw     r11,128(r31)
+  38:	91 1f 00 90 	stw     r8,144(r31)
+  3c:	90 1f 00 a8 	stw     r0,168(r31)
+  40:	e8 21 00 00 	ld      r1,0(r1)
+  44:	eb e1 ff f8 	ld      r31,-8(r1)
+  48:	4e 80 00 20 	blr
+	...
+  54:	80 01 00 01 	lwz     r0,1(r1)
+
+0000000000000058 <.nonleaf_call>:
+  58:	fb e1 ff f8 	std     r31,-8(r1)       ; |
+  5c:	f8 21 ff 91 	stdu    r1,-112(r1)      ; | prolog
+  60:	7c 3f 0b 78 	mr      r31,r1           ; use gpr31 as sort of frame pointer, below
+  64:	7c 8b 23 78 	mr      r11,r4           ; in arg 1 (first explicit arg, b/c of struct return value ptr being arg0) -> r11
+  68:	7c a8 2b 78 	mr      r8,r5            ; in arg 2 -> r8 (free reg, was skipped for float param)
+  6c:	d0 3f 00 b8 	stfs    f1,184(r31)      ; |                  in arg 3 (float) -> prev frame's spill area: 184 = 112 (frame) + 48 (prev frame's linkage area) + 8 (arg 0 = return value ptr) + 16 (first two explicit args)
+  70:	d0 5f 00 c8 	stfs    f2,200(r31)      ; |                  in arg 5 (float) -> prev frame's spill area
+  74:	f9 5f 00 d8 	std     r10,216(r31)     ; |                  in arg 7 (float, also held in gpr reg b/c vararg) -> prev frame's spill area
+  78:	7d 20 4b 78 	mr      r0,r9            ; | spilling         in arg 6 in gpr0 (spilled below)
+  7c:	91 7f 00 a8 	stw     r11,168(r31)     ; |                  in arg 1 (int) -> prev frame's spill area
+  80:	91 1f 00 b0 	stw     r8,176(r31)      ; |                  in arg 2 (int) -> prev frame's spill area
+  84:	90 ff 00 c0 	stw     r7,192(r31)      ; |                  in arg 4 (int) -> prev frame's spill area
+  88:	90 1f 00 d0 	stw     r0,208(r31)      ; /                  in arg 6 (int) -> prev frame's spill area
+  8c:	80 1f 00 b0 	lwz     r0,176(r31)      ; \
+  90:	90 1f 00 48 	stw     r0,72(r31)       ; |
+  94:	80 1f 00 c0 	lwz     r0,192(r31)      ; |
+  98:	90 1f 00 4c 	stw     r0,76(r31)       ; | filling struct with 3 int input args
+  9c:	80 1f 00 d0 	lwz     r0,208(r31)      ; |
+  a0:	90 1f 00 50 	stw     r0,80(r31)       ; |
+  a4:	38 1f 00 d8 	addi    r0,r31,216       ;
+  a8:	f8 1f 00 40 	std     r0,64(r31)       ;     .
+  ac:	7f e0 00 08 	trap                     ;     .
+	...                                          ;     .
+  b8:	80 01 00 01 	lwz     r0,1(r1)         ;
+
+00000000000000bc <.main>:
+  bc:	7c 08 02 a6 	mflr    r0               ; |             lr -> gpr0
+  c0:	fb e1 ff f8 	std     r31,-8(r1)       ; |             preserve gpr31 (as used in func as helper addr)
+  c4:	f8 01 00 10 	std     r0,16(r1)        ; | prolog      store lr
+  c8:	f8 21 ff 41 	stdu    r1,-192(r1)      ; |             open frame
+  cc:	7c 3f 0b 78 	mr      r31,r1           ; use gpr31 as sort of frame pointer, below
+  d0:	39 61 00 30 	addi    r11,r1,48        ; ptr to param area -> r11
+  d4:	e9 22 00 00 	ld      r9,0(r2)         ; prep arg 3 (=explicit arg 2, b/c of implicit return value pointer), ... 
+  d8:	c1 a9 00 00 	lfs     f13,0(r9)        ; ... load from static data -> f13
+  dc:	e9 22 00 08 	ld      r9,8(r2)         ; prep arg 5, ...
+  e0:	c1 89 00 00 	lfs     f12,0(r9)        ; ... load from static data -> f12
+  e4:	e9 22 00 10 	ld      r9,16(r2)        ; prep arg 7, ...
+  e8:	c8 09 00 00 	lfd     f0,0(r9)         ; ... load from static data -> f0
+  ec:	d8 1f 00 a0 	stfd    f0,160(r31)      ; |
+  f0:	e8 1f 00 a0 	ld      r0,160(r31)      ; |
+  f4:	7c 09 03 78 	mr      r9,r0            ; | also hold it in f11 (temporarily, before copying to fpr3 below)
+  f8:	7d 2a 4b 78 	mr      r10,r9           ; | and gpr10 (instead of skipping that int reg, for straightforward spilling)
+  fc:	f8 1f 00 a0 	std     r0,160(r31)      ; | (uses temp space to copy between fpr and gpr regs)
+ 100:	c8 1f 00 a0 	lfd     f0,160(r31)      ; |
+ 104:	fd 60 00 90 	fmr     f11,f0           ; |
+ 108:	e9 22 00 18 	ld      r9,24(r2)        ; prep arg 8, ...
+ 10c:	c8 09 00 00 	lfd     f0,0(r9)         ; ... load from static data -> fpr0, and ...
+ 110:	d8 0b 00 40 	stfd    f0,64(r11)       ; ... "pushed" onto stack (in param area past spill area) and ...
+ 114:	c9 4b 00 40 	lfd     f10,64(r11)      ; ... also held in f10 (prep, see where it's used below)
+ 118:	38 00 00 08 	li      r0,8             ; arg 9, ...
+ 11c:	f8 0b 00 48 	std     r0,72(r11)       ; ... "pushed" onto stack
+ 120:	e9 22 00 20 	ld      r9,32(r2)        ; arg 10 (float, promoted to double), ...
+ 124:	c8 09 00 00 	lfd     f0,0(r9)         ; ... load from static data -> fpr0, and ...
+ 128:	d8 0b 00 50 	stfd    f0,80(r11)       ; ... "pushed" onto stack
+ 12c:	c8 0b 00 50 	lfd     f0,80(r11)       ; ... also held in f0 (prep, see where it's used below), in theory pointless reload of arg10 -> fpr0
+ 130:	38 1f 00 90 	addi    r0,r31,144       ; ptr to return value struct in local space -> gpr0
+ 134:	7c 03 03 78 	mr      r3,r0            ; arg 0 (this is the pointer to the struct return value)
+ 138:	38 80 00 00 	li      r4,0             ; arg 1
+ 13c:	38 a0 00 01 	li      r5,1             ; arg 2
+ 140:	fc 20 68 90 	fmr     f1,f13           ; arg 3 (float, in 1st double reg)
+ 144:	38 e0 00 03 	li      r7,3             ; arg 4 (skipping gpr6 b/c of float arg)
+ 148:	fc 40 60 90 	fmr     f2,f12           ; arg 5 (float, in 2nd double reg)
+ 14c:	39 20 00 05 	li      r9,5             ; arg 6 (skipping gpr8 b/c of float arg, vararg)
+ 150:	fc 60 58 90 	fmr     f3,f11           ; arg 7 (float, in 3rd double reg, promoted to double anyways b/c vararg)
+ 154:	fc 80 50 90 	fmr     f4,f10           ; arg 8 (float, in 4th double reg, promoted to double anyways b/c vararg)
+ 158:	fc a0 00 90 	fmr     f5,f0            ; arg 10 (float, in 5th double reg, promoted to double anyways b/c vararg)
+ 15c:	48 00 00 01 	bl      15c <.main+0xa0> ; call and put return address -> lr
+ 160:	38 00 00 00 	li      r0,0             ; return value ...
+ 164:	7c 03 03 78 	mr      r3,r0            ; ... in gpr3
+ 168:	e8 21 00 00 	ld      r1,0(r1)         ; |
+ 16c:	e8 01 00 10 	ld      r0,16(r1)        ; |
+ 170:	7c 08 03 a6 	mtlr    r0               ; | epilog
+ 174:	eb e1 ff f8 	ld      r31,-8(r1)       ; |
+ 178:	4e 80 00 20 	blr                      ; |
+ 17c:	00 00 00 00 	.long 0x0                ; data
+ 180:	00 00 00 01 	.long 0x1                ; data
+ 184:	80 01 00 01 	lwz     r0,1(r1)         ; unsure@@@. data?
 
 ; vim: ft=asm
 
diff -r 4e6f63b7020e -r 74c056b597b7 doc/disas_examples/sparc64.sparc64.disas
--- a/doc/disas_examples/sparc64.sparc64.disas	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/disas_examples/sparc64.sparc64.disas	Sat Nov 23 13:51:35 2019 +0100
@@ -499,7 +499,7 @@
   90:   c4 27 a7 fb     st  %g2, [ %fp + 0x7fb ]        ; ... copied to local space (0x7fb - bias = -4) helper var (probably int g)
   94:   82 00 60 08     add  %g1, 8, %g1                ; point read ptr in g1 to second unnamed param (float, promoted to double), ...
   98:   c2 77 a7 e7     stx  %g1, [ %fp + 0x7e7 ]       ; ... store in local space (0x7fb - bias = -24)
-  9c:   91 d0 20 05     ta  5                           ; trap - not sure what else is involved (objdump was made from .o, not finally linked exec)
+  9c:   91 d0 20 05     ta  5                           ; trap - not sure what else is involved (objdump was made from .o, not finally linked exec) - maybe just b/c objdump skipped this for the output?
 
 00000000000000a0 <main>:
   a0:   9d e3 bf 30     save  %sp, -208, %sp            ; prolog
diff -r 4e6f63b7020e -r 74c056b597b7 doc/manual/callconvs/callconv_ppc32.tex
--- a/doc/manual/callconvs/callconv_ppc32.tex	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/manual/callconvs/callconv_ppc32.tex	Sat Nov 23 13:51:35 2019 +0100
@@ -28,9 +28,10 @@
 \item Word size is 32 bits
 \item Big endian (MSB) and litte endian (LSB) operating modes.
 \item Processor operates on floats in double precision floating point arithmetc (IEEE-754) values directly (single precision is converted on the fly)
-\item Apple Mac OS X/Darwin PPC is specified in "Mac OS X ABI Function Call Guide"\cite{ppcMacOSX}. It uses Big Endian (MSB).
-\item Linux PPC 32-bit ABI is specified in "LSB for PPC"\cite{ppc32LSB} which is based on "System V ABI". It uses Big Endian (MSB).
-\item PowerPC EABI is defined in the "PowerPC Embedded Application Binary Interface 32-Bit Implementation".
+\item Apple macos/Mac OS X/Darwin PPC is specified in "Mac OS X ABI Function Call Guide"\cite{ppcMacOSX}. It uses Big Endian (MSB)
+\item Linux PPC 32-bit ABI is specified in "LSB for PPC"\cite{ppc32LSB} which is based on "System V ABI". It uses Big Endian (MSB)
+\item PowerPC EABI is defined in the "PowerPC Embedded Application Binary Interface 32-Bit Implementation"\cite{ppceabi}
+\item There is also the "PowerOpen ABI"\cite{poabi}; a nearly identical version of it is used in AIX % more info: http://www.ingallegri.com/public/ppc.html
 \end{itemize}
 
 \paragraph{\product{dyncall} support}
@@ -76,19 +77,19 @@
 \item stack parameter order: right-to-left
 \item caller cleans up the stack
 \item the first 8 integer parameters are passed in registers gpr3-gpr10
-\item the first 12 floating point parameters are passed in registers fpr1-fpr13
+\item the first 13 floating point parameters are passed in registers fpr1-fpr13
 \item 64 bit arguments are passed as if they were two 32 bit arguments, without skipping registers for alignment (this means passing half via a register and half via the stack is allowed)
 \item if a float parameter is passed via a register, gpr registers are skipped for subsequent integer parameters (based on the size of
 the float - 1 register for single precision and 2 for double precision floating point values)
 \item the caller pushes subsequent parameters onto the stack
 \item for every parameter passed via a register, space is reserved in the stack parameter area (in order to spill the parameters if
 needed - e.g. varargs)
-\item ellipsis calls take floating point values in int and float registers (single precision floats are promoted to double precision
-as defined for ellipsis calls)
+\item ellipsis calls take floating point values in int and float registers (single precision floats are promoted to double precision as
+required by ellipsis calls)
 \item all nonvector parameters are aligned on 4-byte boundaries
 \item vector parameters are aligned on 16-byte boundaries
 \item composite parameters with size of 1 or 2 bytes occupy low-order bytes of their 4-byte area. INCONSISTENT with other 32-bit PPC
-binary interfaces. In AIX and OS 9, padding bytes always follow the data structure
+binary interfaces. In AIX and Mac OS 9, padding bytes always follow the data structure
 \item composite parameters 3 bytes or larger in size occupy high-order bytes
 \item integer parameters \textless\ 32 bit are right-justified (meaning occupy higher-address bytes) in their 4-byte slot on the stack, requiring extra-care for big-endian targets
 \end{itemize}
diff -r 4e6f63b7020e -r 74c056b597b7 doc/manual/callconvs/callconv_ppc64.tex
--- a/doc/manual/callconvs/callconv_ppc64.tex	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/manual/callconvs/callconv_ppc64.tex	Sat Nov 23 13:51:35 2019 +0100
@@ -25,7 +25,7 @@
 \paragraph{Overview}
 
 \begin{itemize}
-\item Word size is 64 bits
+\item Word size is 32 bits for historical reasons
 \item Big endian (MSB) and litte endian (LSB) operating modes.
 \item Apple Mac OS X/Darwin PPC is specified in "Mac OS X ABI Function Call Guide"\cite{ppcMacOSX}. It uses Big Endian (MSB).
 \item Linux PPC 64-bit ABI is specified in "64-bit PowerPC ELF Application Binary Interface Supplement"\cite{ppcelf64abi} which is based on "System V ABI".
@@ -40,23 +40,115 @@
 
 \paragraph{Registers and register usage}
 
-@@@
+\begin{table}[h]
+\begin{tabular*}{0.95\textwidth}{3 B}
+Name                & Brief description\\
+\hline
+{\bf gpr0}          & scratch\\
+{\bf gpr1}          & stack pointer\\
+{\bf gpr2}          & TOC base ptr (offset table and data for position independent code), scratch\\
+{\bf gpr3}          & return value, parameter 0 for integer or pointer, scratch\\
+{\bf gpr4-gpr10}    & parameter 1-7 for integer or pointer parameters, scratch\\
 
+{\bf gpr11}         & env pointer if needed, scratch\\
+{\bf gpr12}         & used for exception handling and glink code, scratch\\
+{\bf gpr13}         & used for system thread ID, preserve\\
+{\bf gpr14-31}      & preserve\\
+{\bf fpr0}          & scratch\\
+{\bf fpr1-fpr4}     & floating point return value, floating point parameter 0-3 (always double precision)\\
+{\bf fpr5-fpr13}    & floating point parameters 4-12 (always double precision)\\
+{\bf fpr14-fpr31}   & preserve\\
+{\bf v0-v1}         & scratch\\
+{\bf v2-v13}        & vector parameters\\
+{\bf v14-v19}       & scratch\\
+{\bf v20-v31}       & preserve\\
+{\bf lr}            & link-register, scratch\\
+{\bf ctr}           & count-register, scratch\\
+{\bf xer}           & fixed point exception register, scratch\\
+{\bf fpscr}         & floating point status and control register, scratch\\
+{\bf cr0-cr7}       & condition register fields, each 4-bit wide (cr0-cr1 and cr5-cr7 are scratch)\\
+\end{tabular*}
+\caption{Register usage on PowerPC 64-Bit ELF ABI}
+\end{table}
 
 \paragraph{Parameter passing}
 
-@@@
 \begin{itemize}
+\item stack grows down
+\item stack parameter order: right-to-left
+\item caller cleans up the stack
+\item stack is always 16 byte aligned
+\item the stack pointer must be atomically updated (to avoid any timing window in which an interrupt can occur with a partially updated stack), usually with the stdu (store doubleword with update) instruction
+\item the first 8 integer parameters are passed in registers gpr3-gpr10
+\item the first 13 floating point parameters are passed in registers fpr1-fpr13
+\item preserved registers are saved using a defined order (from high to low addresses):
+ fpr* (64bit aligned),
+ gpr*,
+ VRSAVE save word (32 bits),
+ padding for alignment (4 or 12 bytes),
+ v* (128bit aligned)
+\item if a floating point parameter is passed via a register, a gpr register is skipped for subsequent integer parameters
+\item the caller pushes subsequent parameters onto the stack
+\item single precision floating point values use the second word in a doubleword 
+\item a quad precision floating point argument is passed as two consecutive double precision ones
+\item integer types \textless\ 64 bit are sign or zero extended and use a doubleword
+\item ellipsis calls take floating point values in int and float registers (single precision floats are promoted to double precision as
+required by ellipsis calls)
+\item space for all potential gpr* register passed arguments is reserved in the stack parameter area (in order to spill the parameters if
+needed - e.g. varargs), meaning a minimum of 64 bytes to hold gpr3-gpr10
+\item all nonvector parameters are aligned on 8-byte boundaries
+\item vector parameters are aligned on 16-byte boundaries
 \item integer parameters \textless\ 64 bit are right-justified (meaning occupy higher-address bytes) in their 8-byte slot on the stack, requiring extra-care for big-endian targets
 \end{itemize}
 
 
 \paragraph{Return values}
 
-@@@
+\begin{itemize}
+\item return values of integer \textless=\ 32bit or pointer type use gpr3 and are zero or sign extended depending on their type
+\item 64 bit integers use gpr3
+\item floating point values are returned via fpr1
+\item character arrays \textless=\ 8 bytes use gpr3, and are right justified
+\item for all structs/unions (regardless of size) or character arrays \textgreater\ 8 bytes, a hidden first parameter holding the address of caller-allocated space is passed to the function (meaning in gpr3); the callee writes the return value to that space
+\end{itemize}
 
 
 \paragraph{Stack layout}
 
-@@@
+Stack frame is always 16-byte aligned.
+% verified/amended: TP nov 2019 (see also doc/disas_examples/ppc64.elfabi.disas)
+Stack directly after function prolog:\\
 
+\begin{figure}[h]
+\begin{tabular}{5|3|1 1}
+                                  & \vdots                        &                                      &                               \\
+\hhline{~=~~}
+register save area                & \hspace{4cm}                  &                                      & \mrrbrace{14}{caller's frame} \\
+\hhline{~-~~}
+local data                        &                               &                                      &                               \\
+\hhline{~-~~}
+\mrlbrace{6}{parameter area}      & last arg                      & \mrrbrace{3}{stack parameters}       &                               \\
+                                  & \ldots                        &                                      &                               \\
+                                  & arg 8                         &                                      &                               \\
+                                  & gpr10                         & \mrrbrace{3}{spill area (as needed)} &                               \\
+                                  & \ldots                        &                                      &                               \\
+                                  & gpr3                          &                                      &                               \\
+\hhline{~-~~}
+\mrlbrace{6}{linkage area}        & TOC ptr reg                   &                                      &                               \\
+                                  & reserved                      &                                      &                               \\
+                                  & reserved                      &                                      &                               \\
+                                  & return address (callee saved) &                                      &                               \\
+                                  & condition reg (callee saved)  &                                      &                               \\
+                                  & parent stack frame pointer    &                                      &                               \\
+\hhline{~=~~}
+register save area                &                               &                                      & \mrrbrace{4}{current frame}   \\
+\hhline{~-~~}
+local data                        &                               &                                      &                               \\
+\hhline{~-~~}
+parameter area                    &                               &                                      &                               \\
+\hhline{~-~~}
+linkage area                      & \vdots                        &                                      &                               \\
+\end{tabular}
+\caption{Stack layout on ppc64 ELF ABI}
+\end{figure}
+
diff -r 4e6f63b7020e -r 74c056b597b7 doc/manual/callconvs/callconv_x64.tex
--- a/doc/manual/callconvs/callconv_x64.tex	Fri Nov 22 23:28:17 2019 +0100
+++ b/doc/manual/callconvs/callconv_x64.tex	Sat Nov 23 13:51:35 2019 +0100
@@ -97,8 +97,8 @@
 \item caller cleans up the stack, not the callee (like cdecl)
 \item stack is always 16byte aligned - since return address is 64 bits in size, stacks with an odd number of parameters are
 already aligned
-\item ellipsis calls take floating point values in int and float registers (single precision floats are promoted to double precision
-as defined for ellipsis calls)
+\item ellipsis calls take floating point values in int and float registers (single precision floats are promoted to double precision as
+required by ellipsis calls)
 \item if size of parameters \textgreater\ 1 page of memory (usually between 4k and 64k), chkstk must be called
 \end{itemize}