changeset 481:0fc22b5feac7

- arm related doc addition about aggregates
author Tassilo Philipp
date Wed, 02 Mar 2022 17:30:51 +0100
parents cc78e34958e5
children 0f3b6898078d
files doc/disas_examples/arm.armhf.disas doc/disas_examples/arm.atpcs_arm.disas doc/disas_examples/arm64.aapcs.disas doc/manual/callconvs/callconv_arm32.tex doc/manual/callconvs/callconv_arm64.tex
diffstat 5 files changed, 1113 insertions(+), 18 deletions(-) [+]
line wrap: on
line diff
--- a/doc/disas_examples/arm.armhf.disas	Tue Mar 01 21:02:10 2022 +0100
+++ b/doc/disas_examples/arm.armhf.disas	Wed Mar 02 17:30:51 2022 +0100
@@ -87,5 +87,531 @@
   dc:   e24bd004        sub     sp, fp, #4        ; |
   e0:   e8bd8800        pop     {fp, pc}          ; | epilog
 
+
+
+; ---------- passing structs with only fp parts ---------->
+;
+; struct A { float a; };
+; struct B { float a, b; };
+; struct C { float a, b, c; };
+; struct D { double a; };
+; struct E { double a, b; };
+; struct F { double a, b, c; };
+;
+; void leaf_call(struct A a, struct B b, struct C c, struct D d, struct E e, struct F f)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct A){1.f}, (struct B){2.f,3.f}, (struct C){4.f,5.f,6.f}, (struct D){1.}, (struct E){2.,3.}, (struct F){4.,5.,6.});
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd034        sub     sp, sp, #52
+   c:   ed0b0a02        vstr    s0, [fp, #-8]
+  10:   ed0b3b09        vstr    d3, [fp, #-36]
+  14:   eeb06b44        vmov.f64        d6, d4
+  18:   eeb07b45        vmov.f64        d7, d5
+  1c:   ed4b0a04        vstr    s1, [fp, #-16]
+  20:   ed0b1a03        vstr    s2, [fp, #-12]
+  24:   ed4b1a07        vstr    s3, [fp, #-28]
+  28:   ed0b2a06        vstr    s4, [fp, #-24]
+  2c:   ed4b2a05        vstr    s5, [fp, #-20]
+  30:   ed0b6b0d        vstr    d6, [fp, #-52]
+  34:   ed0b7b0b        vstr    d7, [fp, #-44]
+  38:   e1a00000        nop
+  3c:   e28bd000        add     sp, fp, #0
+  40:   e49db004        pop     {fp}
+  44:   e12fff1e        bx      lr
+
+00000048 <main>:
+  48:   e92d4800        push    {fp, lr}              ;
+  4c:   e28db004        add     fp, sp, #4            ;
+  50:   e24dd058        sub     sp, sp, #88           ;
+  54:   ed9f0a29        vldr    s0, [pc, #164]        ; arg 0 (struct A), fetch from data below: (pc:=0x54+0x8)+164=0x100
+  58:   e59f20a4        ldr     r2, [pc, #164]        ;
+  5c:   e24b300c        sub     r3, fp, #12           ;
+  60:   e8920003        ldm     r2, {r0, r1}          ;
+  64:   e8830003        stm     r3, {r0, r1}          ;
+  68:   e59f2098        ldr     r2, [pc, #152]        ;
+  6c:   e24b3018        sub     r3, fp, #24           ;
+  70:   e8920007        ldm     r2, {r0, r1, r2}      ;
+  74:   e8830007        stm     r3, {r0, r1, r2}      ;
+  78:   ed9f3b1e        vldr    d3, [pc, #120]        ; arg 3 (struct D), via fregs, fetch from data below: (pc:=0x78+0x8)+120=0xf8
+  7c:   e59f3088        ldr     r3, [pc, #136]        ;
+  80:   e24bc02c        sub     ip, fp, #44           ;
+  84:   e893000f        ldm     r3, {r0, r1, r2, r3}  ;
+  88:   e88c000f        stm     ip, {r0, r1, r2, r3}  ;
+  8c:   e59f307c        ldr     r3, [pc, #124]        ;
+  90:   e24bc044        sub     ip, fp, #68           ;
+  94:   e1a0e003        mov     lr, r3                ;
+  98:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  9c:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  a0:   e89e0003        ldm     lr, {r0, r1}          ;
+  a4:   e88c0003        stm     ip, {r0, r1}          ;
+  a8:   ed1b6b0b        vldr    d6, [fp, #-44]        ; \ prep arg 4 (struct E)
+  ac:   ed1b7b09        vldr    d7, [fp, #-36]        ; /                                        b
+  b0:   ed5b1a06        vldr    s3, [fp, #-24]        ; \                                a
+  b4:   ed1b2a05        vldr    s4, [fp, #-20]        ; | arg 2 (struct C), via fregs    b
+  b8:   ed5b2a04        vldr    s5, [fp, #-16]        ; /                                c
+  bc:   ed5b0a03        vldr    s1, [fp, #-12]        ; \                                a
+  c0:   ed1b1a02        vldr    s2, [fp, #-8]         ; / arg 1 (struct B), via fregs    b
+  c4:   e1a0e00d        mov     lr, sp                ; \    write ptr (to stack top)
+  c8:   e24bc044        sub     ip, fp, #68           ; |    read ptr
+  cc:   e8bc000f        ldm     ip!, {r0, r1, r2, r3} ; | arg 5 (struct F), entirely via stack (not split)
+  d0:   e8ae000f        stmia   lr!, {r0, r1, r2, r3} ; |
+  d4:   e89c0003        ldm     ip, {r0, r1}          ; |
+  d8:   e88e0003        stm     lr, {r0, r1}          ; /
+  dc:   eeb04b46        vmov.f64        d4, d6        ; \ arg 4 (struct E), via fregs    a
+  e0:   eeb05b47        vmov.f64        d5, d7        ; /                                b
+  e4:   ebfffffe        bl      0 <leaf_call>         ; return address -> r14/lr, and call
+  e8:   e3a03000        mov     r3, #0                ; return value (0) via r3 ... (a bit suboptimal)
+  ec:   e1a00003        mov     r0, r3                ; ... to r0
+  f0:   e24bd004        sub     sp, fp, #4            ; | epilog
+  f4:   e8bd8800        pop     {fp, pc}              ; /
+  f8:   00000000        .word   0x00000000            ; \           |
+  fc:   3ff00000        .word   0x3ff00000            ; |           | 1.0
+ 100:   3f800000        .word   0x3f800000            ; |           1.f
+ 104:   00000000        .word   0x00000000            ; | data
+ 108:   00000008        .word   0x00000008            ; |
+ 10c:   00000018        .word   0x00000018            ; |
+ 110:   00000028        .word   0x00000028            ; |
+
+
+
+; ---------- passing structs with mixed fp/int parts ---------->
+;
+; struct A { float a; char b; };
+; struct B { int a; double b; };
+; struct C { double a, b; int c; };
+; struct D { double a, b, c; long long d; };
+;
+; void leaf_call(struct A a, struct B b, struct C c, struct D d)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct A){1.f,2}, (struct B){2,3.}, (struct C){4.,5.,6}, (struct D){7.,8.,9.,10});
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e24dd008        sub     sp, sp, #8            ;
+   4:   e52db004        push    {fp}                  ;
+   8:   e28db000        add     fp, sp, #0            ;
+   c:   e24dd00c        sub     sp, sp, #12           ;
+  10:   e24bc00c        sub     ip, fp, #12           ;
+  14:   e88c0003        stm     ip, {r0, r1}          ;
+  18:   e28b1004        add     r1, fp, #4            ;
+  1c:   e881000c        stm     r1, {r2, r3}          ;
+  20:   e1a00000        nop                           ;
+  24:   e28bd000        add     sp, fp, #0            ;
+  28:   e49db004        pop     {fp}                  ;
+  2c:   e28dd008        add     sp, sp, #8            ;
+  30:   e12fff1e        bx      lr                    ;
+
+00000034 <main>:
+  34:   e92d4800        push    {fp, lr}              ; |
+  38:   e28db004        add     fp, sp, #4            ; | prolog
+  3c:   e24dd090        sub     sp, sp, #144          ; /
+  40:   e59f20b4        ldr     r2, [pc, #180]        ; \        read ptr to data after func
+  44:   e24b300c        sub     r3, fp, #12           ; |        write ptr to local area
+  48:   e8920003        ldm     r2, {r0, r1}          ; | struct A -> local area
+  4c:   e8830003        stm     r3, {r0, r1}          ; /
+  50:   e59f30a8        ldr     r3, [pc, #168]        ; \      read ptr to data after func
+  54:   e24bc01c        sub     ip, fp, #28           ; |      write ptr
+  58:   e893000f        ldm     r3, {r0, r1, r2, r3}  ; | struct B -> local area
+  5c:   e88c000f        stm     ip, {r0, r1, r2, r3}  ; /
+  60:   e59f309c        ldr     r3, [pc, #156]        ; \      read ptr to data after func
+  64:   e24bc034        sub     ip, fp, #52           ; |      write ptr
+  68:   e1a0e003        mov     lr, r3                ; |
+  6c:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ; | struct C -> local area
+  70:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ; |
+  74:   e89e0003        ldm     lr, {r0, r1}          ; |
+  78:   e88c0003        stm     ip, {r0, r1}          ; /
+  7c:   e59f3084        ldr     r3, [pc, #132]        ; \      read ptr to data after func
+  80:   e24bc054        sub     ip, fp, #84           ; |      write ptr
+  84:   e1a0e003        mov     lr, r3                ; |
+  88:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ; |
+  8c:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ; |
+  90:   e89e000f        ldm     lr, {r0, r1, r2, r3}  ; |
+  94:   e88c000f        stm     ip, {r0, r1, r2, r3}  ; / struct D -> local area
+  98:   e28de020        add     lr, sp, #32           ; \      write ptr (into stack arg area)
+  9c:   e24bc054        sub     ip, fp, #84           ; |      read ptr (local copy of struct D)
+  a0:   e8bc000f        ldm     ip!, {r0, r1, r2, r3} ; |
+  a4:   e8ae000f        stmia   lr!, {r0, r1, r2, r3} ; | arg 3 (struct D), via stack
+  a8:   e89c000f        ldm     ip, {r0, r1, r2, r3}  ; |
+  ac:   e88e000f        stm     lr, {r0, r1, r2, r3}  ; /
+  b0:   e28de008        add     lr, sp, #8            ; \
+  b4:   e24bc034        sub     ip, fp, #52           ; |
+  b8:   e8bc000f        ldm     ip!, {r0, r1, r2, r3} ; |
+  bc:   e8ae000f        stmia   lr!, {r0, r1, r2, r3} ; | arg 2 (struct C)
+  c0:   e89c0003        ldm     ip, {r0, r1}          ; |
+  c4:   e88e0003        stm     lr, {r0, r1}          ; /
+  c8:   e1a0200d        mov     r2, sp                ; \                    |
+  cc:   e24b3014        sub     r3, fp, #20           ; |                    | via stack (second half)
+  d0:   e8930003        ldm     r3, {r0, r1}          ; |                    |
+  d4:   e8820003        stm     r2, {r0, r1}          ; | arg 1 (struct B), split via regs and stack as 2 words each
+  d8:   e24b301c        sub     r3, fp, #28           ; |
+  dc:   e893000c        ldm     r3, {r2, r3}          ; /                    via regs (first half)
+  e0:   e24b100c        sub     r1, fp, #12           ; \
+  e4:   e8910003        ldm     r1, {r0, r1}          ; | arg 0 (struct A), via regs as 2 words
+  e8:   ebfffffe        bl      0 <leaf_call>         ; return address -> r14/lr, and call
+  ec:   e3a03000        mov     r3, #0                ; return value (0) via r3 ... (a bit suboptimal)
+  f0:   e1a00003        mov     r0, r3                ; ... to r0
+  f4:   e24bd004        sub     sp, fp, #4            ; |
+  f8:   e8bd8800        pop     {fp, pc}              ; | epilog
+  fc:   00000000        .word   0x00000000            ; 0
+ 100:   00000008        .word   0x00000008            ; 8
+ 104:   00000018        .word   0x00000018            ; 24
+ 108:   00000030        .word   0x00000030            ; 48
+
+
+
+; ---------- passing 3-field fp-only struct (HFA) which is bigger than 16 bytes ---------->
+;
+; struct A { double a, b, c; }; /* bigger than 16b */
+;
+; void leaf_call(struct A a)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct A){1.,2.,3.});
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}                  ;
+   4:   e28db000        add     fp, sp, #0            ;
+   8:   e24dd01c        sub     sp, sp, #28           ;
+   c:   eeb05b40        vmov.f64        d5, d0        ;
+  10:   eeb06b41        vmov.f64        d6, d1        ;
+  14:   eeb07b42        vmov.f64        d7, d2        ;
+  18:   ed0b5b07        vstr    d5, [fp, #-28]        ;
+  1c:   ed0b6b05        vstr    d6, [fp, #-20]        ;
+  20:   ed0b7b03        vstr    d7, [fp, #-12]        ;
+  24:   e1a00000        nop                           ;
+  28:   e28bd000        add     sp, fp, #0            ;
+  2c:   e49db004        pop     {fp}                  ;
+  30:   e12fff1e        bx      lr                    ;
+
+00000034 <main>:
+  34:   e92d4800        push    {fp, lr}              ;
+  38:   e28db004        add     fp, sp, #4            ;
+  3c:   e24dd018        sub     sp, sp, #24           ;
+  40:   e59f3040        ldr     r3, [pc, #64]         ;
+  44:   e24bc01c        sub     ip, fp, #28           ;
+  48:   e1a0e003        mov     lr, r3                ;
+  4c:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  50:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  54:   e89e0003        ldm     lr, {r0, r1}          ;
+  58:   e88c0003        stm     ip, {r0, r1}          ;
+  5c:   ed1b5b07        vldr    d5, [fp, #-28]        ;
+  60:   ed1b6b05        vldr    d6, [fp, #-20]        ;
+  64:   ed1b7b03        vldr    d7, [fp, #-12]        ;
+  68:   eeb00b45        vmov.f64        d0, d5        ; |
+  6c:   eeb01b46        vmov.f64        d1, d6        ; | arg 0, via fpregs
+  70:   eeb02b47        vmov.f64        d2, d7        ; |
+  74:   ebfffffe        bl      0 <leaf_call>         ;
+  78:   e3a03000        mov     r3, #0                ;
+  7c:   e1a00003        mov     r0, r3                ;
+  80:   e24bd004        sub     sp, fp, #4            ;
+  84:   e8bd8800        pop     {fp, pc}              ;
+  88:   00000000        .word   0x00000000            ;
+
+
+
+; ---------- passing 5-field fp-only struct (HVA) ---------->
+;
+; struct A { double a, b, c, d, e; };
+;
+; void leaf_call(struct A a)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct A){1.,2.,3.,4.,5.});
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e24dd010        sub     sp, sp, #16
+   4:   e52db004        push    {fp}
+   8:   e28db000        add     fp, sp, #0
+   c:   e28bc004        add     ip, fp, #4
+  10:   e88c000f        stm     ip, {r0, r1, r2, r3}
+  14:   e1a00000        nop
+  18:   e28bd000        add     sp, fp, #0
+  1c:   e49db004        pop     {fp}
+  20:   e28dd010        add     sp, sp, #16
+  24:   e12fff1e        bx      lr
+
+00000028 <main>:
+  28:   e92d4800        push    {fp, lr}              ;
+  2c:   e28db004        add     fp, sp, #4            ;
+  30:   e24dd040        sub     sp, sp, #64           ;
+  34:   e59f3050        ldr     r3, [pc, #80]         ;
+  38:   e24bc02c        sub     ip, fp, #44           ;
+  3c:   e1a0e003        mov     lr, r3                ;
+  40:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  44:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  48:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  4c:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  50:   e89e0003        ldm     lr, {r0, r1}          ;
+  54:   e88c0003        stm     ip, {r0, r1}          ;
+  58:   e1a0e00d        mov     lr, sp                ;
+  5c:   e24bc01c        sub     ip, fp, #28           ;
+  60:   e8bc000f        ldm     ip!, {r0, r1, r2, r3} ;
+  64:   e8ae000f        stmia   lr!, {r0, r1, r2, r3} ;
+  68:   e89c0003        ldm     ip, {r0, r1}          ;
+  6c:   e88e0003        stm     lr, {r0, r1}          ;
+  70:   e24b302c        sub     r3, fp, #44           ;
+  74:   e893000f        ldm     r3, {r0, r1, r2, r3}  ; arg 0's a and b passed in int regs, as more than 4 fields, and splitting is allowed
+  78:   ebfffffe        bl      0 <leaf_call>         ;
+  7c:   e3a03000        mov     r3, #0                ;
+  80:   e1a00003        mov     r0, r3                ;
+  84:   e24bd004        sub     sp, fp, #4            ;
+  88:   e8bd8800        pop     {fp, pc}              ;
+  8c:   00000000        .word   0x00000000            ;
+
+
+
+; ---------- returning struct with 4 only-fp fields (HFA) by value ---------->
+;
+; struct A { double a, b, c, d; };
+;
+; struct A leaf_call()
+; {
+;         return (struct A){1.,2.,3.,4.};
+; }
+;
+; int main()
+; {
+;     leaf_call();
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e92d48f0        push    {r4, r5, r6, r7, fp, lr} ;
+   4:   e28db014        add     fp, sp, #20              ;
+   8:   e24dd060        sub     sp, sp, #96              ;
+   c:   e59f304c        ldr     r3, [pc, #76]            ;
+  10:   e24bc034        sub     ip, fp, #52              ;
+  14:   e1a0e003        mov     lr, r3                   ;
+  18:   e8be000f        ldm     lr!, {r0, r1, r2, r3}    ;
+  1c:   e8ac000f        stmia   ip!, {r0, r1, r2, r3}    ;
+  20:   e89e000f        ldm     lr, {r0, r1, r2, r3}     ;
+  24:   e88c000f        stm     ip, {r0, r1, r2, r3}     ;
+  28:   e14b63d4        ldrd    r6, [fp, #-52]           ;
+  2c:   e14b42dc        ldrd    r4, [fp, #-44]           ;
+  30:   e14b02d4        ldrd    r0, [fp, #-36]           ;
+  34:   e14b21dc        ldrd    r2, [fp, #-28]           ;
+  38:   ec476b14        vmov    d4, r6, r7               ;
+  3c:   ec454b15        vmov    d5, r4, r5               ;
+  40:   ec410b16        vmov    d6, r0, r1               ;
+  44:   ec432b17        vmov    d7, r2, r3               ;
+  48:   eeb00b44        vmov.f64        d0, d4           ; |
+  4c:   eeb01b45        vmov.f64        d1, d5           ; |
+  50:   eeb02b46        vmov.f64        d2, d6           ; | return value via regs
+  54:   eeb03b47        vmov.f64        d3, d7           ; |
+  58:   e24bd014        sub     sp, fp, #20              ;
+  5c:   e8bd88f0        pop     {r4, r5, r6, r7, fp, pc} ;
+  60:   00000000        .word   0x00000000               ;
+
+00000064 <main>:
+  64:   e92d4800        push    {fp, lr}
+  68:   e28db004        add     fp, sp, #4
+  6c:   ebfffffe        bl      0 <leaf_call>
+  70:   e3a03000        mov     r3, #0
+  74:   e1a00003        mov     r0, r3
+  78:   e8bd8800        pop     {fp, pc}
+
+
+
+; ---------- returning struct with 5 only-fp fields (HVA) by value, not returned via regs ---------->
+;
+; struct A { double a, b, c, d, e; };
+;
+; struct A leaf_call()
+; {
+;         return (struct A){1.,2.,3.,4.,5.};
+; }
+;
+; int main()
+; {
+;     leaf_call();
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e92d4800        push    {fp, lr}              ;
+   4:   e28db004        add     fp, sp, #4            ;
+   8:   e24dd030        sub     sp, sp, #48           ;
+   c:   e50b0030        str     r0, [fp, #-48]        ;
+  10:   e51b3030        ldr     r3, [fp, #-48]        ;
+  14:   e59f2028        ldr     r2, [pc, #40]         ;
+  18:   e1a0c003        mov     ip, r3                ;
+  1c:   e1a0e002        mov     lr, r2                ;
+  20:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  24:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  28:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  2c:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  30:   e89e0003        ldm     lr, {r0, r1}          ;
+  34:   e88c0003        stm     ip, {r0, r1}          ;
+  38:   e51b0030        ldr     r0, [fp, #-48]        ; hidden arg ptr returned in r0
+  3c:   e24bd004        sub     sp, fp, #4            ;
+  40:   e8bd8800        pop     {fp, pc}              ;
+  44:   00000000        .word   0x00000000            ;
+
+00000048 <main>:
+  48:   e92d4800        push    {fp, lr}              ;
+  4c:   e28db004        add     fp, sp, #4            ;
+  50:   e24dd028        sub     sp, sp, #40           ;
+  54:   e24b302c        sub     r3, fp, #44           ;
+  58:   e1a00003        mov     r0, r3                ; hidden first arg, ptr to retval struct data
+  5c:   ebfffffe        bl      0 <leaf_call>         ;
+  60:   e3a03000        mov     r3, #0                ;
+  64:   e1a00003        mov     r0, r3                ;
+  68:   e24bd004        sub     sp, fp, #4            ;
+  6c:   e8bd8800        pop     {fp, pc}              ;
+
+
+
+; ---------- returning struct of 4b via reg ---------->
+;
+;
+; struct A { short a, b; };
+;
+; struct A leaf_call()
+; {
+;         return (struct A){1,2};
+; }
+;
+; int main()
+; {
+;     leaf_call();
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd00c        sub     sp, sp, #12
+   c:   e59f3040        ldr     r3, [pc, #64]
+  10:   e5933000        ldr     r3, [r3]
+  14:   e50b3008        str     r3, [fp, #-8]
+  18:   e3a03000        mov     r3, #0
+  1c:   e15b20b8        ldrh    r2, [fp, #-8]
+  20:   e1a02802        lsl     r2, r2, #16
+  24:   e1a03823        lsr     r3, r3, #16
+  28:   e1833002        orr     r3, r3, r2
+  2c:   e1a03863        ror     r3, r3, #16
+  30:   e15b20b6        ldrh    r2, [fp, #-6]
+  34:   e1a02802        lsl     r2, r2, #16
+  38:   e1a03803        lsl     r3, r3, #16
+  3c:   e1a03823        lsr     r3, r3, #16
+  40:   e1833002        orr     r3, r3, r2
+  44:   e1a00003        mov     r0, r3
+  48:   e28bd000        add     sp, fp, #0
+  4c:   e49db004        pop     {fp}
+  50:   e12fff1e        bx      lr
+  54:   00000000        .word   0x00000000
+
+00000058 <main>:
+  58:   e92d4800        push    {fp, lr}
+  5c:   e28db004        add     fp, sp, #4
+  60:   ebfffffe        bl      0 <leaf_call>
+  64:   e3a03000        mov     r3, #0
+  68:   e1a00003        mov     r0, r3
+  6c:   e8bd8800        pop     {fp, pc}
+
+
+
+; ---------- returning struct of > 4b indirectly via hidden pointer arg ---------->
+;
+; struct A { short a, b, c; };
+;
+; struct A leaf_call()
+; {
+;         return (struct A){1,2,3};
+; }
+;
+; int main()
+; {
+;     leaf_call();
+;     return 0;
+; }
+
+
+
+; output from raspbian-11-armelhf w/ gcc 10.2.1
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd014        sub     sp, sp, #20
+   c:   e50b0010        str     r0, [fp, #-16]
+  10:   e51b3010        ldr     r3, [fp, #-16]
+  14:   e59f201c        ldr     r2, [pc, #28]
+  18:   e5920000        ldr     r0, [r2]
+  1c:   e5830000        str     r0, [r3]
+  20:   e1d220b4        ldrh    r2, [r2, #4]
+  24:   e1c320b4        strh    r2, [r3, #4]
+  28:   e51b0010        ldr     r0, [fp, #-16]
+  2c:   e28bd000        add     sp, fp, #0
+  30:   e49db004        pop     {fp}
+  34:   e12fff1e        bx      lr
+  38:   00000000        .word   0x00000000
+
+0000003c <main>:
+  3c:   e92d4800        push    {fp, lr}
+  40:   e28db004        add     fp, sp, #4
+  44:   e24dd008        sub     sp, sp, #8
+  48:   e24b300c        sub     r3, fp, #12
+  4c:   e1a00003        mov     r0, r3
+  50:   ebfffffe        bl      0 <leaf_call>
+  54:   e3a03000        mov     r3, #0
+  58:   e1a00003        mov     r0, r3
+  5c:   e24bd004        sub     sp, fp, #4
+  60:   e8bd8800        pop     {fp, pc}
+
+
+
 ; vim: ft=asm68k
 
--- a/doc/disas_examples/arm.atpcs_arm.disas	Tue Mar 01 21:02:10 2022 +0100
+++ b/doc/disas_examples/arm.atpcs_arm.disas	Wed Mar 02 17:30:51 2022 +0100
@@ -187,5 +187,567 @@
  150:   e1a0d00b        mov     sp, fp
  154:   e8bd8800        pop     {fp, pc}
 
+
+
+; ---------- structs by value ---------->
+;
+; struct A { int i, j; long long l; };
+;
+; void leaf_call(int b, int c, int d, int e, struct A f, int g, int h)
+; {
+; }
+;
+; void nonleaf_call(int a, int b, int c, int d, int e, struct A f, int g, int h)
+; {
+;     /* use some local data */
+;     char l[100] ={ 'L'};
+;     leaf_call(b, c, d, e, f, g, h);
+; }
+;
+; int main()
+; {
+;     nonleaf_call(0, 1, 2, 3, 4, (struct A){5, 6, 7ll}, 8, 9);
+;     return 0;
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}            ; (str fp, [sp, #-4]!)
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd014        sub     sp, sp, #20
+   c:   e50b0008        str     r0, [fp, #-8]
+  10:   e50b100c        str     r1, [fp, #-12]
+  14:   e50b2010        str     r2, [fp, #-16]
+  18:   e50b3014        str     r3, [fp, #-20]
+  1c:   e28bd000        add     sp, fp, #0
+  20:   e8bd0800        pop     {fp}
+  24:   e12fff1e        bx      lr
+
+00000028 <nonleaf_call>:
+  28:   e92d4800        push    {fp, lr}
+  2c:   e28db004        add     fp, sp, #4
+  30:   e24dd090        sub     sp, sp, #144    ; 0x90
+  34:   e50b0070        str     r0, [fp, #-112] ; 0x70
+  38:   e50b1074        str     r1, [fp, #-116] ; 0x74
+  3c:   e50b2078        str     r2, [fp, #-120] ; 0x78
+  40:   e50b307c        str     r3, [fp, #-124] ; 0x7c
+  44:   e24b2068        sub     r2, fp, #104    ; 0x68
+  48:   e3a03064        mov     r3, #100        ; 0x64
+  4c:   e1a00002        mov     r0, r2
+  50:   e3a01000        mov     r1, #0
+  54:   e1a02003        mov     r2, r3
+  58:   ebfffffe        bl      0 <memset>
+  5c:   e3a0304c        mov     r3, #76 ; 0x4c
+  60:   e54b3068        strb    r3, [fp, #-104] ; 0x68
+  64:   e1a0c00d        mov     ip, sp
+  68:   e28b300c        add     r3, fp, #12
+  6c:   e893000f        ldm     r3, {r0, r1, r2, r3}
+  70:   e88c000f        stm     ip, {r0, r1, r2, r3}
+  74:   e59b301c        ldr     r3, [fp, #28]
+  78:   e58d3010        str     r3, [sp, #16]
+  7c:   e59b3020        ldr     r3, [fp, #32]
+  80:   e58d3014        str     r3, [sp, #20]
+  84:   e51b0074        ldr     r0, [fp, #-116] ; 0x74
+  88:   e51b1078        ldr     r1, [fp, #-120] ; 0x78
+  8c:   e51b207c        ldr     r2, [fp, #-124] ; 0x7c
+  90:   e59b3004        ldr     r3, [fp, #4]
+  94:   ebfffffe        bl      0 <leaf_call>
+  98:   e24bd004        sub     sp, fp, #4
+  9c:   e8bd4800        pop     {fp, lr}
+  a0:   e12fff1e        bx      lr
+
+000000a4 <main>:
+  a4:   e92d4800        push    {fp, lr}             ;
+  a8:   e28db004        add     fp, sp, #4           ;
+  ac:   e24dd030        sub     sp, sp, #48          ;
+  b0:   e59f3058        ldr     r3, [pc, #88]        ;
+  b4:   e24bc014        sub     ip, fp, #20          ;
+  b8:   e893000f        ldm     r3, {r0, r1, r2, r3} ;
+  bc:   e88c000f        stm     ip, {r0, r1, r2, r3} ;
+  c0:   e3a03004        mov     r3, #4               ; arg 4, ...
+  c4:   e58d3000        str     r3, [sp]             ; ... pushed onto stack
+  c8:   e28dc008        add     ip, sp, #8           ; prep arg 5, write ptr ...
+  cc:   e24b3014        sub     r3, fp, #20          ; ... read ptr (local area)
+  d0:   e893000f        ldm     r3, {r0, r1, r2, r3} ; arg 5 (struct A), ...
+  d4:   e88c000f        stm     ip, {r0, r1, r2, r3} ; ... pushed onto stack (as 4 words)
+  d8:   e3a03008        mov     r3, #8               ; arg 6, ...
+  dc:   e58d3018        str     r3, [sp, #24]        ; ... pushed onto stack
+  e0:   e3a03009        mov     r3, #9               ; arg 7, ...
+  e4:   e58d301c        str     r3, [sp, #28]        ; ... pushed onto stack
+  e8:   e3a00000        mov     r0, #0               ; arg 0
+  ec:   e3a01001        mov     r1, #1               ; arg 1
+  f0:   e3a02002        mov     r2, #2               ; arg 2
+  f4:   e3a03003        mov     r3, #3               ; arg 3
+  f8:   ebfffffe        bl      28 <nonleaf_call>    ;
+  fc:   e3a03000        mov     r3, #0               ;
+ 100:   e1a00003        mov     r0, r3               ;
+ 104:   e24bd004        sub     sp, fp, #4           ;
+ 108:   e8bd4800        pop     {fp, lr}             ;
+ 10c:   e12fff1e        bx      lr                   ;
+ 110:   00000000        .word   0x00000000
+
+
+
+; ---------- structs by value, complex example (multiple structs) ---------->
+;
+; struct A { int i, j; float f; };
+; struct B { double d; long long l; };
+;
+; void leaf_call(int b, struct A c, struct B d, int e, int f, struct A g, struct B h, int i, int j)
+; {
+; }
+;
+; void nonleaf_call(int a, int b, struct A c, struct B d, int e, int f, struct A g, struct B h, int i, int j)
+; {
+;     /* use some local data */
+;     char l[100] ={ 'L'};
+;     leaf_call(b, c, d, e, f, g, h, i, j);
+; }
+;
+; int main()
+; {
+;     nonleaf_call(0, 1, (struct A){2, 3, 4.f}, (struct B){5., 6ll}, 7, 8, (struct A){9, 10, 11.f}, (struct B){12., 13ll}, 14, 15);
+;     return 0;
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}            ; (str fp, [sp, #-4]!)
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd014        sub     sp, sp, #20
+   c:   e50b0008        str     r0, [fp, #-8]
+  10:   e24b0014        sub     r0, fp, #20
+  14:   e880000e        stm     r0, {r1, r2, r3}
+  18:   e28bd000        add     sp, fp, #0
+  1c:   e8bd0800        pop     {fp}
+  20:   e12fff1e        bx      lr
+
+00000024 <nonleaf_call>:
+  24:   e24dd008        sub     sp, sp, #8
+  28:   e92d4800        push    {fp, lr}
+  2c:   e28db004        add     fp, sp, #4
+  30:   e24dd0b0        sub     sp, sp, #176    ; 0xb0
+  34:   e50b0070        str     r0, [fp, #-112] ; 0x70
+  38:   e50b1074        str     r1, [fp, #-116] ; 0x74
+  3c:   e28b1004        add     r1, fp, #4
+  40:   e881000c        stm     r1, {r2, r3}
+  44:   e24b2068        sub     r2, fp, #104    ; 0x68
+  48:   e3a03064        mov     r3, #100        ; 0x64
+  4c:   e1a00002        mov     r0, r2
+  50:   e3a01000        mov     r1, #0
+  54:   e1a02003        mov     r2, r3
+  58:   ebfffffe        bl      0 <memset>
+  5c:   e3a0304c        mov     r3, #76 ; 0x4c
+  60:   e54b3068        strb    r3, [fp, #-104] ; 0x68
+  64:   e1a0c00d        mov     ip, sp
+  68:   e28b3014        add     r3, fp, #20
+  6c:   e893000f        ldm     r3, {r0, r1, r2, r3}
+  70:   e88c000f        stm     ip, {r0, r1, r2, r3}
+  74:   e59b3024        ldr     r3, [fp, #36]   ; 0x24
+  78:   e58d3010        str     r3, [sp, #16]
+  7c:   e59b3028        ldr     r3, [fp, #40]   ; 0x28
+  80:   e58d3014        str     r3, [sp, #20]
+  84:   e28dc018        add     ip, sp, #24
+  88:   e28b302c        add     r3, fp, #44     ; 0x2c
+  8c:   e8930007        ldm     r3, {r0, r1, r2}
+  90:   e88c0007        stm     ip, {r0, r1, r2}
+  94:   e28dc028        add     ip, sp, #40     ; 0x28
+  98:   e28b303c        add     r3, fp, #60     ; 0x3c
+  9c:   e893000f        ldm     r3, {r0, r1, r2, r3}
+  a0:   e88c000f        stm     ip, {r0, r1, r2, r3}
+  a4:   e59b304c        ldr     r3, [fp, #76]   ; 0x4c
+  a8:   e58d3038        str     r3, [sp, #56]   ; 0x38
+  ac:   e59b3050        ldr     r3, [fp, #80]   ; 0x50
+  b0:   e58d303c        str     r3, [sp, #60]   ; 0x3c
+  b4:   e51b0074        ldr     r0, [fp, #-116] ; 0x74
+  b8:   e28b3004        add     r3, fp, #4
+  bc:   e893000e        ldm     r3, {r1, r2, r3}
+  c0:   ebfffffe        bl      0 <leaf_call>
+  c4:   e24bd004        sub     sp, fp, #4
+  c8:   e8bd4800        pop     {fp, lr}
+  cc:   e28dd008        add     sp, sp, #8
+  d0:   e12fff1e        bx      lr
+
+000000d4 <main>:
+  d4:   e92d4800        push    {fp, lr}
+  d8:   e28db004        add     fp, sp, #4
+  dc:   e24dd088        sub     sp, sp, #136    ; 0x88
+  e0:   e59f20b8        ldr     r2, [pc, #184]  ; 1a0 <main+0xcc>
+  e4:   e24b3040        sub     r3, fp, #64     ; 0x40
+  e8:   e8920007        ldm     r2, {r0, r1, r2}
+  ec:   e8830007        stm     r3, {r0, r1, r2}
+  f0:   e59f30ac        ldr     r3, [pc, #172]  ; 1a4 <main+0xd0>
+  f4:   e24bc034        sub     ip, fp, #52     ; 0x34
+  f8:   e893000f        ldm     r3, {r0, r1, r2, r3}
+  fc:   e88c000f        stm     ip, {r0, r1, r2, r3}
+ 100:   e59f20a0        ldr     r2, [pc, #160]  ; 1a8 <main+0xd4>
+ 104:   e24b3020        sub     r3, fp, #32
+ 108:   e8920007        ldm     r2, {r0, r1, r2}
+ 10c:   e8830007        stm     r3, {r0, r1, r2}
+ 110:   e59f3094        ldr     r3, [pc, #148]  ; 1ac <main+0xd8>
+ 114:   e24bc014        sub     ip, fp, #20
+ 118:   e893000f        ldm     r3, {r0, r1, r2, r3}
+ 11c:   e88c000f        stm     ip, {r0, r1, r2, r3}
+ 120:   e28dc008        add     ip, sp, #8
+ 124:   e24b3034        sub     r3, fp, #52     ; 0x34
+ 128:   e893000f        ldm     r3, {r0, r1, r2, r3}
+ 12c:   e88c000f        stm     ip, {r0, r1, r2, r3}
+ 130:   e3a03007        mov     r3, #7
+ 134:   e58d3018        str     r3, [sp, #24]
+ 138:   e3a03008        mov     r3, #8
+ 13c:   e58d301c        str     r3, [sp, #28]
+ 140:   e28dc020        add     ip, sp, #32
+ 144:   e24b3020        sub     r3, fp, #32
+ 148:   e8930007        ldm     r3, {r0, r1, r2}
+ 14c:   e88c0007        stm     ip, {r0, r1, r2}
+ 150:   e28dc030        add     ip, sp, #48     ; 0x30
+ 154:   e24b3014        sub     r3, fp, #20
+ 158:   e893000f        ldm     r3, {r0, r1, r2, r3}
+ 15c:   e88c000f        stm     ip, {r0, r1, r2, r3}
+ 160:   e3a0300e        mov     r3, #14
+ 164:   e58d3040        str     r3, [sp, #64]   ; 0x40
+ 168:   e3a0300f        mov     r3, #15
+ 16c:   e58d3044        str     r3, [sp, #68]   ; 0x44
+ 170:   e51b3038        ldr     r3, [fp, #-56]  ; 0x38
+ 174:   e58d3000        str     r3, [sp]
+ 178:   e24b3040        sub     r3, fp, #64     ; 0x40
+ 17c:   e893000c        ldm     r3, {r2, r3}
+ 180:   e3a00000        mov     r0, #0
+ 184:   e3a01001        mov     r1, #1
+ 188:   ebfffffe        bl      24 <nonleaf_call>
+ 18c:   e3a03000        mov     r3, #0
+ 190:   e1a00003        mov     r0, r3
+ 194:   e24bd004        sub     sp, fp, #4
+ 198:   e8bd4800        pop     {fp, lr}
+ 19c:   e12fff1e        bx      lr
+ 1a0:   00000000        .word   0x00000000
+ 1a4:   00000010        .word   0x00000010
+ 1a8:   00000020        .word   0x00000020
+ 1ac:   00000030        .word   0x00000030
+
+
+
+; ---------- returning long long ---------->
+;
+; long long f()
+; {
+;     return 7171LL;
+; }
+;
+; int main()
+; {
+;     return (int)f();
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <f>:
+   0:   e92d0810        push    {r4, fp}   ;
+   4:   e28db004        add     fp, sp, #4 ;
+   8:   e3a03b07        mov     r3, #7168  ;
+   c:   e2833003        add     r3, r3, #3 ;
+  10:   e3a04000        mov     r4, #0     ;
+  14:   e1a00003        mov     r0, r3     ; | retval in two regs
+  18:   e1a01004        mov     r1, r4     ; |
+  1c:   e24bd004        sub     sp, fp, #4 ;
+  20:   e8bd0810        pop     {r4, fp}   ;
+  24:   e12fff1e        bx      lr         ;
+
+00000028 <main>:
+  28:   e92d4830        push    {r4, r5, fp, lr}
+  2c:   e28db00c        add     fp, sp, #12
+  30:   ebfffffe        bl      0 <f>
+  34:   e1a03000        mov     r3, r0
+  38:   e1a04001        mov     r4, r1
+  3c:   e1a00003        mov     r0, r3
+  40:   e24bd00c        sub     sp, fp, #12
+  44:   e8bd4830        pop     {r4, r5, fp, lr}
+  48:   e12fff1e        bx      lr
+
+
+
+; ---------- passing structs with only fp parts ---------->
+;
+; struct A { float a; };
+; struct B { float a, b; };
+; struct C { float a, b, c; };
+; struct D { double a; };
+; struct E { double a, b; };
+; struct F { double a, b, c; };
+;
+; void leaf_call(struct A a, struct B b, struct C c, struct D d, struct E e, struct F f)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct A){1.f}, (struct B){2.f,3.f}, (struct C){4.f,5.f,6.f}, (struct D){1.}, (struct E){2.,3.}, (struct F){4.,5.,6.});
+;     return 0;
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <leaf_call>:
+   0:   e24dd008        sub     sp, sp, #8
+   4:   e52db004        push    {fp}            ; (str fp, [sp, #-4]!)
+   8:   e28db000        add     fp, sp, #0
+   c:   e24dd014        sub     sp, sp, #20
+  10:   e50b0008        str     r0, [fp, #-8]
+  14:   e24b0010        sub     r0, fp, #16
+  18:   e8800006        stm     r0, {r1, r2}
+  1c:   e58b3008        str     r3, [fp, #8]
+  20:   e28bd000        add     sp, fp, #0
+  24:   e8bd0800        pop     {fp}
+  28:   e28dd008        add     sp, sp, #8
+  2c:   e12fff1e        bx      lr
+
+00000030 <main>:
+  30:   e92d4810        push    {r4, fp, lr}          ;
+  34:   e28db008        add     fp, sp, #8            ;
+  38:   e24dd084        sub     sp, sp, #132          ;
+  3c:   e59f30d0        ldr     r3, [pc, #208]        ;
+  40:   e50b3054        str     r3, [fp, #-84]        ;
+  44:   e59f20cc        ldr     r2, [pc, #204]        ;
+  48:   e24b3050        sub     r3, fp, #80           ;
+  4c:   e8920003        ldm     r2, {r0, r1}          ;
+  50:   e8830003        stm     r3, {r0, r1}          ;
+  54:   e59f20c0        ldr     r2, [pc, #192]        ;
+  58:   e24b3048        sub     r3, fp, #72           ;
+  5c:   e8920007        ldm     r2, {r0, r1, r2}      ;
+  60:   e8830007        stm     r3, {r0, r1, r2}      ;
+  64:   e3a03000        mov     r3, #0                ;
+  68:   e3a045ff        mov     r4, #1069547520       ;
+  6c:   e2844603        add     r4, r4, #3145728      ;
+  70:   e50b303c        str     r3, [fp, #-60]        ;
+  74:   e50b4038        str     r4, [fp, #-56]        ;
+  78:   e59f30a0        ldr     r3, [pc, #160]        ;
+  7c:   e24bc034        sub     ip, fp, #52           ;
+  80:   e893000f        ldm     r3, {r0, r1, r2, r3}  ;
+  84:   e88c000f        stm     ip, {r0, r1, r2, r3}  ;
+  88:   e59f3094        ldr     r3, [pc, #148]        ;
+  8c:   e24bc024        sub     ip, fp, #36           ;
+  90:   e1a0e003        mov     lr, r3                ;
+  94:   e8be000f        ldm     lr!, {r0, r1, r2, r3} ;
+  98:   e8ac000f        stmia   ip!, {r0, r1, r2, r3} ;
+  9c:   e89e0003        ldm     lr, {r0, r1}          ;
+  a0:   e88c0003        stm     ip, {r0, r1}          ;
+  a4:   e24b403c        sub     r4, fp, #60           ;
+  a8:   e8940018        ldm     r4, {r3, r4}          ;
+  ac:   e58d3008        str     r3, [sp, #8]          ;
+  b0:   e58d400c        str     r4, [sp, #12]         ;
+  b4:   e28dc010        add     ip, sp, #16           ;
+  b8:   e24b3034        sub     r3, fp, #52           ;
+  bc:   e893000f        ldm     r3, {r0, r1, r2, r3}  ;
+  c0:   e88c000f        stm     ip, {r0, r1, r2, r3}  ;
+  c4:   e28de020        add     lr, sp, #32           ;
+  c8:   e24bc024        sub     ip, fp, #36           ;
+  cc:   e8bc000f        ldm     ip!, {r0, r1, r2, r3} ;
+  d0:   e8ae000f        stmia   lr!, {r0, r1, r2, r3} ;
+  d4:   e89c0003        ldm     ip, {r0, r1}          ;
+  d8:   e88e0003        stm     lr, {r0, r1}          ;
+  dc:   e1a0300d        mov     r3, sp                ; |              write ptr
+  e0:   e24b2044        sub     r2, fp, #68           ; |              read ptr
+  e4:   e8920003        ldm     r2, {r0, r1}          ; | arg 2 (struct C), split via  |
+  e8:   e8830003        stm     r3, {r0, r1}          ; |                              | stack    b & c
+  ec:   e51b3048        ldr     r3, [fp, #-72]        ; |                              reg        a
+  f0:   e51b0054        ldr     r0, [fp, #-84]        ; arg 0 (struct A), via reg as word         a
+  f4:   e24b2050        sub     r2, fp, #80           ; |
+  f8:   e8920006        ldm     r2, {r1, r2}          ; | arg 1 (struct B), via reg as 2 words    a & b
+  fc:   ebfffffe        bl      0 <leaf_call>         ;
+ 100:   e3a03000        mov     r3, #0                ;
+ 104:   e1a00003        mov     r0, r3                ;
+ 108:   e24bd008        sub     sp, fp, #8            ;
+ 10c:   e8bd4810        pop     {r4, fp, lr}          ;
+ 110:   e12fff1e        bx      lr                    ;
+ 114:   3f800000        .word   0x3f800000
+ 118:   00000000        .word   0x00000000
+ 11c:   00000008        .word   0x00000008
+ 120:   00000018        .word   0x00000018
+ 124:   00000028        .word   0x00000028
+
+
+
+; ---------- returning structs by value ---------->
+;
+; struct Small { char x; };
+; struct Big { long long i; long j; }; /* bigger than a word */
+;
+; struct Small f0()
+; {
+;     struct Small s = { 132 };
+;     return s;
+; }
+;
+; struct Big f1()
+; {
+;     struct Big b = { 7171LL, 232 };
+;     return b;
+; }
+;
+; int main()
+; {
+;     struct Small s = f0();
+;     struct Big b = f1();
+;     return b.j + s.x;
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <f0>:
+   0:   e52db004        push    {fp}                 ;
+   4:   e28db000        add     fp, sp, #0           ;
+   8:   e24dd00c        sub     sp, sp, #12          ;
+   c:   e3e0307b        mvn     r3, #123             ;
+  10:   e54b3008        strb    r3, [fp, #-8]        ;
+  14:   e55b3008        ldrb    r3, [fp, #-8]        ;
+  18:   e1a00003        mov     r0, r3               ; return value (via reg as <= 4b)
+  1c:   e28bd000        add     sp, fp, #0           ;
+  20:   e8bd0800        pop     {fp}                 ;
+  24:   e12fff1e        bx      lr                   ;
+
+00000028 <f1>:
+  28:   e92d0810        push    {r4, fp}             ;
+  2c:   e28db004        add     fp, sp, #4           ;
+  30:   e24dd010        sub     sp, sp, #16          ;
+  34:   e1a0c000        mov     ip, r0               ; ptr to retval space, tmp copy
+  38:   e59f3028        ldr     r3, [pc, #40]        ;
+  3c:   e24b4014        sub     r4, fp, #20          ;
+  40:   e893000f        ldm     r3, {r0, r1, r2, r3} ;
+  44:   e884000f        stm     r4, {r0, r1, r2, r3} ;
+  48:   e1a0400c        mov     r4, ip               ; write ptr
+  4c:   e24b3014        sub     r3, fp, #20          ; read ptr
+  50:   e893000f        ldm     r3, {r0, r1, r2, r3} ; |
+  54:   e884000f        stm     r4, {r0, r1, r2, r3} ; | copy struct from local area to retval space
+  58:   e1a0000c        mov     r0, ip               ; return value: ptr to retval space
+  5c:   e24bd004        sub     sp, fp, #4           ;
+  60:   e8bd0810        pop     {r4, fp}             ;
+  64:   e12fff1e        bx      lr                   ;
+  68:   00000000        .word   0x00000000
+
+0000006c <main>:
+  6c:   e92d4800        push    {fp, lr}
+  70:   e28db004        add     fp, sp, #4
+  74:   e24dd018        sub     sp, sp, #24
+  78:   ebfffffe        bl      0 <f0>
+  7c:   e1a03000        mov     r3, r0
+  80:   e54b3008        strb    r3, [fp, #-8]
+  84:   e24b301c        sub     r3, fp, #28
+  88:   e1a00003        mov     r0, r3
+  8c:   ebfffffe        bl      28 <f1>
+  90:   e51b2014        ldr     r2, [fp, #-20]
+  94:   e55b3008        ldrb    r3, [fp, #-8]
+  98:   e0823003        add     r3, r2, r3
+  9c:   e1a00003        mov     r0, r3
+  a0:   e24bd004        sub     sp, fp, #4
+  a4:   e8bd4800        pop     {fp, lr}
+  a8:   e12fff1e        bx      lr
+
+
+
+; ---------- single-field structs by values (and small array fields) ---------->
+;
+; struct C { char c; };
+; struct S { short s; };
+; struct I { int i; };
+; struct F { float f; };
+; struct D { double d; };
+;
+; struct C2 { char c[2]; };
+; struct C3 { char c[3]; };
+;
+; void leaf_call(struct C2 a, struct C b, struct S c, struct I d, struct F e, struct D f, struct C3 g)
+; {
+; }
+;
+; int main()
+; {
+;     leaf_call((struct C2){{0,1}}, (struct C){2}, (struct S){3}, (struct I){4}, (struct F){5.f}, (struct D){6.}, (struct C3){{7,8,9}});
+;     return 0;
+; }
+
+
+
+; output from debian-6.0.8-armel w/ gcc 4.4.5
+
+00000000 <leaf_call>:
+   0:   e52db004        push    {fp}            ; (str fp, [sp, #-4]!)
+   4:   e28db000        add     fp, sp, #0
+   8:   e24dd014        sub     sp, sp, #20
+   c:   e14b00b8        strh    r0, [fp, #-8]
+  10:   e54b100c        strb    r1, [fp, #-12]
+  14:   e14b21b0        strh    r2, [fp, #-16]
+  18:   e50b3014        str     r3, [fp, #-20]
+  1c:   e28bd000        add     sp, fp, #0
+  20:   e8bd0800        pop     {fp}
+  24:   e12fff1e        bx      lr
+
+00000028 <main>:
+  28:   e92d4810        push    {r4, fp, lr}     ;
+  2c:   e28db008        add     fp, sp, #8       ;
+  30:   e24dd044        sub     sp, sp, #68      ;
+  34:   e3a03000        mov     r3, #0           ;
+  38:   e54b3030        strb    r3, [fp, #-48]   ;
+  3c:   e3a03001        mov     r3, #1           ;
+  40:   e54b302f        strb    r3, [fp, #-47]   ;
+  44:   e3a03002        mov     r3, #2           ;
+  48:   e54b302c        strb    r3, [fp, #-44]   ;
+  4c:   e3a03003        mov     r3, #3           ;
+  50:   e14b32b8        strh    r3, [fp, #-40]   ;
+  54:   e3a03004        mov     r3, #4           ;
+  58:   e50b3024        str     r3, [fp, #-36]   ;
+  5c:   e59f308c        ldr     r3, [pc, #140]   ;
+  60:   e50b3020        str     r3, [fp, #-32]   ;
+  64:   e3a03000        mov     r3, #0           ;
+  68:   e3a04101        mov     r4, #1073741824  ;
+  6c:   e2844706        add     r4, r4, #1572864 ;
+  70:   e50b301c        str     r3, [fp, #-28]   ;
+  74:   e50b4018        str     r4, [fp, #-24]   ;
+  78:   e59f2074        ldr     r2, [pc, #116]   ;
+  7c:   e24b3010        sub     r3, fp, #16      ;
+  80:   e5922000        ldr     r2, [r2]         ;
+  84:   e1c320b0        strh    r2, [r3]         ;
+  88:   e2833002        add     r3, r3, #2       ;
+  8c:   e1a02822        lsr     r2, r2, #16      ;
+  90:   e5c32000        strb    r2, [r3]         ;
+  94:   e51b3020        ldr     r3, [fp, #-32]   ; |
+  98:   e58d3000        str     r3, [sp]         ; | arg 4 (struct F), via stack as word
+  9c:   e24b401c        sub     r4, fp, #28      ;
+  a0:   e8940018        ldm     r4, {r3, r4}     ;
+  a4:   e58d3008        str     r3, [sp, #8]     ; |
+  a8:   e58d400c        str     r4, [sp, #12]    ; | arg 5 (struct D), via stack as 2 words (aligned to 8, b/c double?)
+  ac:   e28d3010        add     r3, sp, #16      ;
+  b0:   e24b2010        sub     r2, fp, #16      ;
+  b4:   e5922000        ldr     r2, [r2]         ; |
+  b8:   e1c320b0        strh    r2, [r3]         ; |
+  bc:   e2833002        add     r3, r3, #2       ; | arg 6 (struct C3), via stack as 3 bytes in word slot
+  c0:   e1a02822        lsr     r2, r2, #16      ; |
+  c4:   e5c32000        strb    r2, [r3]         ; |
+  c8:   e51b0030        ldr     r0, [fp, #-48]   ; arg 0 (struct C2), via reg as word
+  cc:   e55b102c        ldrb    r1, [fp, #-44]   ; arg 1 (struct C), via reg as word
+  d0:   e15b22b8        ldrh    r2, [fp, #-40]   ; arg 2 (struct S), via reg as word
+  d4:   e51b3024        ldr     r3, [fp, #-36]   ; arg 3 (struct I), via reg as word
+  d8:   ebfffffe        bl      0 <leaf_call>    ;
+  dc:   e3a03000        mov     r3, #0           ;
+  e0:   e1a00003        mov     r0, r3           ;
+  e4:   e24bd008        sub     sp, fp, #8       ;
+  e8:   e8bd4810        pop     {r4, fp, lr}     ;
+  ec:   e12fff1e        bx      lr               ;
+  f0:   40a00000        .word   0x40a00000
+  f4:   00000000        .word   0x00000000
+
+
+
 ; vim: ft=asm68k
 
--- a/doc/disas_examples/arm64.aapcs.disas	Tue Mar 01 21:02:10 2022 +0100
+++ b/doc/disas_examples/arm64.aapcs.disas	Wed Mar 02 17:30:51 2022 +0100
@@ -1976,6 +1976,7 @@
         ENDP  ; |main|
 
 
+
 ; ---------- single-field structs by values (and small array fields) ---------->
 ;
 ; struct C { char c; };
--- a/doc/manual/callconvs/callconv_arm32.tex	Tue Mar 01 21:02:10 2022 +0100
+++ b/doc/manual/callconvs/callconv_arm32.tex	Wed Mar 02 17:30:51 2022 +0100
@@ -1,6 +1,6 @@
 %//////////////////////////////////////////////////////////////////////////////
 %
-% Copyright (c) 2007-2019 Daniel Adler <dadler@uni-goettingen.de>,
+% Copyright (c) 2007-2022 Daniel Adler <dadler@uni-goettingen.de>,
 %                         Tassilo Philipp <tphilipp@potion-studios.com>
 %
 % Permission to use, copy, modify, and distribute this software for any
@@ -17,9 +17,6 @@
 %
 %//////////////////////////////////////////////////////////////////////////////
 
-% ==================================================
-% ARM32
-% ==================================================
 \subsection{ARM32 Calling Conventions}
 
 \paragraph{Overview}
@@ -91,16 +88,18 @@
 \item if the callee takes the address of one of the parameters and uses it to address other parameters (e.g. varargs) it has to copy - in its prolog - the first four words to a reserved stack area adjacent to the other parameters on the stack
 \item parameters \textless=\ 32 bits are passed as 32 bit words
 \item 64 bit parameters are passed as two 32 bit parts (even partly via the register and partly via the stack, although this doesn't seem to be specified in the ATPCS)
-\item structures and unions are passed by value (after rounding up the size to the nearest multiple of 4), as a sequence of words
-\item if return value is a structure, a pointer pointing to the return value's space is passed in r0, the first parameter in r1, etc... (see {\bf return values})
+\item aggregates (struct, union) are passed by value (after rounding up the size to the nearest multiple of 4), as a sequence of words (splitting across registers and stack is allowed)
 \item keeping the stack eight-byte aligned can improve memory access performance and is required by LDRD and STRD on ARMv5TE processors which are part of the ARM32 family, so, in order to avoid problems one should always align the stack (tests have shown, that GCC does care about the alignment when using the ellipsis)
 \end{itemize}
 
 \paragraph{Return values}
+
 \begin{itemize}
 \item return values \textless=\ 32 bits use r0
 \item 64 bit return values use r0 and r1
-\item if return value is a structure, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
+\item aggregates (struct, union) \textless=\ 32 bits are returned like an integer (in r0)
+\item for aggregates (struct, union) \textgreater\ 32 bits, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
+\item for all other aggregates, the caller allocates space, passes pointer to it to the callee as a hidden first param (meaning in r0), and callee writes return value to this space; the ptr to the aggregate is returned in r0
 \end{itemize}
 
 \paragraph{Stack layout}
@@ -180,17 +179,19 @@
 \item subsequent parameters are pushed onto the stack (in right to left order, such that the stack pointer points to the first of the remaining parameters)
 \item if the callee takes the address of one of the parameters and uses it to address other parameters (e.g. varargs) it has to copy - in its prolog - the first four words to a reserved stack area adjacent to the other parameters on the stack
 \item parameters \textless=\ 32 bits are passed as 32 bit words
-\item 64 bit parameters are passed as two 32 bit parts (even partly via the register and partly via the stack), although this doesn't seem to be specified in the ATPCS)
-\item structures and unions are passed by value (after rounding up the size to the nearest multiple of 4), as a sequence of words
-\item if return value is a structure, a pointer pointing to the return value's space is passed in r0, the first parameter in r1, etc. (see {\bf return values})
+\item 64 bit parameters are passed as two 32 bit parts (even partly via the register and partly via the stack, although this doesn't seem to be specified in the ATPCS)
+\item aggregates (struct, union) are passed by value (after rounding up the size to the nearest multiple of 4), as a sequence of words (splitting across registers and stack is allowed)
 \item keeping the stack eight-byte aligned can improve memory access performance and is required by LDRD and STRD on ARMv5TE processors which are part of the ARM32 family, so, in order to avoid problems one should always align the stack (tests have shown, that GCC does care about the alignment when using the ellipsis)
 \end{itemize}
 
 \paragraph{Return values}
+
 \begin{itemize}
 \item return values \textless=\ 32 bits use r0
 \item 64 bit return values use r0 and r1
-\item if return value is a structure, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
+\item aggregates (struct, union) \textless=\ 32 bits are returned like an integer (in r0)
+\item for aggregates (struct, union) \textgreater\ 32 bits, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
+\item for all other aggregates, the caller allocates space, passes pointer to it to the callee as a hidden first param (meaning in r0), and callee writes return value to this space; the ptr to the aggregate is returned in r0
 \end{itemize}
 
 \paragraph{Stack layout}
@@ -379,18 +380,23 @@
 \item float and double vararg function parameters (no matter if in ellipsis part of function, or not) are passed like int or long long parameters, vfp registers aren't used
 \item if the callee takes the address of one of the parameters and uses it to address other parameters (e.g. varargs) it has to copy - in its prolog - the first four words (for first 4 integer arguments) to a reserved stack area adjacent to the other parameters on the stack
 \item parameters \textless=\ 32 bits are passed as 32 bit words
-\item structures and unions are passed by value (after rounding up the size to the nearest multiple of 4), as a sequence of words
-\item if return value is a structure, a pointer pointing to the return value's space is passed in r0, the first parameter in r1, etc. (see {\bf return values})
+\item aggregates (struct, union) with 1 to 4 identical floating-point members (either float or double) are passed field-by-field, except if passed as a vararg
+\item aggregates that could be passed via floating point register are never split across those and the stack, so if not enough registers are available an aggregate is
+passed entirely via the stack (implying above rule that any still unused float registers will be skipped for any subsequent arg)
+\item all other aggregates (struct, union), after rounding up the size to the nearest multiple of 4, are passed as a sequence of words, like integers (splitting across registers and stack is allowed)
 \item callee spills, caller reserves spill area space, though
 \end{itemize}
 
 \paragraph{Return values}
+
 \begin{itemize}
 \item non floating point return values \textless=\ 32 bits use r0
 \item non floating point 64-bit return values use r0 and r1
-\item single precision floating point return value uses s0
-\item double precision floating point return value uses d0
-\item if return value is a structure, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
+\item floating point return value uses s0 (for float) or d0 (for double), respectively
+\item aggregates (struct, union) with 1 to 4 identical floating-point members are returned in s0-s3 (for float) or d0-d3 (for double), respectively
+\item all other aggregates \textless=\ 32 bits are returned via r0
+\item for all other aggregates, the caller allocates space, passes pointer to it to the callee as a hidden first param
+(meaning in r0), and callee writes return value to this space; the ptr to the aggregate is returned in r0
 \end{itemize}
 
 \paragraph{Stack layout}
--- a/doc/manual/callconvs/callconv_arm64.tex	Tue Mar 01 21:02:10 2022 +0100
+++ b/doc/manual/callconvs/callconv_arm64.tex	Wed Mar 02 17:30:51 2022 +0100
@@ -81,7 +81,7 @@
 \item other aggregates (struct, union) \textgreater\ 16 bytes in size are passed indirectly, as a pointer to a copy (if needed)
 \item all other aggregates (struct, union), after rounding up the size to the nearest multiple of 8, are passed as a sequence of dwords, like integers
 \item aggregates are never split across registers and stack, so if not enough registers are available an aggregated is passed via the stack (for aggregates that
-would've been passed as floating point values, and any still unused float registers will be skipped for any subsequent arg)
+would've been passed as floating point values, any still unused float registers will be skipped for any subsequent arg)
 \item stack is required throughout to be eight-byte aligned
 \end{itemize}
 
@@ -91,7 +91,7 @@
 \item integer return values use x0
 \item floating-point return values use d0
 \item aggregates (struct, union) that would be passed via registers if passed as a first param, are returned via those registers
-\item otherwise (e.g. if regs exhausted, or \textgreater\ 16b, ...), the caller allocates space, passes pointer to it to the callee through
+\item for aggregates not returnable via registers (e.g. if regs exhausted, or \textgreater\ 16b, ...), the caller allocates space, passes pointer to it to the callee through
 x8, and callee writes return value to this space (note that this is not a hidden first param, as x8 is not used for passing params); the ptr to the aggregate is returned in x0
 \end{itemize}