changeset 148:12729fd52ab7

- performance improvement and more correct handling of arm32 armhf calls, not copying bogus data just to reserve spill area space - added armhf callconv stack layout diagram to doc
author cslag
date Sun, 11 Sep 2016 01:19:27 +0200
parents 8ce75240a0f1
children c12120a1fbc0
files doc/manual/callconvs/callconv_arm32.tex dyncall/dyncall_call_arm32_arm_armhf.S
diffstat 2 files changed, 54 insertions(+), 25 deletions(-) [+]
line wrap: on
line diff
--- a/doc/manual/callconvs/callconv_arm32.tex	Fri Sep 09 15:23:16 2016 +0200
+++ b/doc/manual/callconvs/callconv_arm32.tex	Sun Sep 11 01:19:27 2016 +0200
@@ -333,6 +333,7 @@
 \item parameters \textless=\ 32 bits are passed as 32 bit words
 \item structures and unions are passed by value, with the first four words of the parameters in r0-r3 @@@?check doc
 \item if return value is a structure, a pointer pointing to the return value's space is passed in r0, the first parameter in r1, etc. (see {\bf return values})
+\item the callee spills r0-r3 if needed, but the caller reserves the space for the spill area
 \end{itemize}
 
 \paragraph{Return values}
@@ -344,7 +345,37 @@
 \item if return value is a structure, the caller allocates space for the return value on the stack in its frame and passes a pointer to it in r0
 \end{itemize}
 
+\paragraph{Stack layout}
 
+Stack directly after function prolog:\\
+
+\begin{figure}[h]
+\begin{tabular}{5|3|1 1}
+\hhline{~-~~}
+                                         & \vdots       &                                      &                              \\
+\hhline{~=~~}
+register save area                       & \hspace{4cm} &                                      & \mrrbrace{6}{caller's frame} \\
+\hhline{~-~~}
+local data                               &              &                                      &                              \\
+\hhline{~-~~}
+\mrlbrace{7}{parameter area}             & r0-r3        & \mrrbrace{1}{spill area (if needed)} &                              \\
+\hhline{~-~~}
+                                         & \ldots       & \mrrbrace{3}{stack parameters}       &                              \\
+                                         & \ldots       &                                      &                              \\
+                                         & \ldots       &                                      &                              \\
+\hhline{~=~~}
+register save area (with return address) &              &                                      & \mrrbrace{3}{current frame}  \\
+\hhline{~-~~}
+local data                               &              &                                      &                              \\
+\hhline{~-~~}
+parameter area                           & \vdots       &                                      &                              \\
+\hhline{~-~~}
+\end{tabular}
+\caption{Stack layout on arm32 armhf}
+\end{figure}
+
+
+\newpage
 
 
 \subsubsection{Architectures}
--- a/dyncall/dyncall_call_arm32_arm_armhf.S	Fri Sep 09 15:23:16 2016 +0200
+++ b/dyncall/dyncall_call_arm32_arm_armhf.S	Sun Sep 11 01:19:27 2016 +0200
@@ -51,38 +51,36 @@
 ENTRY_C(dcCall_arm32_armhf)
 
 	/* Prolog. This function never needs to spill inside its prolog, so just store the permanent registers. */
-	mov	r12 , r13	 /* Stack ptr (r13) -> temporary (r12). */
-	stmdb	r13!, {r4-r5, r11, r12, r14} /* Permanent registers and stack pointer (now in r12), etc... -> save area on stack (except counter). */
+	mov r12 , r13  /* Stack ptr (r13) -> temporary (r12). */
+	stmdb r13!, {r4-r5, r11, r12, r14} /* Permanent registers and stack pointer (now in r12), etc... -> save area on stack (except counter). */
 
-	mov	r11 , r12	 /* Set frame ptr. */
-	mov	r4  , r0 	 /* r4 = 'fptr' (1st argument is passed in r0). */
-	mov	r5  , r1	 /* r5 = 'args' (2nd argument is passed in r1). */
-	
+	mov r11, r12     /* Set frame ptr. */
+	mov r4,  r0      /* r4 = fptr */
+	add r5,  r1, #16 /* r5 = stack args (after intreg ones) */
+
 	/* Load 16 single-precision registers (= 8 double-precision registers). */
-	fldmiad	r3, {d0-d7}
+	fldmiad r3, {d0-d7}
 
-	/* nope, or varargs will not work: sub	r2 , r2 , #16	/* skip spill area */
-	cmp     r2, #0
-	ble	armhf_call
-
-	sub	r13, r13, r2
-	and	r13, r13, #-8	/* align 8-byte. */
+	/* Prepare the stack parameter area: reserve r2 bytes, which includes room for the spill area (the callee does the spilling, if needed). */
+	sub r13, r13, r2
+	and r13, r13, #-8  /* align 8-byte. */
 
-	mov	r3, #0		/* Init byte counter. */
-	add	r1 , r1 , #16
+	sub r2, r2, #16    /* r2 = number of bytes of stack params to copy (size minus the 16 bytes loaded into r0-r3) */
+	mov r3, #0         /* r3 = byte offset of the next word to copy */
 armhf_pushArgs:
-	ldr	r0, [r1,  +r3]	/* Load word into r0. */
-	str	r0, [r13, +r3]	/* Push word onto stack. */
-	add	r3, r3, #4	/* Increment byte counter. */
-	cmp	r2, r3
-	bne	armhf_pushArgs
+	cmp r2, r3
+	ble armhf_call
+	ldr r0, [r5,  +r3] /* load word into r0 ... */
+	str r0, [r13, +r3] /* ... then push onto stack */
+	add r3, r3, #4
+	b   armhf_pushArgs
 
 armhf_call:
-	ldmia	r5, {r0-r3}	/* Load first 4 arguments for new call into r0-r3. */
-				/* 'blx %r4' workaround for ARMv4t: */
-	mov	r14, r15	/*   Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */ 
-	bx	r4		/*   Call (ARM/THUMB), available for ARMv4t. */
+	ldmia r1, {r0-r3}  /* Load first 4 arguments for new call into r0-r3. */
+	                   /* 'blx %r4' workaround for ARMv4t: */
+	mov r14, r15       /* Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */ 
+	bx  r4             /* Call (ARM/THUMB), available for ARMv4t. */
 
 	/* Epilog. */
-	ldmdb	r11, {r4-r5, r11, r13, r15}	/* Restore permanent registers (ignore temporary (r12), restore stack ptr and program counter).@@@db not needed since we rewrite r13? */
+	ldmdb r11, {r4-r5, r11, r13, r15} /* Restore permanent registers (ignore temporary (r12), restore stack ptr and program counter).@@@db not needed since we rewrite r13? */