# HG changeset patch
# User cslag
# Date 1465478033 -7200
# Node ID dbca6763f2be97855e3b1154438eb7f611af0f09
# Parent  b15d814ba27430de786d750494529d0a9ebeb3c7
- complete, working mips o32 callback (using hardware fp); fixes error from last commit, which ignored first 2 float args
  * currently tested on little endian, only
  * todo update, manual update, cleanups

diff -r b15d814ba274 -r dbca6763f2be ToDo
--- a/ToDo	Wed Jun 08 02:27:12 2016 +0200
+++ b/ToDo	Thu Jun 09 15:13:53 2016 +0200
@@ -19,10 +19,11 @@
 - mailing list announcements (html email?)
 - fix for NDS:
   * ARM:
-    - callback_plain retval wrong (not only platform)
+    - callback_plain retval wrong (not only platform), area on stack where
+      it's stored is probably not zeroed, so half-words, etc., come with garbage
   * THUMB:
     - ellipsis (might be test itself, not respecting fixed part of args)
-    - callback_plain retval
+    - callback_plain retval, see above under ARM
     - callf
 - fix Minix/x86 callbacks (see callback_suite)
 - armhf ellipsis: fix weirdness with long long as first ... arg
@@ -34,6 +35,7 @@
   be used to test dycnall bindings
 - pkg-config support?
 - quadmath support (long double)
+- MIPS softfloat support? (-msoft-float)
 
 portasm:
 --------
@@ -66,8 +68,10 @@
 dyncallback:
 ------------
 - add SPARC and SPARC64 callback support
-- callback_plain's return value not correct anymore on NDS (maybe just broken testcode?)
-- add MIPS callbacks for eabi, n32, o32 (thunks are working)
+- callback_plain's return value not correct anymore on NDS (maybe just broken testcode?),
+  see above under 1.0 items
+  * also check on other platforms whether the asm stub initializes the retval space correctly
+- add MIPS callbacks for eabi, n32, n64
 - finish PPC32 callbacks (see bugs section, below, BSD not working)
 
 bindings:
diff -r b15d814ba274 -r dbca6763f2be doc/manual/callconvs/callconv_mips.tex
--- a/doc/manual/callconvs/callconv_mips.tex	Wed Jun 08 02:27:12 2016 +0200
+++ b/doc/manual/callconvs/callconv_mips.tex	Thu Jun 09 15:13:53 2016 +0200
@@ -120,21 +120,24 @@
 
 \begin{table}[h]
 \begin{tabular*}{0.95\textwidth}{lll}
-Name                                   & Alias                     & Brief description\\
+Name                         & Alias                     & Brief description\\
 \hline                                                             
-{\bf \$0}                              & {\bf \$zero}              & Hardware zero \\
-{\bf \$1}                              & {\bf \$at}                & Assembler temporary \\
-{\bf \$2-\$3}                          & {\bf \$v0-\$v1}           & return value, scratch \\
-{\bf \$4-\$7}                          & {\bf \$a0-\$a3}           & first four word-arguments, scratch\\
-{\bf \$8-\$15,\$24,\$25}               & {\bf \$t0-\$t7,\$t8,\$t9} & temporaries, scratch \\
-{\bf \$16-\$23}                        & {\bf \$s0-\$s7}           & Preserved \\
-{\bf \$26,\$27}                        & {\bf \$k0,\$k1}           & Reserved for kernel \\
-{\bf \$28}                             & {\bf \$gp}                & Global pointer, preserve \\
-{\bf \$29}                             & {\bf \$sp}                & Stack pointer, preserve \\
-{\bf \$30}                             & {\bf \$fp}                & Frame pointer, preserve \\
-{\bf \$31}                             & {\bf \$ra}                & Return address \\
-
-
+{\bf \$0}                    & {\bf \$zero}              & hardware zero \\
+{\bf \$1}                    & {\bf \$at}                & assembler temporary \\
+{\bf \$2-\$3}                & {\bf \$v0-\$v1}           & return value, scratch \\
+{\bf \$4-\$7}                & {\bf \$a0-\$a3}           & first integer arguments, scratch\\
+{\bf \$8-\$15,\$24,\$25}     & {\bf \$t0-\$t7,\$t8,\$t9} & temporaries, scratch \\
+{\bf \$16-\$23}              & {\bf \$s0-\$s7}           & preserved \\
+{\bf \$26,\$27}              & {\bf \$k0,\$k1}           & reserved for kernel \\
+{\bf \$28}                   & {\bf \$gp}                & global pointer, preserve \\
+{\bf \$29}                   & {\bf \$sp}                & stack pointer, preserve \\
+{\bf \$30}                   & {\bf \$fp}                & frame pointer, preserve \\
+{\bf \$31}                   & {\bf \$ra}                & return address \\
+{\bf hi, lo}                 &                           & multiply/divide special registers \\
+{\bf \$f0-\$f3}              &                           & float return value, scratch \\
+{\bf \$f4-\$f11,\$f16-\$f19} &                           & float temporaries, scratch \\
+{\bf \$f12-\$f15}            &                           & first floating point arguments, scratch \\
+{\bf \$f20-\$f31}            &                           & preserved \\
 \end{tabular*}
 \caption{Register usage on MIPS O32 calling convention}
 \end{table}
@@ -145,13 +148,14 @@
 \item Stack grows down
 \item Stack parameter order: right-to-left
 \item Caller cleans up the stack
-\item The different stack areas (e.g. parameter area, register save area, ...) are always aligned to 8 bytes.
-\item first four 32bit arguments are passed in registers \$a0-\$a3, respectively
+\item The different stack areas (local data, register save area, parameter area) are each aligned to 8 bytes.
+\item generally, first four 32bit arguments are passed in registers \$a0-\$a3, respectively (see below for exceptions if first arg is a float)
 \item subsequent parameters are passed vie the stack
-\item 64-bit params passed via registers are passed using two registers, starting at an even register number (skipping one odd register if necessary)
-\item 64-bit params passed via the stack are always 8-byte aligned
-\item results are returned in \$v0 (32-bit return values), \$v0 and \$v1 (64-bit)
-\item note that only s? registers that are modified by the function are required to be preserved on save area
+\item 64-bit params are passed either via two registers (starting at an even register number, skipping an odd one if necessary), or via the stack using 8-byte alignment
+\item if the very first call argument is a float, up to 2 floats or doubles can be passed via \$f12 and \$f14, respectively, for first and second argument
+\item if any arguments are passed via float registers, skip \$a0-\$a3 for subsequent arguments as if the values were passed via them
+\item note that if the first argument is not a float but the second one is, the second argument still gets passed via the \$a? registers
+\item results are returned in \$v0 (32-bit int return values), \$f0 (32-bit float), \$v0 and \$v1 (64-bit int), \$f0 and \$f1 (64-bit float)
 \end{itemize}
 
 \paragraph{Stack layout}
diff -r b15d814ba274 -r dbca6763f2be dyncallback/dyncall_args_mips.h
--- a/dyncallback/dyncall_args_mips.h	Wed Jun 08 02:27:12 2016 +0200
+++ b/dyncallback/dyncall_args_mips.h	Thu Jun 09 15:13:53 2016 +0200
@@ -29,16 +29,13 @@
 
 #include "dyncall_args.h"
 
-#if !defined(DC__ABI_MIPS_O32)
-#  define DCARGS_MIPS_PARAM_REGS 4
-#else
-#  define DCARGS_MIPS_PARAM_REGS 8
-#endif
-
 struct DCArgs
 {
 	/* Don't change order! */
-#if !defined(DC__ABI_MIPS_O32)
+#if defined(DC__ABI_MIPS_O32)
+	int freg_count;
+#else
+#  define DCARGS_MIPS_PARAM_REGS 8
 	struct { int i; float f; } reg_data[DCARGS_MIPS_PARAM_REGS];
 	struct { int i; int   f; } reg_count;
 #endif
diff -r b15d814ba274 -r dbca6763f2be dyncallback/dyncall_args_mips_o32.c
--- a/dyncallback/dyncall_args_mips_o32.c	Wed Jun 08 02:27:12 2016 +0200
+++ b/dyncallback/dyncall_args_mips_o32.c	Thu Jun 09 15:13:53 2016 +0200
@@ -29,6 +29,7 @@
 DCint dcbArgInt(DCArgs* p)
 {
   DCint value;
+  p->freg_count = 2; // first int will disable float reg use.
   value = *((int*)p->stackptr);
   p->stackptr += sizeof(int);
   return value;
@@ -62,8 +63,22 @@
 DCfloat dcbArgFloat(DCArgs* p)
 {
   DCfloat result;
-  result = *((float*)p->stackptr);
-  p->stackptr += sizeof(float);
+  if(p->freg_count < 2) {
+	// Stored float regs (max 2) are always 8b aligned. The way we look them up,
+	// relative to a diverging p->stackptr, we need to consider this. Only works
+	// with up to two float args, which is all we need. Hacky, but saves us
+	// from one more variable and more bookkeeping in DCArgs.
+    result = ((DCfloat*)(p->stackptr + ((int)p->stackptr & 4)) - 4) // '-4' b/c those regs are stored right before the args
+#if defined(DC__Endian_LITTLE)
+      [0];
+#else
+      [1];
+#endif
+	++p->freg_count;
+  } else {
+    result = *((DCfloat*)p->stackptr);
+  }
+  p->stackptr += sizeof(DCfloat);
   return result;
 }
 DCdouble dcbArgDouble(DCArgs* p)
@@ -73,13 +88,17 @@
     DCfloat f[2];
   } d;
   p->stackptr += ((int)p->stackptr & 4); // Skip one slot if not aligned.
-#if defined(DC__Endian_LITTLE)
-  d.f[0] = dcbArgFloat(p);
-  d.f[1] = dcbArgFloat(p);
-#else
-  d.f[1] = dcbArgFloat(p);
-  d.f[0] = dcbArgFloat(p);
-#endif
+  if(p->freg_count < 2) {
+    //result = *((DCdouble*)p->stackptr-2); this changes the value, slightly
+    d.f[0] = ((DCfloat*)p->stackptr-4)[0]; // '-4' b/c those regs are stored right before the args
+    d.f[1] = ((DCfloat*)p->stackptr-4)[1];
+    ++p->freg_count;
+  } else {
+    //result = *((DCdouble*)p->stackptr); this changes the value, slightly
+    d.f[0] = ((DCfloat*)p->stackptr)[0];
+    d.f[1] = ((DCfloat*)p->stackptr)[1];
+  }
+  p->stackptr += sizeof(DCdouble);
   return d.result;
 }
 
diff -r b15d814ba274 -r dbca6763f2be dyncallback/dyncall_callback_mips_o32_gas.s
--- a/dyncallback/dyncall_callback_mips_o32_gas.s	Wed Jun 08 02:27:12 2016 +0200
+++ b/dyncallback/dyncall_callback_mips_o32_gas.s	Thu Jun 09 15:13:53 2016 +0200
@@ -37,52 +37,62 @@
 	.ent   dcCallbackThunkEntry
 	.type  dcCallbackThunkEntry, @function
 
-/* Called by thunk - thunk stores pointer to DCCallback in $12 ($t4), and pointer to called function in $25 ($t9, required for PIC) */
+/* Called by thunk - thunk stores pointer to DCCallback in $12 ($t4), and */
+/* pointer to called function in $25 ($t9, required for PIC)              */
 dcCallbackThunkEntry:
 	.set    noreorder
 	.set    nomacro
 
-	/* Prolog. Just store the minimum, return address, frame pointer, spill area. */
-	subu  $sp, 32       /* open frame: 32b for 8b aligned frame (retval+ra+fp+spill) */
+	/* Prolog. Just store the minimum, return address, spill area.     */
+	/* Frame size of 56b comes from following areas (each 8b aligned); */
+	/*   local: fpregs:16 + retval:8 + DCArgs:8 */
+	/*   save:  ra:4 (+ pad:4)                  */
+	/*   param: spill:16                        */
+	subu  $sp, 56       /* open frame */
 	sw    $ra, 20($sp)  /* save link register */
 
-	.frame  $fp,32,$31  /* specify our frame: fp,size,lr; creates virt $fp */
+	.frame $fp,56,$31   /* specify our frame: fp,size,lr; creates virt $fp */
 
 	/* Init return value */
-	sw $zero, 24($sp)
-	sw $zero, 28($sp)
+	sw $zero, 32($sp)
+	sw $zero, 36($sp)
 
-	/* If we spill the first four, all arguments will be in one out in consecutive block */
-	/* Caller doesn't and it's up to us to spill, so let's write $4-$7 ($a0-$a3) to the */
-	/* dedicated spill area, first (at end of _caller's_ frame, so $fp points right to it). */
+	/* Store the arguments passed via registers somewhere for dcArg* to access. */
+	/* For $4-$7 ($a0-$a3), use dedicated spill area (caller doesn't spill, but */
+	/* provides it at end of _caller's_ frame, so $fp points right to it).      */
+	/* For $f12 and $f14 use our space (in local data), which is adjacent.      */
+	s.d $f12, 40($sp) /* -16($fp) */
+	s.d $f14, 48($sp) /*  -8($fp) */
+	sw $4,  0($fp)
+	sw $5,  4($fp)
+	sw $6,  8($fp)
 	sw $7, 12($fp)
-	sw $6,  8($fp)
-	sw $5,  4($fp)
-	sw $4,  0($fp)
 
-	/* Init DCArg, which contains stackptr* to the args, which is $fp. Use padding between */
-	/* stored return address and parameter area as place to store it (hacky, but saves 8b) */
-	sw $fp, 16($sp)
+	/* Init DCArgs, which contains freg_count and stackptr* to the args. Point */
+	/* stackptr to the area where the non-float args start (which is at $fp). */
+	sw $zero, 24($sp)
+	sw $fp,   28($sp)
 
 	/* Prepare callback handler call. */
 	move  $4, $12       /* Param 0 = DCCallback*, $12 ($t4) holds pointer to thunk */
-	addiu $5, $sp, 16   /* Param 1 = DCArgs*, pointer to where pointer to args is stored */
-	addiu $6, $sp, 24   /* Param 2 = results pointer to 8b of local data on stack */
+	addiu $5, $sp, 24   /* Param 1 = DCArgs*, pointer to where pointer to args is stored */
+	addiu $6, $sp, 32   /* Param 2 = results pointer to 8b of local data on stack */
 	lw    $7, 24($12)   /* Param 3 = userdata pointer */
 
 	lw    $25, 20($12)  /* store handler entry in $25 ($t9), required for PIC */
 	jalr  $25           /* jump */
-	nop
+	nop                 /* branch delay nop */
 
-	/* Copy result in corresponding registers $2-$3 ($v0-$v1) */
-	lw    $2, 24($sp)
-	lw    $3, 28($sp)
+	/* Copy result in corresponding registers $2-$3 ($v0-$v1) and $f0 */
+	lw     $2, 32($sp)
+	lw     $3, 36($sp)
+	l.d   $f0, 32($sp)
 
 	/* Epilog. Tear down frame and return. */
 	lw    $ra, 20($sp)  /* restore return address */
-	addiu $sp, $sp, 32  /* close frame */
+	addiu $sp, $sp, 56  /* close frame */
 	j     $ra           /* return */
-	nop
+	nop                 /* branch delay nop */
 
 	.set    macro
 	.set    reorder