diff --git a/.github/scripts/test.sh b/.github/scripts/test.sh
new file mode 100755
index 000000000..60eca5947
--- /dev/null
+++ b/.github/scripts/test.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+runmats() {
+  echo make allxhelp "$@"
+  make -C ${MACH}/mats allxhelp "$@" 2>&1 | tee -a Make.out | grep '^matting '
+}
+
+# Split these out so that we get output every 10 minutes on Windows builds.
+runmats o=0
+runmats o=3
+runmats o=3 cp0=t
+runmats o=3 cp0=t eval=interpret
+
+if [ -f ${MACH}/mats/summary ]; then
+  cat ${MACH}/mats/summary
+  diff -q .github/scripts/summary ${MACH}/mats/summary
+  exit $?
+else
+  exit 1
+fi
diff --git a/c/random.c b/c/random.c
index d496d3a36..c7d80a718 100644
--- a/c/random.c
+++ b/c/random.c
@@ -22,12 +22,14 @@
 
 /* Representation is arecord with 6 `double` fields: */
 
-#define RANDSTATEX10(x) (((double*)&RECORDINSTIT(x, 0))[0])
-#define RANDSTATEX11(x) (((double*)&RECORDINSTIT(x, 0))[1])
-#define RANDSTATEX12(x) (((double*)&RECORDINSTIT(x, 0))[2])
-#define RANDSTATEX20(x) (((double*)&RECORDINSTIT(x, 0))[3])
-#define RANDSTATEX21(x) (((double*)&RECORDINSTIT(x, 0))[4])
-#define RANDSTATEX22(x) (((double*)&RECORDINSTIT(x, 0))[5])
+#define RECORDINSTDBLA(x) ((double *)((uptr)&RECORDINSTIT(x, 0) + (max_float_alignment - ptr_bytes)))
+
+#define RANDSTATEX10(x) (RECORDINSTDBLA(x)[0])
+#define RANDSTATEX11(x) (RECORDINSTDBLA(x)[1])
+#define RANDSTATEX12(x) (RECORDINSTDBLA(x)[2])
+#define RANDSTATEX20(x) (RECORDINSTDBLA(x)[3])
+#define RANDSTATEX21(x) (RECORDINSTDBLA(x)[4])
+#define RANDSTATEX22(x) (RECORDINSTDBLA(x)[5])
 
 /* The Generator
    =============
diff --git a/c/schlib.c b/c/schlib.c
index 4a73c560b..772e95487 100644
--- a/c/schlib.c
+++ b/c/schlib.c
@@ -216,6 +216,8 @@ void S_call_help(tc_in, singlep, lock_ts) ptr tc_in; IBOOL singlep; IBOOL lock_t
      the C stack and we may end up in a garbage collection */
     code = CP(tc);
     if (Sprocedurep(code)) code = CLOSCODE(code);
+    if (!IMMEDIATE(code) && !Scodep(code))
+      S_error_abort("S_call_help: invalid code pointer");
     S_immobilize_object(code);
 
     CP(tc) = AC1(tc);
diff --git a/c/vfasl.c b/c/vfasl.c
index a873eff68..f2c6f27cd 100644
--- a/c/vfasl.c
+++ b/c/vfasl.c
@@ -167,6 +167,7 @@ static ptr copy(vfasl_info *vfi, ptr pp, seginfo *si);
 static uptr sweep(vfasl_info *vfi, ptr p);
 static int is_rtd(ptr tf, vfasl_info *vfi);
 
+static IFASLCODE abs_reloc_variant(IFASLCODE type);
 static ptr vfasl_encode_relocation(vfasl_info *vfi, ptr obj);
 static void relink_code(ptr co, ptr sym_base, ptr *vspaces, uptr *vspace_offsets, IBOOL to_static);
 static ptr find_pointer_from_offset(uptr p_off, ptr *vspaces, uptr *vspace_offsets);
@@ -1083,6 +1084,20 @@ static int is_rtd(ptr tf, vfasl_info *vfi)
 #define VFASL_RELOC_TAG(p) (UNFIX(p) & ((1 << VFASL_RELOC_TAG_BITS) - 1))
 #define VFASL_RELOC_POS(p) (UNFIX(p) >> VFASL_RELOC_TAG_BITS)
 
+/* Picks a relocation variant that fits into the actual relocation's
+   shape, but holds an absolue value */
+static IFASLCODE abs_reloc_variant(IFASLCODE type) {
+  if (type == reloc_abs)
+    return reloc_abs;
+#if defined(I386) || defined(X86_64)
+  return reloc_abs;
+#elif defined(ARMV6)
+  return reloc_arm32_abs;
+#else
+  >> need to fill in for this platform <<
+#endif
+}
+
 static ptr vfasl_encode_relocation(vfasl_info *vfi, ptr obj) {
   ptr pos;
   int which_singleton;
@@ -1142,7 +1157,7 @@ static void relink_code(ptr co, ptr sym_base, ptr *vspaces, uptr *vspace_offsets
             code_off = RELOC_CODE_OFFSET(entry);
         }
         a += code_off;
-        obj = S_get_code_obj(reloc_abs, co, a, item_off);
+        obj = S_get_code_obj(abs_reloc_variant(RELOC_TYPE(entry)), co, a, item_off);
 
         if (IMMEDIATE(obj)) {
           if (Sfixnump(obj)) {
diff --git a/csug/foreign.stex b/csug/foreign.stex
index 879bd12d0..606554ea9 100644
--- a/csug/foreign.stex
+++ b/csug/foreign.stex
@@ -220,7 +220,10 @@ Windows: \scheme{__stdcall}, \scheme{__cdecl}, and \scheme{__com} (32-bit only).
 Since \scheme{__cdecl} is the default, specifying \scheme{__cdecl} is
 equivalent to specifying \scheme{#f} or no convention.
 Finally, \var{conv} can be \scheme{__collect_safe} to indicate that garbage
-collection is allowed concurrent to a call of the foreign procedure.
+collection is allowed concurrent to a call of the foreign procedure, or it
+can be \var{conv} can be \scheme{__varargs} to indicate that the procedure
+uses a convention that works with a variable number of arguments (which
+differs from the fixed-argument variant for some platforms and conventions).
 
 Use \scheme{__stdcall} to access most Windows API procedures.
 Use \scheme{__cdecl} for Windows API varargs procedures,
diff --git a/mats/fl.ms b/mats/fl.ms
index 2dd2a3760..cd9a8a635 100644
--- a/mats/fl.ms
+++ b/mats/fl.ms
@@ -26,6 +26,13 @@
         (flonum->fixnum (* (+ (ash (most-positive-fixnum) -1) 1) 1.0)))
    (eq? (most-negative-fixnum)
         (flonum->fixnum (* (most-negative-fixnum) 1.0)))
+   (eq? (+ (ash (most-positive-fixnum) -1) 1)
+        (flonum->fixnum (fl+ (* (+ (ash (most-positive-fixnum) -1) 1) 1.0) 0.5)))
+   (or (not (fixnum? (inexact->exact (exact->inexact (most-positive-fixnum)))))
+       (eq? (most-positive-fixnum)
+            (flonum->fixnum (fl+ (* (most-positive-fixnum) 1.0) 0.5))))
+   (eq? (most-negative-fixnum)
+        (flonum->fixnum (fl- (* (most-negative-fixnum) 1.0) 0.5)))
    (eq? (flonum->fixnum 0.0) 0)
    (eq? (flonum->fixnum 1.0) 1)
    (eq? (flonum->fixnum +4.5) +4)
diff --git a/mats/foreign.ms b/mats/foreign.ms
index 92e911396..ad5fb5170 100644
--- a/mats/foreign.ms
+++ b/mats/foreign.ms
@@ -1038,6 +1038,12 @@
     (define call-u64 (foreign-procedure "call_u64" (ptr unsigned-64 int int) unsigned-64))
     (define call-sf (foreign-procedure "call_sf" (ptr single-float int int) single-float))
     (define call-df (foreign-procedure "call_df" (ptr double-float int int) double-float))
+    (define call-varargs-df (foreign-procedure "call_varargs_df" (ptr double-float int int) double-float))
+    (define call-varargs-i7df (foreign-procedure "call_varargs_i7df" (ptr int
+                                                                          double-float double-float double-float
+                                                                          double-float double-float double-float
+                                                                          double-float)
+                                                 double-float))
     (define ($test-call-int signed? size call-int make-fc)
       (define n10000 (expt 256 size))
       (define nffff (- n10000 1))
@@ -1151,6 +1157,22 @@
         (double-float) double-float)
       73.25 7 23)
     108.25)
+  (equal?
+    (call-varargs-df
+      (foreign-callable
+        __varargs
+        (lambda (x y) (+ x y 5))
+        (double-float double-float) double-float)
+      10.25 20 300)
+    325.5)
+  (equal?
+    (call-varargs-i7df
+      (foreign-callable
+        __varargs
+        (lambda (i a b c d e f g) (+ i a b c d e f g 7))
+        (int double-float double-float double-float double-float double-float double-float double-float) double-float)
+      1 2.2 3.2 4.5 6.7 8.9 10.1 11.5)
+    55.1)
 
   (error?
     (call-i8
@@ -1212,6 +1234,13 @@
         (lambda (x) '(- x 7))
         (double-float) double-float)
       73.25 0 0))
+  (error?
+    (call-varargs-df
+      (foreign-callable
+        __varargs
+        (lambda (x y) '(- x 7))
+        (double-float double-float) double-float)
+      73.25 0 0))
 
   (begin
     (define u32xu32->u64
@@ -2540,9 +2569,33 @@
      (with-object-kept-live
       handler
       (call_many_times (foreign-callable-entry-point handler)))
+     (unlock-object handler)
      v)
    14995143)
 
+  (equal?
+   (let ()
+     (define v 0)
+     (define call_many_times_bv (foreign-procedure "call_many_times_bv" (void*) void))
+     (define work
+       (lambda (bv)
+         (set! v (+ v (bytevector-u8-ref bv 0)))
+         ;; Varying work, as above:
+         (let loop ([n (bitwise-and (bytevector-u8-ref bv 1) #xFFFF)])
+           (unless (zero? n)
+             (set! v (add1 v))
+             (loop (bitwise-arithmetic-shift-right n 1))))))
+     (define handlers (list (foreign-callable work (u8*) void)
+                            (foreign-callable work (u16*) void)
+                            (foreign-callable work (u32*) void)))
+     (map lock-object handlers)
+     (for-each (lambda (handler)
+                 (call_many_times_bv (foreign-callable-entry-point handler)))
+               handlers)
+     (map unlock-object handlers)
+     v)
+   103500000)
+
   ;; regression test related to saving registers that hold allocated
   ;; callable argument
   (let* ([call-with-many-args (foreign-procedure "call_with_many_args" (void*) boolean)]
@@ -2687,7 +2740,7 @@
       (separate-eval
         `(begin
            ,load-libc
-           (define f (foreign-procedure "printf" (string double) int))
+           (define f (foreign-procedure __varargs "printf" (string double) int))
            (f "(%g)" 3.5)
            (void)))
       read)
@@ -2697,7 +2750,7 @@
       (separate-eval
         `(begin
            ,load-libc
-           (define f (foreign-procedure "printf" (string double double double double double double) int))
+           (define f (foreign-procedure __varargs "printf" (string double double double double double double) int))
            (f "(%g %g %g %g %g %g)" 3.5 2.5 -1.5 6.75 8.25 -9.5)
            (void)))
       read)
@@ -2707,7 +2760,7 @@
       (separate-eval
         `(begin
            ,load-libc
-           (define f (foreign-procedure "printf" (string double double double double double double double double) int))
+           (define f (foreign-procedure __varargs "printf" (string double double double double double double double double) int))
            (f "(%g %g %g %g %g %g %g %g)" 3.5 2.5 -1.5 6.75 8.25 -9.5 1e32 -4.5)
            (void)))
       read)
@@ -2717,11 +2770,30 @@
       (separate-eval
         `(begin
            ,load-libc
-           (define f (foreign-procedure "printf" (string double double double double double double double double double double) int))
+           (define f (foreign-procedure __varargs "printf" (string double double double double double double double double double double) int))
            (f "(%g %g %g %g %g %g %g %g %g %g)" 3.5 2.5 -1.5 6.75 8.25 -9.5 1e32 -4.5 7.25 -0.5)
            (void)))
       read)
     '(3.5 2.5 -1.5 6.75 8.25 -9.5 1e32 -4.5 7.25 -0.5))
+
+  (equal? (let ([cb (foreign-callable __varargs
+                                      (lambda (x y) (+ x y 5))
+                                      (double-float double-float) double-float)])
+            (with-object-kept-live
+             cb
+             ((foreign-procedure __varargs (foreign-callable-entry-point cb)
+                                 (double-float double-float) double-float)
+              3.4 5.5)))
+          13.9)
+  (equal? (let ([cb (foreign-callable __varargs
+                                      (lambda (x y) (+ x y 5))
+                                      (double-float double-float) single-float)])
+            (with-object-kept-live
+             cb
+             ((foreign-procedure __varargs (foreign-callable-entry-point cb)
+                                 (double-float double-float) single-float)
+              3.5 -5.25)))
+          3.25)
 )
 
 (mat structs
diff --git a/mats/foreign2.c b/mats/foreign2.c
index 3e12cf1ff..56b2f5b2d 100644
--- a/mats/foreign2.c
+++ b/mats/foreign2.c
@@ -244,6 +244,17 @@ EXPORT double_float call_df(ptr code, double_float x, int m, int k) {
   return (*((double_float (*) (double_float))Sforeign_callable_entry_point(code)))(x + m) + k;
 }
 
+EXPORT double_float call_varargs_df(ptr code, double_float x, int m, int k) {
+  return (*((double_float (*) (double, ...))Sforeign_callable_entry_point(code)))(x - m, x + m) + k;
+}
+
+EXPORT double_float call_varargs_i7df(ptr code, int i,
+                                      double_float a, double_float b, double_float c,
+                                      double_float d, double_float e, double_float f,
+                                      double_float g) {
+  return (*((double_float (*) (int, ...))Sforeign_callable_entry_point(code)))(i, a, b, c, d, e, f, g);
+}
+
 EXPORT u8 *u8_star_to_u8_star(u8 *s) {
   return s == (u8 *)0 ? (u8 *)0 : s + 1;
 }
@@ -444,6 +455,18 @@ EXPORT void call_many_times(void (*f)(iptr))
   }
 }
 
+EXPORT void call_many_times_bv(void (*f)(char *s))
+{
+  /* make this sensible as u8*, u16*, and u32* */
+  char buf[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
+  int x;
+
+  for (x = 0; x < 1000000; x++) {
+    buf[0] = (x & 63) + 1;
+    f(buf);
+  }
+}
+
 typedef void (*many_arg_callback_t)(int i, const char* s1, const char* s2, const char* s3,
                                     const char* s4, int i2, const char* s6, const char* s7, int i3);
 EXPORT void call_with_many_args(many_arg_callback_t callback)
diff --git a/mats/hash.ms b/mats/hash.ms
index 92b832494..a61f7c764 100644
--- a/mats/hash.ms
+++ b/mats/hash.ms
@@ -1674,7 +1674,10 @@
           [wht (make-weak-eq-hashtable)]
           [eht (make-ephemeron-eq-hashtable)])
       (let ([ls2 (map (lambda (a1) (eq-hashtable-cell ht (car a1) (cdr a1))) ls1)]
-            [ls2-2 (map (lambda (a1) (eq-hashtable-try-atomic-cell ht (car a1) (cdr a1))) ls1)]
+            [ls2-2 (map (lambda (a1) (let loop ()
+                                       (define c (eq-hashtable-try-atomic-cell ht (car a1) (cdr a1)))
+                                       (or c (loop))))
+                        ls1)]
             [ls3 (map (lambda (a1) (hashtable-cell wht (car a1) (cdr a1))) ls1)]
             [ls4 (map (lambda (a1) (hashtable-cell eht (car a1) (cdr a1))) ls1)])
         (let ([ls2* (map (lambda (a1) (eq-hashtable-ref-cell ht (car a1))) ls1)]
diff --git a/mats/misc.ms b/mats/misc.ms
index d74521334..d5d9826d0 100644
--- a/mats/misc.ms
+++ b/mats/misc.ms
@@ -4694,34 +4694,35 @@
              (#2%display 1))))
 )
 
-(mat $read-time-stamp-counter
+(unless (memq (machine-type) '(arm32le tarm32le)) ; timestamp counter tends to be priviledged on arm32le
+  (mat $read-time-stamp-counter
 
-  (let ([t (#%$read-time-stamp-counter)])
-    (and (integer? t) (exact? t)))
+    (let ([t (#%$read-time-stamp-counter)])
+      (and (integer? t) (exact? t)))
 
-  (let ()
-    ;; NB: pulled from thread.ms, to use as a delay
-    (define fat+
-      (lambda (x y)
-        (if (zero? y)
-            x
-            (fat+ (1+ x) (1- y)))))
-    (define fatfib
-      (lambda (x)
-        (if (< x 2)
-            1
-            (fat+ (fatfib (1- x)) (fatfib (1- (1- x)))))))
-    (let loop ([count 10] [success 0])
-      (if (fx= count 0)
-          (>= success 9)
-          (let ([t0 (#%$read-time-stamp-counter)])
-            (fatfib 26)
-            (let ([t1 (#%$read-time-stamp-counter)])
-              (loop (fx- count 1)
-                    (if (< t0 t1)
-                        (fx+ success 1)
-                        success)))))))
-)
+    (let ()
+      ;; NB: pulled from thread.ms, to use as a delay
+      (define fat+
+        (lambda (x y)
+          (if (zero? y)
+              x
+              (fat+ (1+ x) (1- y)))))
+      (define fatfib
+        (lambda (x)
+          (if (< x 2)
+              1
+              (fat+ (fatfib (1- x)) (fatfib (1- (1- x)))))))
+      (let loop ([count 10] [success 0])
+        (if (fx= count 0)
+            (>= success 9)
+            (let ([t0 (#%$read-time-stamp-counter)])
+              (fatfib 26)
+              (let ([t1 (#%$read-time-stamp-counter)])
+                (loop (fx- count 1)
+                      (if (< t0 t1)
+                          (fx+ success 1)
+                          success)))))))
+    ))
 
 (mat procedure-arity-mask
   (equal? (procedure-arity-mask (lambda () #f)) 1)
diff --git a/mats/root-experr-compile-0-f-f-f b/mats/root-experr-compile-0-f-f-f
index 79018982c..9b3d5453b 100644
--- a/mats/root-experr-compile-0-f-f-f
+++ b/mats/root-experr-compile-0-f-f-f
@@ -9950,6 +9950,7 @@ foreign.mo:Expected error in mat foreign-fixed-types: "foreign-callable: invalid
 foreign.mo:Expected error in mat foreign-fixed-types: "foreign-callable: invalid return value (- x 7) from #<procedure>".
 foreign.mo:Expected error in mat foreign-fixed-types: "foreign-callable: invalid return value (- x 7) from #<procedure>".
 foreign.mo:Expected error in mat foreign-fixed-types: "foreign-callable: invalid return value (- x 7) from #<procedure>".
+foreign.mo:Expected error in mat foreign-fixed-types: "foreign-callable: invalid return value (- x 7) from #<procedure>".
 foreign.mo:Expected error in mat foreign-C-types: "int-to-int: invalid foreign-procedure argument qqq".
 foreign.mo:Expected error in mat foreign-C-types: "unsigned-to-unsigned: invalid foreign-procedure argument qqq".
 foreign.mo:Expected error in mat foreign-C-types: "unsigned-int-to-unsigned-int: invalid foreign-procedure argument qqq".
diff --git a/mats/thread.ms b/mats/thread.ms
index 68c042682..9c361330d 100644
--- a/mats/thread.ms
+++ b/mats/thread.ms
@@ -1569,6 +1569,18 @@
         (condition-broadcast c))
       (equal? gc-ids (list (get-thread-id)))))
 )
-  
+
+(mat wait-for-threads
+  (begin
+    ;; To avoid breaking later tests that use `(collect)`,
+    ;; wait for any threads created here to exit
+    (let ()
+      (define $threads (foreign-procedure "(cs)threads" () scheme-object))
+      (let loop ()
+        (unless (= 1 (length ($threads)))
+          (sleep (make-time 'time-duration 10000 0))
+          (loop))))
+    #t)
+)  
 
 )
diff --git a/s/Mf-rv64le b/s/Mf-rv64le
index 87777e405..edb46aff4 100644
--- a/s/Mf-rv64le
+++ b/s/Mf-rv64le
@@ -14,6 +14,6 @@
 # limitations under the License.
 
 m = rv64le
-archincludes = rv64le.ss
+archincludes = rv64.ss
 
 include Mf-base
diff --git a/s/arm32.ss b/s/arm32.ss
index 3a2e371b8..ecc9439f3 100644
--- a/s/arm32.ss
+++ b/s/arm32.ss
@@ -77,11 +77,11 @@
     #;[%yp]
     [     %r0  %Carg1 %Cretval  #f  0 uptr]
     [     %r1  %Carg2           #f  1 uptr]
-    [     %r2  %Carg3           #f  2 uptr]
-    [     %r3  %Carg4           #f  3 uptr]
+    [     %r2  %Carg3 %reify1   #f  2 uptr]
+    [     %r3  %Carg4 %reify2   #f  3 uptr]
     [     %lr                   #f 14 uptr] ; %lr is trashed by 'c' calls including calls to hand-coded routines like get-room
-    [%fp1 %Cfparg5   %d4  %s8   #f  8 fp]
-    [%fp2 %Cfparg6   %d5  %s10  #f 10 fp]
+    [%fp1           %d8  %s16   #t 16 fp]   ; allocable fp regs must not overlap with any half registers
+    [%fp2           %d9  %s18   #t 18 fp]
   )
   (machine-dependent
     [%sp                        #t 13 uptr]
@@ -94,18 +94,19 @@
     [%Cfparg3b                %s5   #f  5 fp]
     [%Cfparg4            %d3  %s6   #f  6 fp]
     [%Cfparg4b                %s7   #f  7 fp]
+    [%Cfparg5            %d4  %s8   #f  8 fp]
     [%Cfparg5b                %s9   #f  9 fp]
+    [%Cfparg6            %d5  %s10  #f 10 fp]
     [%Cfparg6b                %s11  #f 11 fp]
-    [%Cfparg7   %fptmp1  %d6  %s12  #f 12 fp]
-    [%Cfparg7b  %fptmp2       %s13  #f 13 fp]
+    [%Cfparg7            %d6  %s12  #f 12 fp]
+    [%Cfparg7b                %s13  #f 13 fp]
     [%Cfparg8            %d7  %s14  #f 14 fp]
     [%Cfparg8b                %s15  #f 15 fp]
     ;; etc., but other FP registers are preserved
     #;[                  %d16       #t 32 fp] ; >= 32: high bit goes in D, N, or M bit, low bits go in Vd, Vn, Vm
     #;[                  %d17       #t 33 fp]
     ; etc.
-    )
-  (reify-support %ts %lr %r3 %r2))
+    ))
 
 ;;; SECTION 2: instructions
 (module (md-handle-jump) ; also sets primitive handlers
@@ -230,20 +231,18 @@
                  (cond
                    [(and (eq? x1 %zero) (or (unsigned12? imm) (unsigned12? (- imm))))
                     (return x0 %zero imm type)]
-                   [(funky12 imm) =>
+                   [(funky12 imm)
                     ; NB: dubious value?  check to see if it's exercised
-                    (lambda (imm)
-                      (let ([u (make-tmp 'u)])
-                        (seq
-                          (build-set! ,u (asm ,null-info ,(asm-add #f) ,x0 (immediate ,imm)))
-                          (return u x1 0 type))))]
-                   [(funky12 (- imm)) =>
+                    (let ([u (make-tmp 'u)])
+                      (seq
+                        (build-set! ,u (asm ,null-info ,(asm-add #f) ,x0 (immediate ,imm)))
+                        (return u x1 0 type)))]
+                   [(funky12 (- imm))
                     ; NB: dubious value?  check to see if it's exercised
-                    (lambda (imm)
-                      (let ([u (make-tmp 'u)])
-                        (seq
-                          (build-set! ,u (asm ,null-info ,(asm-sub #f) ,x0 (immediate ,imm)))
-                          (return u x1 0 type))))]
+                    (let ([u (make-tmp 'u)])
+                      (seq
+                        (build-set! ,u (asm ,null-info ,(asm-sub #f) ,x0 (immediate ,(- imm))))
+                        (return u x1 0 type)))]
                    [else
                     (let ([u (make-tmp 'u)])
                       (seq
@@ -264,7 +263,7 @@
              (k (with-output-language (L15d Lvalue) `(mref ,u ,%zero 0 uptr)))))]
         [else (mref->mref a k)])))
 
-  (define fpmem->fpmem
+  (define fpmem->fpmem ; allows mem argument, too
     (lambda (a k)
       (define return
         (lambda (x0 x1 imm)
@@ -295,6 +294,10 @@
                    [else
                     (return x0 %zero imm)])))))])))
 
+  (define mem->fpmem
+    (lambda (a k)
+      (fpmem->fpmem a k)))
+
   (define-syntax coercible?
     (syntax-rules ()
       [(_ ?a ?aty*)
@@ -309,7 +312,8 @@
              (and (memq 'unsigned12 aty*) (imm-unsigned12? a))
              (and (memq 'imm-constant aty*) (imm-constant? a))
              (and (memq 'uword8 aty*) (imm-uword8? a))
-             (and (memq 'mem aty*) (mem? a))))]))
+             (and (memq 'mem aty*) (mem? a))
+             (and (memq 'fpmem aty*) (fpmem? a))))]))
 
   (define-syntax coerce-opnd ; passes k something compatible with aty*
     (syntax-rules ()
@@ -317,6 +321,7 @@
        (let ([a ?a] [aty* ?aty*] [k ?k])
          (cond
            [(and (memq 'mem aty*) (mem? a)) (mem->mem a k)]
+           [(and (memq 'fpmem aty*) (fpmem? a)) (fpmem->fpmem a k)]
            [(and (memq 'funky12 aty*) (imm-funky12? a)) (k (imm->imm a))]
            [(and (memq 'negate-funky12 aty*) (imm-negate-funky12? a)) (k (imm->negate-imm a))]
            [(and (memq 'lognot-funky12 aty*) (imm-lognot-funky12? a)) (k (imm->lognot-imm a))]
@@ -387,12 +392,6 @@
 
   (define-syntax define-instruction
     (lambda (x)
-      (define mem-type?
-        (lambda (t)
-          (syntax-case t (mem fpmem)
-            [mem #t]
-            [fpmem #t]
-            [else #f])))
       (define make-value-clause
         (lambda (fmt)
           (syntax-case fmt (mem fpmem ur fpur)
@@ -724,52 +723,33 @@
                      `(asm ,null-info ,(asm-store type) ,x ,y ,w ,u)))
                  `(asm ,null-info ,(asm-store type) ,x ,y ,w ,z)))))]))
 
-  (let ()
-    (define pick-asm-op
-      (lambda (op info)
-        (let ([flreg (info-loadfl-flreg info)])
-          (case op
-            [(load-single->double load-double->single) (asm-fl-load/cvt op flreg)]
-            [(store-single->double) (asm-fl-store/cvt op flreg)]
-            [else (asm-fl-load/store op flreg)]))))
-    (define-instruction effect (load-single->double load-double->single store-single->double
-                                 store-single store-double
-                                 load-single load-double)
-      [(op (x ur) (y ur) (z uword8))
-       (if (eq? y %zero)
-           `(asm ,info ,(pick-asm-op op info) ,x ,z)
-           (let ([u (make-tmp 'u)])
-             (seq
-               `(set! ,(make-live-info) ,u (asm ,info ,(asm-add #f) ,x ,y))
-               `(asm ,info ,(pick-asm-op op info) ,u ,z))))]
-      [(op (x ur) (y ur) (z ur))
-       (let ([u (make-tmp 'u)])
-         (seq
-           `(set! ,(make-live-info) ,u (asm ,info ,(asm-add #f) ,x ,z))
-           (if (eq? y %zero)
-               `(asm ,info ,(pick-asm-op op info) ,u (immediate 0))
-               (seq
-                 `(set! ,(make-live-info) ,u (asm ,info ,(asm-add #f) ,u ,y))
-                 `(asm ,info ,(pick-asm-op op info) ,u (immediate 0))))))]))
+  (define-instruction value (load-single->double)
+    [(op (x fpur) (y fpmem))
+     (let ([u (make-tmp 'u 'fp)])
+       (seq
+        `(set! ,(make-live-info) ,u (asm ,null-info ,asm-fpmove-single ,y))
+        `(set! ,(make-live-info) ,x (asm ,info ,(asm-fl-cvt 'single->double) ,u))))])
 
-  (let ()
-    ; vldr, vstr allow only word offsets, and we require byte offset due to the type tag
-    (module (with-flonum-data-pointers)
-      (define $flonum-data-pointer
-        (lambda (x p)
-          (with-output-language (L15d Effect)
-            (let ([u (make-tmp 'u)])
-              (seq
-                `(set! ,(make-live-info) ,u (asm ,null-info ,(asm-add #f) ,x (immediate ,(constant flonum-data-disp))))
-                (p u))))))
-      (define-syntax with-flonum-data-pointers
-        (syntax-rules ()
-          [(_ () e1 e2 ...) (begin e1 e2 ...)]
-          [(_ (x1 x2 ...) e1 e2 ...)
-           ($flonum-data-pointer x1
-             (lambda (x1)
-               (with-flonum-data-pointers (x2 ...) e1 e2 ...)))])))
+  (define-instruction effect (store-double->single)
+    [(op (x fpmem) (y fpur))
+     (let ([u (make-tmp 'u 'fp)])
+       (seq
+        `(set! ,(make-live-info) ,u (asm ,null-info ,(asm-fl-cvt 'double->single) ,y))
+        `(asm ,info ,asm-fpmove-single ,x ,u)))])
+
+  (define-instruction effect (store-single)
+    [(op (x fpmem) (y fpur))
+     `(asm ,info ,asm-fpmove-single ,x ,y)])
+
+  (define-instruction value (load-single)
+    [(op (x fpur) (y fpmem))
+     `(set! ,(make-live-info) ,x (asm ,info ,asm-fpmove-single ,y))])
+
+  (define-instruction value (single->double double->single)
+    [(op (x fpur) (y fpur))
+     `(set! ,(make-live-info) ,x (asm ,info ,(asm-fl-cvt op) ,y))])
 
+  (let ()
     (define (fpmem->mem mem dir)
       (with-output-language (L15d Triv)
         (nanopass-case (L15d Triv) mem
@@ -783,45 +763,48 @@
 
     (define-instruction value (fpt)
       [(op (x fpur) (y ur))
-       `(set! ,(make-live-info) ,x (asm ,info ,asm-fpt ,y))])
+       (let ([u (make-tmp 'u 'fp)])
+         (seq
+          `(set! ,(make-live-info) ,u (asm ,null-info ,asm-kill))
+          `(set! ,(make-live-info) ,x (asm ,info ,asm-fpt ,y, u))))])
 
     (define-instruction value (fpmove)
       [(op (x fpmem) (y fpur)) `(set! ,(make-live-info) ,x ,y)]
-      [(op (x fpur) (y fpur)) `(set! ,(make-live-info) ,x ,y)])
+      [(op (x fpur) (y fpmem fpur)) `(set! ,(make-live-info) ,x ,y)])
 
     (define-instruction value (fpcastto/hi)
       [(op (x ur) (y fpmem)) `(set! ,(make-live-info) ,x ,(fpmem->mem y 'hi))]
       [(op (x ur) (y fpur)) `(set! ,(make-live-info) ,x (asm ,info ,(asm-fpcastto 'hi) ,y))])
     
-  (define-instruction value (fpcastto/lo)
-    [(op (x ur) (y fpmem)) `(set! ,(make-live-info) ,x ,(fpmem->mem y 'lo))]
-    [(op (x ur) (y fpur)) `(set! ,(make-live-info) ,x (asm ,info ,(asm-fpcastto 'lo) ,y))])
-
-  (define-instruction value (fpcastfrom)
-    [(op (x fpmem) (hi ur) (lo ur)) (seq
-                                     `(set! ,(make-live-info) ,(fpmem->mem x 'lo) ,lo)
-                                     `(set! ,(make-live-info) ,(fpmem->mem x 'hi) ,hi))]
-    [(op (x fpur) (hi ur) (lo ur)) `(set! ,(make-live-info) ,x (asm ,info ,asm-fpcastfrom ,lo ,hi))])
-
-
-
-    (define-instruction value (fp+ fp- fp/ fp*)
-      [(op (x fpur) (y fpur) (z fpur))
-       `(set! ,(make-live-info) ,x (asm ,info ,(asm-fpop-2 op) ,y ,z))])
-
-    (define-instruction value (fpsqrt)
-      [(op (x fpur) (y fpur))
-       `(set! ,(make-live-info) ,x (asm ,info ,asm-fpsqrt ,y))])
-
-    (define-instruction value (trunc)
-      [(op (z ur) (x ur))
-       (with-flonum-data-pointers (x)
-         `(set! ,(make-live-info) ,z (asm ,info ,asm-trunc ,x)))])
+    (define-instruction value (fpcastto/lo)
+      [(op (x ur) (y fpmem)) `(set! ,(make-live-info) ,x ,(fpmem->mem y 'lo))]
+      [(op (x ur) (y fpur)) `(set! ,(make-live-info) ,x (asm ,info ,(asm-fpcastto 'lo) ,y))])
+    
+    (define-instruction value (fpcastfrom)
+      [(op (x fpmem) (hi ur) (lo ur)) (seq
+                                       `(set! ,(make-live-info) ,(fpmem->mem x 'lo) ,lo)
+                                       `(set! ,(make-live-info) ,(fpmem->mem x 'hi) ,hi))]
+      [(op (x fpur) (hi ur) (lo ur)) `(set! ,(make-live-info) ,x (asm ,info ,asm-fpcastfrom ,lo ,hi))]))
+
+  (define-instruction value (fp+ fp- fp/ fp*)
+    [(op (x fpur) (y fpur) (z fpur))
+     `(set! ,(make-live-info) ,x (asm ,info ,(asm-fpop-2 op) ,y ,z))])
+
+  (define-instruction value (fpsqrt)
+    [(op (x fpur) (y fpur))
+     `(set! ,(make-live-info) ,x (asm ,info ,asm-fpsqrt ,y))])
+
+  (define-instruction value (fptrunc)
+    [(op (z ur) (x fpur))
+     (let ([u (make-tmp 'u 'fp)])
+       (seq
+        `(set! ,(make-live-info) ,u (asm ,null-info ,asm-kill))
+        `(set! ,(make-live-info) ,z (asm ,info ,asm-fptrunc ,x ,u))))])
 
-    (define-instruction pred (fp= fp< fp<=)
-      [(op (x fpur) (y fpur))
-       (let ([info (make-info-condition-code op #f #f)])
-         (values '() `(asm ,info ,(asm-fp-relop info) ,x ,y)))]))
+  (define-instruction pred (fp= fp< fp<=)
+    [(op (x fpur) (y fpur))
+     (let ([info (make-info-condition-code op #f #f)])
+       (values '() `(asm ,info ,(asm-fp-relop info) ,x ,y)))])
 
   (define-instruction effect (inc-cc-counter)
     [(op (x ur) (w ur funky12) (z funky12 ur))
@@ -946,14 +929,16 @@
     ; NB: compiler ipmlements init-lock! and unlock! as 32-bit store of zero 
     (define-instruction pred (lock!)
       [(op (x ur) (y ur) (w funky12))
-       (let ([u (make-tmp 'u)])
+       (let ([u (make-tmp 'u)]
+             [u2 (make-tmp 'u2)])
          (values
            (lea->reg x y w
              (lambda (r)
                (with-output-language (L15d Effect)
                  (seq
                    `(set! ,(make-live-info) ,u (asm ,null-info ,asm-kill))
-                   `(asm ,null-info ,asm-lock ,r ,u)))))
+                   `(set! ,(make-live-info) ,u2 (asm ,null-info ,asm-kill))
+                   `(asm ,null-info ,asm-lock ,r ,u ,u2)))))
            `(asm ,info-cc-eq ,asm-eq ,u (immediate 0))))])
     (define-instruction effect (locked-incr! locked-decr!)
       [(op (x ur) (y ur) (w funky12))
@@ -994,6 +979,9 @@
   (define-instruction effect (vpush-multiple)
     [(op) `(asm ,info ,(asm-vpush-multiple (info-vpush-reg info) (info-vpush-n info)))])
 
+  (define-instruction effect (vpop-multiple)
+    [(op) `(asm ,info ,(asm-vpop-multiple (info-vpush-reg info) (info-vpush-n info)))])
+
   (define-instruction effect save-flrv
     [(op) `(asm ,info ,asm-save-flrv)])
 
@@ -1009,13 +997,12 @@
                      asm-move asm-move/extend asm-load asm-store asm-swap asm-library-call asm-library-call! asm-library-jump
                      asm-mul asm-smull asm-cmp/shift asm-add asm-sub asm-rsb asm-logand asm-logor asm-logxor asm-bic
                      asm-pop-multiple asm-shiftop asm-logand asm-lognot
-                     asm-logtest asm-fp-relop asm-relop asm-push-multiple asm-vpush-multiple
+                     asm-logtest asm-fp-relop asm-relop asm-push-multiple asm-vpush-multiple asm-vpop-multiple
                      asm-indirect-jump asm-literal-jump
                      asm-direct-jump asm-return-address asm-jump asm-conditional-jump asm-data-label
                      asm-rp-header asm-rp-compact-header
                      asm-indirect-call asm-condition-code
-                     asm-fl-load/store 
-                     asm-fl-load/cvt asm-fl-store/cvt asm-fpt asm-fpmove asm-fpcastto asm-fpcastfrom asm-trunc 
+                     asm-fpmove-single asm-fl-cvt asm-fpt asm-fpmove asm-fpcastto asm-fpcastfrom asm-fptrunc 
                      asm-lock asm-lock+/- asm-cas
                      asm-fpop-2 asm-fpsqrt asm-c-simple-call
                      asm-save-flrv asm-restore-flrv asm-return asm-c-return asm-size
@@ -1190,7 +1177,8 @@
 
   (define-op popm  pm-op #b10001011) 
   (define-op pushm pm-op #b10010010)
-  (define-op vpushm vpushm-op)
+  (define-op vpushm vpm-op #b11010 #b10)
+  (define-op vpopm  vpm-op #b11001 #b11)
 
   (define-op vldr.sgl vldr/vstr-op #b1010 #b01)
   (define-op vldr.dbl vldr/vstr-op #b1011 #b01)
@@ -1221,6 +1209,7 @@
   (define-op mrc mrc/mcr-op #b1)
 
   (define-op vadd vadd-op #b11 #b0 #b11100)
+
   (define-op vsub vadd-op #b11 #b1 #b11100)
   (define-op vmul vadd-op #b10 #b0 #b11100)
   (define-op vdiv vadd-op #b00 #b0 #b11101)
@@ -1485,7 +1474,7 @@
   (define vmov.gpr-op
     (lambda (op dir flreg flreg-delta gpreg code*)
       (let-values ([(n vn) (ax-flreg->bits flreg flreg-delta)])
-        (emit-code (op flreg gpreg code*)
+        (emit-code (op flreg gpreg flreg-delta code*)
           [28 (ax-cond 'al)]
           [21 #b1110000]
           [20 dir]
@@ -1574,14 +1563,15 @@
         [12 #b1111]
         [0  #b101000010000])))
 
-  (define vpushm-op
-    (lambda (op flreg n code*)
+  (define vpm-op
+    (lambda (op opcode opcode2 flreg n code*)
       (let-values ([(d vd) (ax-flreg->bits flreg)])
         (emit-code (op flreg n code*)
           [28 (ax-cond 'al)]
-          [23 #b11010]
+          [23 opcode]
           [22 d]
-          [16 #b101101]
+          [20 opcode2]
+          [16 #b1101]
           [12 vd]
           [8  #b1011]
           [0  (fxsll n 1)]))))
@@ -1959,40 +1949,15 @@
         (Trivit (src0 src1)
           (emit cmp/shift count type src0 src1 code*)))))
 
-  (define-who asm-fl-load/cvt
-    (lambda (op flreg)
-      (lambda (code* base offset)
-        (Trivit (base offset)
-          (case op
-            [(load-single->double)
-             (emit vldr.sgl %fptmp2 base (ax-imm-data offset)
-               (emit vcvt.sgl->dbl flreg %fptmp2 code*))]
-            [(load-double->single)
-             (emit vldr.dbl %fptmp2 base (ax-imm-data offset)
-               (emit vcvt.dbl->sgl flreg %fptmp2 code*))]
-            [else (sorry! who "unrecognized op ~s" op)])))))
-
-  (define-who asm-fl-store/cvt
-    (lambda (op flreg)
-      (lambda (code* base offset)
-        (Trivit (base offset)
-          (case op
-            [(store-single->double)
-             (emit vcvt.sgl->dbl %fptmp2 flreg
-               (emit vstr.dbl %fptmp2 base (ax-imm-data offset) code*))]
-            [else (sorry! who "unrecognized op ~s" op)])))))
-
-  (define-who asm-fl-load/store
-    (lambda (op flreg)
-      (lambda (code* base offset)
-        (Trivit (base offset)
-          (let ([offset (ax-imm-data offset)])
-            (case op
-              [(load-single) (emit vldr.sgl flreg base offset code*)]
-              [(load-double) (emit vldr.dbl flreg base offset code*)]
-              [(store-single) (emit vstr.sgl flreg base offset code*)]
-              [(store-double) (emit vstr.dbl flreg base offset code*)]
-              [else (sorry! who "unrecognized op ~s" op)]))))))
+  (define-who asm-fl-cvt
+    (lambda (op)
+      (lambda (code* dest-reg src-reg)
+        (case op
+          [(single->double)
+           (emit vcvt.sgl->dbl dest-reg src-reg code*)]
+          [(double->single)
+           (emit vcvt.dbl->sgl dest-reg src-reg code*)]
+          [else (sorry! who "unrecognized op ~s" op)]))))
 
   (define-who asm-load
     (lambda (type)
@@ -2060,37 +2025,51 @@
     (lambda (code* dest src)
       (emit vsqrt dest src code*)))
 
-  (define asm-trunc
-    (lambda (code* dest flonumreg)
-      (Trivit (dest flonumreg)
-        (emit vldr.dbl %fptmp1 flonumreg 0
-          (emit vcvt.dbl->s32 %fptmp1 %fptmp1
-            (emit vmov.s32->gpr %fptmp1 0 dest code*))))))
+  (define asm-fptrunc
+    (lambda (code* dest flonumreg tmpreg)
+      (Trivit (dest)
+        (emit vcvt.dbl->s32 tmpreg flonumreg
+            (emit vmov.s32->gpr tmpreg 0 dest code*)))))
 
   (define asm-fpt
-    (lambda (code* dest src)
+    (lambda (code* dest src tmpreg)
       (Trivit (src)
-        (emit vmov.gpr->s32 %fptmp1 0 src
-          (emit vcvt.s32->dbl %fptmp1 dest code*))))) 
+        (emit vmov.gpr->s32 tmpreg 0 src
+          (emit vcvt.s32->dbl dest tmpreg code*))))) 
 
   (define-who asm-fpmove
     ;; fpmove pseudo instruction is used by set! case in
     ;; select-instructions! and generate-code; at most one of src or
     ;; dest can be an mref, and then the offset is double-aligned
     (lambda (code* dest src)
+      (gen-fpmove who code* dest src #t)))
+
+  (define-who asm-fpmove-single
+    ;; fpmove pseudo instruction is used by set! case in
+    ;; select-instructions! and generate-code; at most one of src or
+    ;; dest can be an mref, and then the offset is double-aligned
+    (lambda (code* dest src)
+      (gen-fpmove who code* dest src #f)))
+
+  (define gen-fpmove
+    (lambda (who code* dest src double?)
       (let ([dest-it dest]
             [src-it src])
         (Trivit (dest-it src-it)
           (record-case dest-it
             [(disp) (imm reg)
              (safe-assert (fx= 0 (fxand imm #b11)))
-             (emit vstr.dbl src (cons 'reg reg) imm code*)]
+             (if double?
+                 (emit vstr.dbl src (cons 'reg reg) imm code*)
+                 (emit vstr.sgl src (cons 'reg reg) imm code*))]
             [(index) (n ireg breg) (sorry! who "cannot handle indexed fp dest ref")]
             [else
              (record-case src-it
                [(disp) (imm reg)
                 (safe-assert (fx= 0 (fxand imm #b11)))
-                (emit vldr.dbl dest (cons 'reg reg) imm code*)]
+                (if double?
+                    (emit vldr.dbl dest (cons 'reg reg) imm code*)
+                    (emit vldr.sgl dest (cons 'reg reg) imm code*))]
                [(index) (n ireg breg) (sorry! who "cannot handle indexed fp src ref")]
                [else (emit vmov.fpr dest src code*)])])))))
 
@@ -2099,8 +2078,8 @@
       (lambda (code* dest src)
         (Trivit (dest)
           (if (eq? part 'lo)
-              (emit vmov.gpr->s32 src 0 dest code*)
-              (emit vmov.gpr->s32 src 1 dest code*))))))
+              (emit vmov.s32->gpr src 0 dest code*)
+              (emit vmov.s32->gpr src 1 dest code*))))))
 
   (define asm-fpcastfrom
     (lambda (code* dest lo-src hi-src)
@@ -2119,19 +2098,19 @@
               [else (sorry! who "unexpected asm-swap type argument ~s" type)]))))))
 
   (define asm-lock
-    ;  tmp = ldrex src
-    ;  cmp tmp, 0
+    ;  tmp2 = ldrex src
+    ;  cmp tmp2, 0
     ;  bne L1 (+2)
-    ;  tmp = 1
-    ;  tmp = strex tmp, src
+    ;  tmp2 = 1
+    ;  tmp = strex tmp2, src
     ;L1:
-    (lambda (code* src tmp)
-      (Trivit (src tmp)
-        (emit ldrex tmp src
-          (emit cmpi tmp 0
+    (lambda (code* src tmp tmp2)
+      (Trivit (src tmp tmp2)
+        (emit ldrex tmp2 src
+          (emit cmpi tmp2 0
             (emit bnei 1
-              (emit movi1 tmp 1
-                (emit strex tmp tmp src code*))))))))
+              (emit movi1 tmp2 1
+                (emit strex tmp tmp2 src code*))))))))
 
   (define-who asm-lock+/-
     ; L:
@@ -2211,6 +2190,11 @@
       (lambda (code*)
         (emit vpushm reg n code*))))
 
+  (define asm-vpop-multiple
+    (lambda (reg n)
+      (lambda (code*)
+        (emit vpopm reg n code*))))
+
   (define asm-save-flrv
     (lambda (code*)
       (let ([sp (cons 'reg %sp)])
@@ -2271,7 +2255,8 @@
 
   (define asm-direct-jump
     (lambda (l offset)
-      (asm-helper-jump '() (make-funcrel 'arm32-jump l offset))))
+      (let ([offset (adjust-return-point-offset offset l)])
+        (asm-helper-jump '() (make-funcrel 'arm32-jump l offset)))))
 
   (define asm-literal-jump
     (lambda (info)
@@ -2313,17 +2298,18 @@
         (or (cond
               [(local-label-offset l) =>
                (lambda (offset)
-                 (let ([disp (fx- next-addr (fx- offset incr-offset) 4)])
-                   (cond
-                     [(funky12 disp)
-                      (Trivit (dest)
-                        ; aka adr, encoding A1
-                        (emit addi #f dest `(reg . ,%pc) disp '()))]
-                     [(funky12 (- disp))
-                      (Trivit (dest)
-                        ; aka adr, encoding A2
-                        (emit subi #f dest `(reg . ,%pc) (- disp) '()))]
-                     [else #f])))]
+                 (let ([incr-offset (adjust-return-point-offset incr-offset l)])
+                   (let ([disp (fx- next-addr (fx- offset incr-offset) 4)])
+                     (cond
+                       [(funky12 disp)
+                        (Trivit (dest)
+                          ; aka adr, encoding A1
+                          (emit addi #f dest `(reg . ,%pc) disp '()))]
+                       [(funky12 (- disp))
+                        (Trivit (dest)
+                          ; aka adr, encoding A2
+                          (emit subi #f dest `(reg . ,%pc) (- disp) '()))]
+                      [else #f]))))]
               [else #f])
             (asm-move '() dest (with-output-language (L16 Triv) `(label-ref ,l ,incr-offset)))))))
 
@@ -2562,6 +2548,8 @@
 		    (or (andmap double-member? members)
 			(andmap float-member? members)))))]
 	[else #f]))
+    (define num-int-regs 4) ; number of integer registers normally usd by the ABI
+    (define num-dbl-regs 8) ; number of `double` registers normally usd by the ABI
     (define sgl-regs (lambda () (list %Cfparg1 %Cfparg1b %Cfparg2 %Cfparg2b %Cfparg3 %Cfparg3b %Cfparg4 %Cfparg4b
 				      %Cfparg5 %Cfparg5b %Cfparg6 %Cfparg6b %Cfparg7 %Cfparg7b %Cfparg8 %Cfparg8b)))
     (define-who asm-foreign-call
@@ -2570,15 +2558,13 @@
         (letrec ([load-double-stack
                    (lambda (offset)
                      (lambda (x) ; requires var
-                       (%seq
-                         (inline ,(make-info-loadfl %fptmp1) ,%load-double ,x ,%zero ,(%constant flonum-data-disp))
-                         (inline ,(make-info-loadfl %fptmp1) ,%store-double ,%sp ,%zero (immediate ,offset)))))]
+                      `(set! ,(%mref ,%sp ,%zero ,offset fp)
+                             ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                  [load-single-stack
                    (lambda (offset)
                      (lambda (x) ; requires var
-                       (%seq
-			 (inline ,(make-info-loadfl %fptmp1) ,%load-double->single ,x ,%zero ,(%constant flonum-data-disp))
-                         (inline ,(make-info-loadfl %fptmp1) ,%store-single ,%sp ,%zero (immediate ,offset)))))]
+                      (%inline store-double->single ,(%mref ,%sp ,%zero ,offset fp)
+                               ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                  [load-int-stack
                    (lambda (offset)
                      (lambda (rhs) ; requires rhs
@@ -2611,11 +2597,21 @@
                  [load-double-reg
                    (lambda (fpreg fp-disp)
                      (lambda (x) ; requires var
-                       `(inline ,(make-info-loadfl fpreg) ,%load-double ,x ,%zero (immediate ,fp-disp))))]
+                       `(set! ,fpreg ,(%mref ,x ,%zero ,fp-disp fp))))]
                  [load-single-reg
                    (lambda (fpreg fp-disp single?)
                      (lambda (x) ; requires var
-                       `(inline ,(make-info-loadfl fpreg) ,(if single? %load-single %load-double->single) ,x ,%zero (immediate ,fp-disp))))]
+                       (let ([%op (if single? %load-single %double->single)])
+                         `(set! ,fpreg (inline ,null-info ,%op ,(%mref ,x ,%zero ,fp-disp fp))))))]
+                 [load-double-int-reg
+                   (lambda (loreg hireg)
+                     (lambda (x) ; requires var
+                       (let-values ([(endreg otherreg) (constant-case native-endianness
+                                                         [(little) (values loreg hireg)]
+                                                         [(big) (values hireg loreg)])])
+                         (%seq
+                          (set! ,endreg ,(%mref ,x ,(constant flonum-data-disp)))
+                          (set! ,otherreg ,(%mref ,x ,(fx+ 4 (constant flonum-data-disp))))))))]
                  [load-int-reg
                    (lambda (ireg)
                      (lambda (x)
@@ -2649,34 +2645,46 @@
                          (set! ,loreg ,(%mref ,x ,from-offset))
                          (set! ,hireg ,(%mref ,x ,(fx+ from-offset 4))))))]
                  [do-args
-                  (lambda (types)
+                  (lambda (types varargs?)
                     ; sgl* is always of even-length, i.e., has a sgl/dbl reg first
                     ; bsgl is set to "b" single (second half of double) if we have one to fill
-                    (let loop ([types types] [locs '()] [live* '()] [int* (int-regs)] [sgl* (sgl-regs)] [bsgl #f] [isp 0])
+                    (let loop ([types types] [locs '()] [live* '()] [int* (int-regs)] [sgl* (if varargs? '() (sgl-regs))] [bsgl #f] [isp 0])
                       (if (null? types)
                           (values isp locs live*)
                           (nanopass-case (Ltype Type) (car types)
                             [(fp-double-float)
-                             (if (null? sgl*)
-                                 (let ([isp (align 8 isp)])
-                                   (loop (cdr types)
+                             (cond
+                               [(and varargs?
+                                     ;; For varargs, treat a double like a 64-bit integer
+                                     (let ([int* (if (even? (length int*)) int* (cdr int*))])
+                                       (and (pair? int*)
+                                            int*)))
+                                => (lambda (int*)
+                                     (loop (cdr types)
+                                       (cons (load-double-int-reg (car int*) (cadr int*)) locs)
+                                       (cons* (car int*) (cadr int*) live*) (cddr int*) sgl* bsgl isp))]
+                               [(null? sgl*)
+                                (let ([isp (align 8 isp)])
+                                  (loop (cdr types)
                                      (cons (load-double-stack isp) locs)
-                                     live* int* '() #f (fx+ isp 8)))
-                                 (loop (cdr types)
-                                   (cons (load-double-reg (car sgl*) (constant flonum-data-disp)) locs)
-                                   live* int* (cddr sgl*) bsgl isp))]
+                                     live* int* '() #f (fx+ isp 8)))]
+                               [else
+                                (loop (cdr types)
+                                  (cons (load-double-reg (car sgl*) (constant flonum-data-disp)) locs)
+                                  (cons (car sgl*) live*) int* (cddr sgl*) bsgl isp)])]
                             [(fp-single-float)
+                             (safe-assert (not varargs?))
                              (if bsgl
                                  (loop (cdr types)
                                    (cons (load-single-reg bsgl (constant flonum-data-disp) #f) locs)
-                                   live* int* sgl* #f isp)
+                                   (cons bsgl live*) int* sgl* #f isp)
                                  (if (null? sgl*)
                                      (loop (cdr types)
                                        (cons (load-single-stack isp) locs)
                                        live* int* '() #f (fx+ isp 4))
                                      (loop (cdr types)
                                        (cons (load-single-reg (car sgl*) (constant flonum-data-disp) #f) locs)
-                                       live* int* (cddr sgl*) (cadr sgl*) isp)))]
+                                       (cons (car sgl*) live*) int* (cddr sgl*) (cadr sgl*) isp)))]
 			    [(fp-ftd& ,ftd)
 			     (let ([size ($ftd-size ftd)]
 				   [members ($ftd->members ftd)]
@@ -2688,7 +2696,8 @@
 				 [(8)
 				  (let* ([int* (if (even? (length int*)) int* (cdr int*))]
 					 [num-members (length members)]
-					 [doubles? (and (fx<= num-members 4)
+					 [doubles? (and (not varargs?)
+                                                        (fx<= num-members 4)
 							(andmap double-member? members))])
 				    ;; Sequence of up to 4 doubles that fits in registers?
 				    (cond
@@ -2721,14 +2730,15 @@
 							(cons* (car int*) (cadr int*) live*) (cddr int*) isp))]))]))]
 				 [else
 				    (let* ([num-members (length members)]
-					   [floats? (and (fx<= num-members 4)
+					   [floats? (and (not varargs?)
+                                                         (fx<= num-members 4)
 							 (andmap float-member? members))])
 				      ;; Sequence of up to 4 floats that fits in registers?
 				      (cond
 				       [(and floats?
 					     (fx>= (fx+ (length sgl*) (if bsgl 1 0)) num-members))
 					;; Allocate each float to register
-					(let flt-loop ([size size] [offset 0] [sgl* sgl*] [bsgl bsgl] [loc #f])
+					(let flt-loop ([size size] [offset 0] [sgl* sgl*] [bsgl bsgl] [loc #f] [live* live*])
 					  (cond
 					   [(fx= size 0)
 					    (loop (cdr types) (cons loc locs) live* int* sgl* bsgl isp)]
@@ -2736,7 +2746,8 @@
 					    (flt-loop (fx- size 4) (fx+ offset 4)
 						      (if bsgl sgl* (cddr sgl*))
 						      (if bsgl #f (cadr sgl*))
-						      (combine-loc loc (load-single-reg (or bsgl (car sgl*)) offset #t)))]))]
+						      (combine-loc loc (load-single-reg (or bsgl (car sgl*)) offset #t))
+                                                      (cons (or bsgl (car sgl*)) live*))]))]
 				       [else
 					;; General case; use integer registers while available,
 					;;  possibly splitting between registers and stack
@@ -2800,8 +2811,9 @@
 					 (fx+ offset (if double? 8 4))
 					 `(seq
 					   ,e
-					   (inline ,(make-info-loadfl (car sgl*)) ,(if double? %store-double %store-single)
-						   ,dest-x ,%zero (immediate ,offset))))])))]
+                                           ,(if double?
+                                                `(set! ,(%mref ,dest-x ,%zero ,offset fp) ,(car sgl*))
+                                                (%inline store-single ,(%mref ,dest-x ,%zero ,offset fp) ,(car sgl*)))))])))]
 			    [else
 			     ;; result is in %Cretval and maybe %r1
 			     `(seq
@@ -2817,13 +2829,36 @@
 				  [(8) `(seq
 					 (set! ,(%mref ,dest-x ,0) ,%Cretval)
 					 (set! ,(%mref ,dest-x ,4) ,%r1))]))]))])]
-		     [else e]))])
+		     [else e]))]
+                 [get-result-regs
+                  (lambda (result-type varargs?)
+                    (nanopass-case (Ltype Type) result-type
+                      [(fp-double-float)
+                       (if varargs?
+                           (list %r1 %Cretval)
+                           (list %Cfpretval))]
+                      [(fp-single-float)
+                       (if varargs?
+                           (list %Cretval)
+                           (list %Cfpretval))]
+                      [(fp-integer ,bits)
+                       (case bits
+                         [(64) (list %r1 %Cretval)]
+                         [else (list %Cretval)])]
+                      [(fp-unsigned ,bits)
+                       (case bits
+                         [(64) (list %r1 %Cretval)]
+                         [else (list %Cretval)])]
+                      [else (list %r0)]))])
           (lambda (info)
             (safe-assert (reg-callee-save? %tc)) ; no need to save-restore
             (let* ([arg-type* (info-foreign-arg-type* info)]
+                   [varargs? (memq 'varargs (info-foreign-conv* info))]
 		   [result-type (info-foreign-result-type info)]
+                   [result-reg* (get-result-regs result-type varargs?)]
 		   [fill-result-here? (indirect-result-that-fits-in-registers? result-type)])
-              (with-values (do-args (if fill-result-here? (cdr arg-type*) arg-type*))
+              (with-values (do-args (if fill-result-here? (cdr arg-type*) arg-type*)
+                                    varargs?)
                 (lambda (args-frame-size locs live*)
                   (let* ([frame-size (align 8 (+ args-frame-size
 						 (if fill-result-here?
@@ -2844,16 +2879,25 @@
 			 [else locs]))
                       (lambda (t0)
 			(add-fill-result fill-result-here? result-type args-frame-size
-			  `(inline ,(make-info-kill*-live* (reg-list %r0) live*) ,%c-call ,t0)))
+			  `(inline ,(make-info-kill*-live* result-reg* live*) ,%c-call ,t0)))
                       (nanopass-case (Ltype Type) result-type
                         [(fp-double-float)
-                         (lambda (lvalue)
-                           `(inline ,(make-info-loadfl %Cfpretval) ,%store-double ,lvalue ,%zero
-                              ,(%constant flonum-data-disp)))]
+                         (if varargs?
+                             (lambda (lvalue)
+                               `(set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp)
+                                      ,(%inline fpcastfrom ,%r1 ,%Cretval)))
+                             (lambda (lvalue)
+                               `(set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp)
+                                      ,%Cfpretval)))]
                         [(fp-single-float)
-                         (lambda (lvalue)
-                           `(inline ,(make-info-loadfl %Cfpretval) ,%store-single->double ,lvalue ,%zero
-                              ,(%constant flonum-data-disp)))]
+                         (if varargs?
+                             (lambda (lvalue)
+                               (let ([t %Cfpretval]) ; should be ok as a temporary register
+                                 `(seq
+                                   (set! ,t ,(%inline fpcastfrom ,%r1 ,%Cretval)) ; we don't actually care about the hi/%r1 part
+                                   (set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp) ,(%inline single->double ,t)))))
+                             (lambda (lvalue)
+                               `(set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp) ,(%inline single->double ,%Cfpretval))))]
                         [(fp-integer ,bits)
                          (case bits
                            [(8) (lambda (lvalue) `(set! ,lvalue ,(%inline sext8 ,%r0)))]
@@ -2884,31 +2928,31 @@
                    +---------------------------+
                    |                           |
                    |    incoming stack args    |
-  sp+36+R+X+Y+Z+W: |                           |
+  sp+52+R+X+Y+Z+W: |                           |
                    +---------------------------+<- 8-byte boundary
                    |                           | 
                    |    saved int reg args     | 0-4 words
-    sp+36+R+X+Y+Z: |                           |
+    sp+52+R+X+Y+Z: |                           |
                    +---------------------------+
                    |                           | 
                    |   pad word if necessary   | 0-1 words
-      sp+36+R+X+Y: |                           |
+      sp+52+R+X+Y: |                           |
                    +---------------------------+<- 8-byte boundary
                    |                           | 
                    |   saved float reg args    | 0-16 words
-        sp+36+R+X: |                           |
+        sp+52+R+X: |                           |
                    +---------------------------+<- 8-byte boundary
                    |                           | 
                    |      &-return space       | up to 8 words
-          sp+36+R: |                           |
+          sp+52+R: |                           |
                    +---------------------------+<- 8-byte boundary
                    |                           | 
                    |   pad word if necessary   | 0-1 words
-            sp+36: |                           |
+            sp+52: |                           |
                    +---------------------------+
                    |                           |
-                   |   callee-save regs + lr   | 9 words
-             sp+0: |                           | 
+                   |   callee-save regs + lr   | 13 words
+             sp+0: |   callee-save fpregs      |
                    +---------------------------+<- 8-byte boundary
 
       X = 0 or 4 (depending on whether pad is present)
@@ -2920,15 +2964,13 @@
           (define load-double-stack
             (lambda (offset)
               (lambda (x) ; requires var
-                (%seq
-                  (inline ,(make-info-loadfl %fptmp1) ,%load-double ,%sp ,%zero (immediate ,offset))
-                  (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+                `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                       ,(%mref ,%sp ,%zero ,offset fp)))))
           (define load-single-stack
             (lambda (offset)
               (lambda (x) ; requires var
-                (%seq
-                  (inline ,(make-info-loadfl %fptmp1) ,%load-single->double ,%sp ,%zero (immediate ,offset))
-                  (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+                `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                       ,(%inline load-single->double ,(%mref ,%sp ,%zero ,offset fp))))))
           (define load-int-stack
             (lambda (type offset)
               (lambda (lvalue)
@@ -2957,16 +2999,19 @@
 	      (lambda (lvalue)
 		`(set! ,lvalue ,(%inline + ,%sp (immediate ,offset))))))
           (define count-reg-args
-            (lambda (types synthesize-first?)
+            (lambda (types synthesize-first? varargs?)
               ; bsgl? is #t iff we have a "b" single (second half of double) float reg to fill
               (let f ([types types] [iint (if synthesize-first? -1 0)] [idbl 0] [bsgl? #f])
                 (if (null? types)
                     (values iint idbl)
                     (nanopass-case (Ltype Type) (car types)
                       [(fp-double-float)
-                       (if (fx< idbl 8)
-                           (f (cdr types) iint (fx+ idbl 1) bsgl?)
-                           (f (cdr types) iint idbl #f))]
+                       (if varargs?
+                           (let ([iint (align 2 iint)])
+                             (f (cdr types) (if (fx< iint num-int-regs) (fx+ iint 2) iint) idbl bsgl?))
+                           (if (fx< idbl 8)
+                               (f (cdr types) iint (fx+ idbl 1) bsgl?)
+                               (f (cdr types) iint idbl #f)))]
                       [(fp-single-float)
                        (if bsgl?
                            (f (cdr types) iint idbl #f)
@@ -3007,8 +3052,8 @@
                              [(fp-unsigned ,bits) (fx= bits 64)]
                              [else #f])
                            (let ([iint (align 2 iint)])
-                             (f (cdr types) (if (fx< iint 4) (fx+ iint 2) iint) idbl bsgl?))
-                           (f (cdr types) (if (fx< iint 4) (fx+ iint 1) iint) idbl bsgl?))])))))
+                             (f (cdr types) (if (fx< iint num-int-regs) (fx+ iint 2) iint) idbl bsgl?))
+                           (f (cdr types) (if (fx< iint num-int-regs) (fx+ iint 1) iint) idbl bsgl?))])))))
           (define do-stack
             ; all of the args are on the stack at this point, though not contiguous since
             ; we push all of the int reg args with one push instruction and all of the
@@ -3016,7 +3061,7 @@
 	    ; continue on into the stack variables, which is convenient when a struct
 	    ; argument is split across registers and the stack
             (lambda (types saved-reg-bytes pre-pad-bytes return-bytes float-reg-bytes post-pad-bytes int-reg-bytes
-			   synthesize-first?)
+			   synthesize-first? varargs?)
               (let* ([return-space-offset (fx+ saved-reg-bytes pre-pad-bytes)]
                      [float-reg-offset (fx+ return-space-offset return-bytes)]
                      [int-reg-offset (fx+ float-reg-offset float-reg-bytes post-pad-bytes)]
@@ -3037,20 +3082,36 @@
                             locs))
                       (nanopass-case (Ltype Type) (car types)
                         [(fp-double-float)
-                         (if (< idbl 8)
-                             (loop (cdr types)
-                               (cons (load-double-stack float-reg-offset) locs)
-                               iint (fx+ idbl 1) bsgl-offset int-reg-offset (fx+ float-reg-offset 8) stack-arg-offset)
-                             (let ([stack-arg-offset (align 8 stack-arg-offset)])
-                               (loop (cdr types)
-                                 (cons (load-double-stack stack-arg-offset) locs)
-                                 iint 8 #f int-reg-offset float-reg-offset (fx+ stack-arg-offset 8))))]
+                         (cond
+                           [(and varargs?
+                                 ;; For varargs, treat a double like a 64-bit integer
+                                 (let ([iint (align 2 iint)])
+                                   (and (fx< iint num-int-regs)
+                                        iint)))
+                            => (lambda (new-iint)
+                                 (let ([int-reg-offset (if (fxeven? iint) int-reg-offset (fx+ int-reg-offset 4))]
+                                       [iint new-iint])
+                                   (loop (cdr types)
+                                     (cons (load-double-stack int-reg-offset) locs)
+                                     (fx+ iint 2) idbl bsgl-offset (fx+ int-reg-offset 8) float-reg-offset stack-arg-offset)))]
+                           [(and (not varargs?)
+                                 (< idbl num-dbl-regs))
+                            (loop (cdr types)
+                              (cons (load-double-stack float-reg-offset) locs)
+                              iint (fx+ idbl 1) bsgl-offset int-reg-offset (fx+ float-reg-offset 8) stack-arg-offset)]
+                           [else
+                            (let ([stack-arg-offset (align 8 stack-arg-offset)]
+                                  [iint (if varargs? (align 2 iint) iint)]) ; use up register if argument didn't fit
+                              (loop (cdr types)
+                                (cons (load-double-stack stack-arg-offset) locs)
+                                iint num-dbl-regs #f int-reg-offset float-reg-offset (fx+ stack-arg-offset 8)))])]
                         [(fp-single-float)
+                         (safe-assert (not varargs?))
                          (if bsgl-offset
                              (loop (cdr types)
                                (cons (load-single-stack bsgl-offset) locs)
                                iint idbl #f int-reg-offset float-reg-offset stack-arg-offset)
-                             (if (< idbl 8)
+                             (if (< idbl num-dbl-regs)
                                  (loop (cdr types)
                                    ; with big-endian ARM might need to adjust offset +/- 4 since pair of
                                    ; single floats in a pushed double float might be reversed
@@ -3058,28 +3119,30 @@
                                    iint (fx+ idbl 1) (fx+ float-reg-offset 4) int-reg-offset (fx+ float-reg-offset 8) stack-arg-offset)
                                  (loop (cdr types)
                                    (cons (load-single-stack stack-arg-offset) locs)
-                                   iint 8 #f int-reg-offset float-reg-offset (fx+ stack-arg-offset 4))))]
+                                   iint num-dbl-regs #f int-reg-offset float-reg-offset (fx+ stack-arg-offset 4))))]
 			[(fp-ftd& ,ftd)
 			 (let* ([size ($ftd-size ftd)]
 				[members ($ftd->members ftd)]
 				[num-members (length members)])
 			   (cond
-			    [(and (fx<= num-members 4)
+			    [(and (not varargs?)
+                                  (fx<= num-members 4)
 				  (andmap double-member? members))
 			     ;; doubles are either in registers or all on stack
-			     (if (fx<= (fx+ idbl num-members) 8)
+			     (if (fx<= (fx+ idbl num-members) num-dbl-regs)
 				 (loop (cdr types)
                                    (cons (load-stack-address float-reg-offset) locs)
 				   iint (fx+ idbl num-members) #f int-reg-offset (fx+ float-reg-offset size) stack-arg-offset)
 				 (let ([stack-arg-offset (align 8 stack-arg-offset)])
 				   (loop (cdr types)
                                      (cons (load-stack-address stack-arg-offset) locs)
-                                     iint 8 #f int-reg-offset #f (fx+ stack-arg-offset size))))]
-			    [(and (fx<= num-members 4)
+                                     iint num-dbl-regs #f int-reg-offset #f (fx+ stack-arg-offset size))))]
+			    [(and (not varargs?)
+                                  (fx<= num-members 4)
 				  (andmap float-member? members))
 			     ;; floats are either in registers or all on stack
 			     (let ([amt (fxsrl (align 2 (fx- num-members (if bsgl-offset 1 0))) 1)])
-			       (if (fx<= (fx+ idbl amt) 8)
+			       (if (fx<= (fx+ idbl amt) num-dbl-regs)
 				   (let ([odd-floats? (fxodd? num-members)])
 				     (if bsgl-offset
 					 (let ([dbl-size (align 8 (fx- size 4))])
@@ -3094,15 +3157,15 @@
 					     (fx+ float-reg-offset dbl-size) stack-arg-offset))))
 				   (loop (cdr types)
                                      (cons (load-stack-address stack-arg-offset) locs)
-                                     iint 8 #f int-reg-offset float-reg-offset (fx+ stack-arg-offset size))))]
+                                     iint num-dbl-regs #f int-reg-offset float-reg-offset (fx+ stack-arg-offset size))))]
 			    [(fx= 8 ($ftd-alignment ftd))
                              (let ([int-reg-offset (if (fxeven? iint) int-reg-offset (fx+ int-reg-offset 4))]
 				   [iint (align 2 iint)]
 				   [amt (fxsrl size 2)])
-                               (if (fx< iint 4) ; argument starts in registers, may continue on stack
+                               (if (fx< iint num-int-regs) ; argument starts in registers, may continue on stack
                                    (loop (cdr types)
                                      (cons (load-stack-address int-reg-offset) locs)
-                                     (fxmin 4 (fx+ iint amt)) idbl bsgl-offset (fx+ int-reg-offset size) float-reg-offset
+                                     (fxmin num-int-regs (fx+ iint amt)) idbl bsgl-offset (fx+ int-reg-offset size) float-reg-offset
 				     (fx+ stack-arg-offset (fxmax 0 (fx* 4 (fx- (fx+ iint amt) 4)))))
                                    (let ([stack-arg-offset (align 8 stack-arg-offset)])
                                      (loop (cdr types)
@@ -3111,10 +3174,10 @@
 			    [else
 			     (let* ([size (align 4 size)]
 				    [amt (fxsrl size 2)])
-			       (if (fx< iint 4) ; argument starts in registers, may continue on stack
+			       (if (fx< iint num-int-regs) ; argument starts in registers, may continue on stack
 				   (loop (cdr types)
                                      (cons (load-stack-address int-reg-offset) locs)
-				     (fxmin 4 (fx+ iint amt)) idbl bsgl-offset (fx+ int-reg-offset size) float-reg-offset
+				     (fxmin num-int-regs (fx+ iint amt)) idbl bsgl-offset (fx+ int-reg-offset size) float-reg-offset
 				     (fx+ stack-arg-offset (fxmax 0 (fx* 4 (fx- (fx+ iint amt) 4)))))
 				   (loop (cdr types)
 				     (cons (load-stack-address stack-arg-offset) locs)
@@ -3126,7 +3189,7 @@
                                [else #f])
                              (let ([int-reg-offset (if (fxeven? iint) int-reg-offset (fx+ int-reg-offset 4))]
 				   [iint (align 2 iint)])
-                               (if (fx= iint 4)
+                               (if (fx= iint num-int-regs)
                                    (let ([stack-arg-offset (align 8 stack-arg-offset)])
                                      (loop (cdr types)
                                        (cons (load-int64-stack stack-arg-offset) locs)
@@ -3134,7 +3197,7 @@
                                    (loop (cdr types)
                                      (cons (load-int64-stack int-reg-offset) locs)
                                      (fx+ iint 2) idbl bsgl-offset (fx+ int-reg-offset 8) float-reg-offset stack-arg-offset)))
-                             (if (fx= iint 4)
+                             (if (fx= iint num-int-regs)
                                  (loop (cdr types)
                                    (cons (load-int-stack (car types) stack-arg-offset) locs)
                                    iint idbl bsgl-offset int-reg-offset float-reg-offset (fx+ stack-arg-offset 4))
@@ -3142,13 +3205,14 @@
                                    (cons (load-int-stack (car types) int-reg-offset) locs)
                                    (fx+ iint 1) idbl bsgl-offset (fx+ int-reg-offset 4) float-reg-offset stack-arg-offset)))]))))))
           (define do-result
-            (lambda (result-type synthesize-first? return-stack-offset)
+            (lambda (result-type synthesize-first? varargs? return-stack-offset)
 	      (nanopass-case (Ltype Type) result-type
                 [(fp-ftd& ,ftd)
 		 (let* ([members ($ftd->members ftd)]
 			[num-members (length members)])
 		   (cond
-		    [(and (fx<= 1 num-members 4)
+		    [(and (not varargs?)
+                          (fx<= 1 num-members 4)
 			  (or (andmap double-member? members)
 			      (andmap float-member? members)))
 		     ;; double/float results returned in floating-point registers
@@ -3163,8 +3227,9 @@
 				    (if double? (cddr sgl*) (cdr sgl*))
 				    (fx+ offset (if double? 8 4))
 				    (let ([new-e
-					   `(inline ,(make-info-loadfl (car sgl*)) ,(if double? %load-double %load-single)
-						    ,%sp ,%zero (immediate ,offset))])
+                                           (if double?
+                                               `(set! ,(car sgl*) ,(%mref ,%sp ,%zero ,offset fp))
+                                               `(set! ,(car sgl*) ,(%inline load-single ,(%mref ,%sp ,%zero ,offset fp))))])
 				      (if e `(seq ,e ,new-e) new-e)))]))))
 		      '()
 		      ($ftd-size ftd))]
@@ -3179,20 +3244,42 @@
 				8)]
 		       [else
 			(values (lambda ()
-				  `(set! ,%Cretval ,(%mref ,%sp ,return-stack-offset)))
-				(list %Cretval %r1)
+                                  (case ($ftd-size ftd)
+                                    [(1)
+                                     (let ([rep (if ($ftd-unsigned? ftd) 'unsigned-8 'integer-8)])
+                                       `(set! ,%Cretval (inline ,(make-info-load rep #f) ,%load ,%sp ,%zero (immediate ,return-stack-offset))))]
+                                    [(2)
+                                     (let ([rep (if ($ftd-unsigned? ftd) 'unsigned-16 'integer-16)])
+                                       `(set! ,%Cretval (inline ,(make-info-load rep #f) ,%load ,%sp ,%zero (immediate ,return-stack-offset))))]
+                                    [else `(set! ,%Cretval ,(%mref ,%sp ,return-stack-offset))]))
+				(list %Cretval)
 				4)])]))]
 		[(fp-double-float)
-		 (values (lambda (rhs)
-			   `(inline ,(make-info-loadfl %Cfpretval) ,%load-double
-				    ,rhs ,%zero ,(%constant flonum-data-disp)))
-			 '()
+		 (values (if varargs?
+                             (lambda (rhs)
+                               (let-values ([(endreg otherreg) (constant-case native-endianness
+                                                                 [(little) (values %Cretval %r1)]
+                                                                 [(big) (values %r1 %Cretval)])])
+                                 `(seq
+                                   (set! ,endreg ,(%mref ,rhs ,(constant flonum-data-disp)))
+                                   (set! ,otherreg ,(%mref ,rhs ,(fx+ 4 (constant flonum-data-disp)))))))
+                             (lambda (rhs)
+                               `(set! ,%Cfpretval ,(%mref ,rhs ,%zero ,(constant flonum-data-disp) fp))))
+			 (if varargs?
+                             (list %Cretval %r1)
+                             (list %Cfpretval))
 			 0)]
 		[(fp-single-float)
-		 (values (lambda (rhs)
-			   `(inline ,(make-info-loadfl %Cfpretval) ,%load-double->single
-				    ,rhs ,%zero ,(%constant flonum-data-disp)))
-			 '()
+		 (values (if varargs?
+                             (lambda (rhs)
+                               `(seq
+                                 (set! ,%Cfpretval ,(%inline double->single ,(%mref ,rhs ,%zero ,(constant flonum-data-disp) fp)))
+                                 (set! ,%Cretval ,(%inline fpcastto/lo ,%Cfpretval))))
+                             (lambda (rhs)
+                               `(set! ,%Cfpretval ,(%inline double->single ,(%mref ,rhs ,%zero ,(constant flonum-data-disp) fp)))))
+                         (if varargs?
+                             (list %Cretval)
+                             (list %Cfpretval))
 			 0)]
                 [(fp-void)
                  (values (lambda () `(nop))
@@ -3213,21 +3300,30 @@
 		  [else
 		   (values (lambda (x)
 			     `(set! ,%Cretval ,x))
-			   (list %Cretval %r1)
+			   (list %Cretval)
 			   0)])])))
           (lambda (info)
             (define callee-save-regs+lr (list %r4 %r5 %r6 %r7 %r8 %r9 %r10 %r11 %lr))
+            (define callee-save-fpregs  (list %fp1 %fp2)) ; must be consecutive
             (define isaved (length callee-save-regs+lr))
+            (define fpsaved (length callee-save-fpregs))
+            (safe-assert (andmap (lambda (r)
+                                   (or (not (reg-callee-save? r))
+                                       (if (eq? (reg-type r) 'fp)
+                                           (memq r callee-save-fpregs)
+                                           (memq r callee-save-regs+lr))))
+                                 (vector->list regvec)))
             (let* ([arg-type* (info-foreign-arg-type* info)]
+		   [varargs? (memq 'varargs (info-foreign-conv* info))]
 		   [result-type (info-foreign-result-type info)]
-		   [synthesize-first? (indirect-result-that-fits-in-registers? result-type)])
-              (let-values ([(iint idbl) (count-reg-args arg-type* synthesize-first?)])
-                (let ([saved-reg-bytes (fx* isaved 4)]
+                   [synthesize-first? (indirect-result-that-fits-in-registers? result-type)])
+              (let-values ([(iint idbl) (count-reg-args arg-type* synthesize-first? varargs?)])
+                (let ([saved-reg-bytes (fx+ (fx* isaved 4) (fx* fpsaved 8))]
                       [pre-pad-bytes (if (fxeven? isaved) 0 4)]
                       [int-reg-bytes (fx* iint 4)]
                       [post-pad-bytes (if (fxeven? iint) 0 4)]
                       [float-reg-bytes (fx* idbl 8)])
-                  (let-values ([(get-result result-regs return-bytes) (do-result result-type synthesize-first?
+                  (let-values ([(get-result result-regs return-bytes) (do-result result-type synthesize-first? varargs?
 										 (fx+ saved-reg-bytes pre-pad-bytes))])
                     (let ([return-bytes (align 8 return-bytes)])
                       (values
@@ -3244,6 +3340,7 @@
                             ,(if (fx= pre-pad-bytes 0) `(nop) `(set! ,%sp ,(%inline - ,%sp (immediate 4))))
                             ; save the callee save registers & return address
                             (inline ,(make-info-kill*-live* '() callee-save-regs+lr) ,%push-multiple)
+                            (inline ,(make-info-vpush (car callee-save-fpregs) fpsaved) ,%vpush-multiple)
                             ; set up tc for benefit of argument-conversion code, which might allocate
                             ,(if-feature pthreads
                                (%seq 
@@ -3253,12 +3350,13 @@
                         ; list of procedures that marshal arguments from their C stack locations
                         ; to the Scheme argument locations
                         (do-stack arg-type* saved-reg-bytes pre-pad-bytes return-bytes float-reg-bytes post-pad-bytes int-reg-bytes
-                                  synthesize-first?)
+                                  synthesize-first? varargs?)
                         get-result
                         (lambda ()
                           (in-context Tail
                             (%seq
                               ; restore the callee save registers
+                              (inline ,(make-info-vpush (car callee-save-fpregs) fpsaved) ,%vpop-multiple)
                               (inline ,(make-info-kill* callee-save-regs+lr) ,%pop-multiple)
                               ; deallocate space for pad & arg reg values
                               (set! ,%sp ,(%inline + ,%sp (immediate ,(fx+ pre-pad-bytes int-reg-bytes post-pad-bytes float-reg-bytes))))
diff --git a/s/cpnanopass.ss b/s/cpnanopass.ss
index ace787d30..5788caca0 100644
--- a/s/cpnanopass.ss
+++ b/s/cpnanopass.ss
@@ -599,18 +599,13 @@
         (syntax-case x (reserved allocable machine-dependent)
           [(k (reserved [rreg rreg-alias ... rreg-callee-save? rreg-mdinfo rreg-type] ...)
               (allocable [areg areg-alias ... areg-callee-save? areg-mdinfo areg-type] ...)
-              (machine-depdendent [mdreg mdreg-alias ... mdreg-callee-save? mdreg-mdinfo mdreg-type] ...)
-              (reify-support reify-reg ...))
-           (with-implicit (k regvec arg-registers extra-registers extra-fpregisters real-register?
-                             cons-reify-registers with-initialized-registers)
+              (machine-depdendent [mdreg mdreg-alias ... mdreg-callee-save? mdreg-mdinfo mdreg-type] ...))
+           (with-implicit (k regvec arg-registers extra-registers extra-fpregisters real-register? with-initialized-registers)
              #`(begin
                  (define-reserved-registers [rreg rreg-alias ... rreg-callee-save? rreg-mdinfo rreg-type] ...)
                  (define-allocable-registers regvec arg-registers extra-registers extra-fpregisters with-initialized-registers
                    [areg areg-alias ... areg-callee-save? areg-mdinfo areg-type] ...)
                  (define-machine-dependent-registers [mdreg mdreg-alias ... mdreg-callee-save? mdreg-mdinfo mdreg-type] ...)
-                 (define-syntax cons-reify-registers
-                   (syntax-rules ()
-                     [(_ reg*) (cons* reify-reg ... reg*)]))
                  (define-syntax real-register?
                    (with-syntax ([real-reg* #''(rreg ... rreg-alias ... ... areg ... areg-alias ... ... mdreg ... mdreg-alias ... ...)])
                      (syntax-rules ()
@@ -709,8 +704,6 @@
            (fold-right
              (lambda (reg reg*)
                (cond
-                 [(eq? (syntax->datum reg) 'reify-support)
-                  #`(cons-reify-registers #,reg*)]
                  [(real-register? (syntax->datum reg))
                   #`(cons #,reg #,reg*)]
                  [else reg*]))
@@ -976,8 +969,8 @@
       (declare-intrinsic dofretu32* dofretu32* (%ac0 %ts %td %cp %ac1) (%ac0) (%xp))
       (declare-intrinsic get-room get-room () (%xp) (%xp))
       (declare-intrinsic scan-remembered-set scan-remembered-set () () ())
-      (declare-intrinsic reify-1cc reify-1cc (%xp %ac0 reify-support) () (%td))
-      (declare-intrinsic maybe-reify-cc maybe-reify-cc (%xp %ac0 reify-support) () (%td))
+      (declare-intrinsic reify-1cc reify-1cc (%xp %ac0 %ts %reify1 %reify2) () (%td)) ; %reify1 & %reify2 are defined as needed per machine...
+      (declare-intrinsic maybe-reify-cc maybe-reify-cc (%xp %ac0 %ts %reify1 %reify2) () (%td)) ; ... to have enough registers to allocate
       (declare-intrinsic dooverflow dooverflow () () ())
       (declare-intrinsic dooverflood dooverflood () (%xp) ())
       ; a dorest routine takes all of the register and frame arguments from the rest
@@ -1036,11 +1029,6 @@
       (sealed #t)
       (fields type swapped?))
 
-    (define-record-type info-loadfl (nongenerative)
-      (parent info)
-      (sealed #t)
-      (fields flreg))
-
     (define-record-type info-condition-code (nongenerative)
       (parent info)
       (sealed #t)
@@ -4005,10 +3993,8 @@
                         [else
                          (bind #t ([t (%constant-alloc type-flonum (constant size-flonum))])
                            (%seq
-                             (inline ,(make-info-loadfl %fptmp1) ,%load-double
-                               ,base ,index (immediate ,offset))
-                             (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                               ,t ,%zero ,(%constant flonum-data-disp))
+                             (set! ,(%mref ,t ,%zero ,(constant flonum-data-disp) fp)
+                                   (unboxed-fp ,(%mref ,base ,index ,offset fp)))
                              ,t))])))]
                [(single-float)
                 (if swapped?
@@ -4018,18 +4004,22 @@
                           (set! ,(%mref ,t ,(constant flonum-data-disp))
                             (inline ,(make-info-load 'unsigned-32 #t) ,%load ,base ,index
                               (immediate ,offset)))
-                          (inline ,(make-info-loadfl %fptmp1) ,%load-single->double
-                            ,t ,%zero ,(%constant flonum-data-disp))
-                          (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                            ,t ,%zero ,(%constant flonum-data-disp))
+                          (set! ,(%mref ,t ,%zero ,(constant flonum-data-disp) fp)
+                                (unboxed-fp (inline ,(make-info-unboxed-args '(#t))
+                                                    ,%load-single->double
+                                                    ;; slight abuse to call this "unboxed", but `load-single->double`
+                                                    ;; wants an FP-flavored address
+                                                    (unboxed-fp ,(%mref ,t ,%zero ,(constant flonum-data-disp) fp)))))
                           ,t)))
                     (bind #f (base index)
                       (bind #t ([t (%constant-alloc type-flonum (constant size-flonum))])
                         (%seq
-                          (inline ,(make-info-loadfl %fptmp1) ,%load-single->double
-                            ,base ,index (immediate ,offset))
-                          (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                            ,t ,%zero ,(%constant flonum-data-disp))
+                          (set! ,(%mref ,t ,%zero ,(constant flonum-data-disp) fp)
+                                (unboxed-fp (inline ,(make-info-unboxed-args '(#t))
+                                                    ,%load-single->double
+                                                    ;; slight abuse to call this "unboxed", but `load-single->double`
+                                                    ;; wants an FP-flavored address
+                                                    (unboxed-fp ,(%mref ,base ,index ,offset fp)))))
                           ,t))))]
                [(integer-8 integer-16 integer-24 integer-32 integer-40 integer-48 integer-56 integer-64)
                 (build-int-load swapped? type base index offset
@@ -4108,21 +4098,14 @@
                [(double-float)
                 (bind #f (base index)
                   (bind #f fp (value)
-                   `(set! ,(%mref ,base ,index ,offset fp) ,value)))
-                #;
-                (bind #f (base index)
-                  (%seq
-                    (inline ,(make-info-loadfl %fptmp1) ,%load-double
-                      ,value ,%zero ,(%constant flonum-data-disp))
-                    (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                      ,base ,index (immediate ,offset))))]
+                   `(set! ,(%mref ,base ,index ,offset fp) ,value)))]
                [(single-float)
                 (bind #f (base index)
-                  (%seq
-                    (inline ,(make-info-loadfl %fptmp1) ,%load-double->single
-                      ,value ,%zero ,(%constant flonum-data-disp))
-                    (inline ,(make-info-loadfl %fptmp1) ,%store-single
-                      ,base ,index (immediate ,offset))))]
+                  `(inline ,(make-info-unboxed-args '(#t #t)) ,%store-double->single
+                           ;; slight abuse to call this "unboxed", but `store-double->single`
+                           ;; wants an FP-flavored address
+                           (unboxed-fp ,(%mref ,base ,index ,offset fp))
+                           (unboxed-fp ,(%mref ,value ,%zero ,(constant flonum-data-disp) fp))))]
                ; 40-bit+ only on 64-bit machines
                [(integer-8 integer-16 integer-24 integer-32 integer-40 integer-48 integer-56 integer-64
                  unsigned-8 unsigned-16 unsigned-24 unsigned-32 unsigned-40 unsigned-48 unsigned-56 unsigned-64)
@@ -7428,7 +7411,7 @@
                            (%inline sll ,body (immediate ,(fx- 0 cnt)))
                            body)))
                   (immediate ,mask)))))
-<
+
           (define-inline 3 fllp
             [(e) (build-flonum-extractor 19 12 e)])
 
@@ -7543,19 +7526,14 @@
               (lambda (e1 e2)
                 (bind #f (e1 e2)
                   (bind #t ([t (%constant-alloc type-typed-object (constant size-inexactnum))])
-                    `(seq
+                    (%seq
                        (set! ,(%mref ,t ,(constant inexactnum-type-disp))
                          ,(%constant type-inexactnum))
-                       ,(%seq
-                          (inline ,(make-info-loadfl %fptmp1) ,%load-double
-                            ,e1 ,%zero ,(%constant flonum-data-disp))
-                          (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                            ,t ,%zero ,(%constant inexactnum-real-disp))
-                          (inline ,(make-info-loadfl %fptmp1) ,%load-double
-                            ,e2 ,%zero ,(%constant flonum-data-disp))
-                          (inline ,(make-info-loadfl %fptmp1) ,%store-double
-                            ,t ,%zero ,(%constant inexactnum-imag-disp))
-                          ,t))))))
+                       (set! ,(%mref ,t ,%zero ,(constant inexactnum-real-disp) fp)
+                             (unboxed-fp ,(%mref ,e1 ,%zero ,(constant flonum-data-disp) fp)))
+                       (set! ,(%mref ,t ,%zero ,(constant inexactnum-imag-disp) fp)
+                             (unboxed-fp ,(%mref ,e2 ,%zero ,(constant flonum-data-disp) fp)))
+                       ,t)))))
 
             (define-inline 3 fl-make-rectangular
               [(e1 e2) (build-fl-make-rectangular e1 e2)])
@@ -7856,7 +7834,7 @@
         (define-inline 3 flonum->fixnum
           [(e-x) (bind #f (e-x)
                    (build-fix
-                     (%inline trunc ,e-x)))])
+                     `(inline ,(make-info-unboxed-args '(#t)) ,%fptrunc ,e-x)))])
         (let ()
           (define build-fixnum->flonum
            ; NB: x must already be bound in order to ensure it is done before the flonum is allocated
@@ -12178,7 +12156,10 @@
                                    (if (null? frame-x*)
                                        (begin (set! max-fv (fxmax max-fv i)) '())
                                        (let ([i (fx+ i 1)])
-                                         (cons (get-ptr-fv i) (f (cdr frame-x*) i)))))])
+                                         (cons (get-ptr-fv i) (f (cdr frame-x*) i)))))]
+                            [cp-save (meta-cond
+                                      [(real-register? '%cp) (make-tmp 'cp)]
+                                      [else #f])])
                         ; add 2 for the old RA and cchain
                         (set! max-fv (fx+ max-fv 2))
                         (let-values ([(c-init c-args c-result c-return) (asm-foreign-callable info)])
@@ -12188,17 +12169,20 @@
                           ; c-return restores callee-save registers and returns to C
                           (%seq
                             ,(c-init)
-                            ; although we don't actually need %cp in a register, we need
-                            ; to make sure that `(%tc-ref cp)` doesn't change before S_call_help
-                            ; is called, and claiming that %cp is live is the easiest way
                             ,(restore-scheme-state
-                               (in %cp)
+                               (in %cp) ; to save and then restore just before S_call_help
                                (out %ac0 %ac1 %xp %yp %ts %td scheme-args extra-regs))
                             ; need overflow check since we're effectively retroactively turning
                             ; what was a foreign call into a Scheme non-tail call
                             (fcallable-overflow-check)
                             ; leave room for the RA & c-chain
                             (set! ,%sfp ,(%inline + ,%sfp (immediate ,(fx* (constant ptr-bytes) 2))))
+                            ; stash %cp and restore later to make sure it's intact by the time
+                            ; that we get to S_call_help
+                            ,(meta-cond
+                              [(real-register? '%cp) `(set! ,cp-save ,%cp)]
+                              [else `(nop)])
+                            ; convert arguments
                             ,(fold-left (lambda (e x arg-type c-arg) `(seq ,(C->Scheme arg-type c-arg x) ,e))
                                (set-locs fv* frame-x*
                                  (set-locs (map (lambda (reg) (in-context Lvalue (%mref ,%tc ,(reg-tc-disp reg)))) reg*) reg-x*
@@ -12208,6 +12192,9 @@
                             ; needs to be a quote, not an immediate
                             (set! ,(ref-reg %ac1) (literal ,(make-info-literal #f 'object 0 0)))
                             (set! ,(ref-reg %ts) (label-ref ,self-label 0)) ; for locking
+                            ,(meta-cond
+                              [(real-register? '%cp) `(set! ,%cp ,cp-save)]
+                              [else `(nop)])
                             ,(save-scheme-state
                                (in %ac0 %ac1 %ts %cp)
                                (out %xp %yp %td scheme-args extra-regs))
@@ -12546,7 +12533,7 @@
                                              (set! ,fv0 ,%xp)
                                              ,(%mv-jump ,%xp (,%ac0 ,arg-registers ... ,fv0))))]))))))))))))
         (define reify-cc-help
-          (lambda (1-shot? always? finish)
+          (lambda (1-shot? always? save-ra? finish)
             (with-output-language (L13 Tail)
                 (%seq
                   (set! ,%td ,(%tc-ref stack-link))
@@ -12555,7 +12542,7 @@
                              (%seq
                                ,(let ([alloc
                                        (%seq
-                                        (set! ,%xp ,(%constant-alloc type-closure (constant size-continuation)))
+                                        (set! ,%xp ,(%constant-alloc type-closure (constant size-continuation) #f save-ra?))
                                         (set! ,(%mref ,%xp ,(constant continuation-code-disp))
                                               (literal ,(make-info-literal #f 'library (lookup-libspec nuate) (constant code-data-disp)))))])
                                   (if 1-shot?
@@ -12697,28 +12684,25 @@
            [(dorest4) (make-do-rest 4 frame-args-offset)]
            [(dorest5) (make-do-rest 5 frame-args-offset)]
            [(reify-1cc maybe-reify-cc)
-            (let ([other-reg* (fold-left (lambda (live* kill) (remq kill live*))
-                                         (vector->list regvec)
-                                         ;; Registers used by `reify-cc-help` output,
-                                         ;; including some as needed per machine
-                                         (reg-list %xp %td %ac0 reify-support))]
-                  [1cc? (eq? sym 'reify-1cc)])
+            (let ([1cc? (eq? sym 'reify-1cc)])
               `(lambda ,(make-named-info-lambda (if 1cc? "reify-1cc" "maybe-reify-cc") '(0)) 0 ()
                 ,(asm-enter
                    (%seq
-                     (check-live ,other-reg* ...)
-                     ,(reify-cc-help 1cc? 1cc?
+                     ;; make sure the reify-1cc intrinsic declares kill for registers used by `reify-cc-help`,
+                     ;; plus (say) %ts to have one to allocate, plus more as needed to allocate per machine
+                     (check-live ,(intrinsic-entry-live* reify-1cc) ...)
+                     ,(reify-cc-help 1cc? 1cc? #t
                        (lambda (reg)
                          (if (eq? reg %td)
-                             `(asm-return ,%td ,other-reg* ...)
+                             `(asm-return ,%td ,(intrinsic-return-live* reify-1cc) ...)
                              `(seq
                                 (set! ,%td ,reg)
-                                (asm-return ,%td ,other-reg* ...)))))))))]
+                                (asm-return ,%td ,(intrinsic-return-live* reify-1cc) ...)))))))))]
            [(callcc)
             `(lambda ,(make-named-info-lambda 'callcc '(1)) 0 ()
                ,(%seq
                   (set! ,(ref-reg %cp) ,(make-arg-opnd 1))
-                  ,(reify-cc-help #f #f
+                  ,(reify-cc-help #f #f #f
                     (lambda (reg)
                       (%seq
                         (set! ,(make-arg-opnd 1) ,reg)
@@ -15716,7 +15700,7 @@
           [(asm-return) (values (asm-return) chunk* offset)]
           [(asm-c-return ,info) (values (asm-c-return info) chunk* offset)]
           [(jump (label-ref ,l ,offset0))
-           (values (asm-direct-jump l (adjust-return-point-offset offset0 l)) chunk* offset)]
+           (values (asm-direct-jump l offset0) chunk* offset)]
           [(jump (literal ,info))
            (values (asm-literal-jump info) chunk* offset)]
           [(jump ,t)
@@ -15799,8 +15783,7 @@
           [(rp-compact-header ,error-on-values ,fs ,lpm) (values (asm-rp-compact-header code* error-on-values fs lpm current-func #f) chunk* offset)]
           [(set! ,x (label-ref ,l ,offset1))
            (guard (eq? (local-label-func l) current-func))
-           (let ([chunk (make-chunk code*)]
-                 [offset1 (adjust-return-point-offset offset1 l)])
+           (let ([chunk (make-chunk code*)])
              (let ([offset (fx+ (chunk-size chunk) offset)] [chunk* (cons chunk chunk*)])
                (let ([chunk (asm-return-address x l offset1 offset)])
                  (values '() (cons chunk chunk*) (fx+ (chunk-size chunk) offset)))))]
diff --git a/s/cprep.ss b/s/cprep.ss
index 2f610e189..e8e969f12 100644
--- a/s/cprep.ss
+++ b/s/cprep.ss
@@ -93,6 +93,7 @@
                       [(i3nt-stdcall) '__stdcall]
                       [(i3nt-com) '__com]
                       [(adjust-active) '__collect_safe]
+                      [(varargs) '__varargs]
                       [else #f]))
                   x*)))
          (define-who uncprep-fp-specifier
diff --git a/s/ftype.ss b/s/ftype.ss
index 3f4e3d0aa..ce9ab7108 100644
--- a/s/ftype.ss
+++ b/s/ftype.ss
@@ -122,7 +122,7 @@ notes:
     big-endian machines, the first field occupies the high-order bits,
     with each subsequent field just below the preceding field.
 
-  - ftype pointers are records encapsulating an ftype descriptor
+  - ftyp<e pointers are records encapsulating an ftype descriptor
     (ftd) along with the address of the foreign object, except that
     pointers of type void* are just addresses.  the encapsulated
     ftd is used to verify the applicability of an ftype-&ref,
@@ -957,6 +957,12 @@ ftype operators:
       (or (ftd-struct? x)
           (ftd-union? x)
           (ftd-array? x))))
+  (set! $ftd-unsigned?
+    (lambda (x)
+      (and (ftd-base? x)
+           (case (ftd-base-type x)
+             [(unsigned-8 unsigned-16 unsigned-32 unsigned-64) #t]
+             [else #f]))))
   (set! $ftd->members
     (lambda (x)
       ;; Currently used for x86_64 and arm32 ABI: Returns a list of
diff --git a/s/mathprims.ss b/s/mathprims.ss
index 741eea4ed..8401e2f3a 100644
--- a/s/mathprims.ss
+++ b/s/mathprims.ss
@@ -317,11 +317,25 @@
          ($flonum-sign x)))
 
    (set-who! flonum->fixnum
-      (let ([flmnf (fixnum->flonum (most-negative-fixnum))]
-            [flmpf (fixnum->flonum (most-positive-fixnum))])
+      (let ([flmnf (meta-cond
+                    ;; 64-bit fixnums: -1.0 is the same flonum
+                    [(fl= (fixnum->flonum (most-negative-fixnum))
+                          (fl- (fixnum->flonum (most-negative-fixnum)) 1.0))
+                     ;; Find the next lower flonum:
+                     (let loop ([amt 2.0])
+                       (let ([v (fl- (fixnum->flonum (most-negative-fixnum)) amt)])
+                         (if (fl= v (fixnum->flonum (most-negative-fixnum)))
+                             (loop (fl* 2.0 amt))
+                             v)))]
+                    [else
+                     (fl- (fixnum->flonum (most-negative-fixnum)) 1.0)])]
+            ;; Although adding 1.0 doesn't change the flonum for
+            ;; 64-bit fixnums, the flonum doesn't fit in a fixnum, so
+            ;; this is the upper bbound we want either way:
+            [flmpf (fl+ (fixnum->flonum (most-positive-fixnum)) 1.0)])
          (lambda (x)
             (unless (flonum? x) (flargerr who x))
-            (unless (fl<= flmnf x flmpf)
+            (unless (fl< flmnf x flmpf)
                ($oops who "result for ~s would be outside of fixnum range" x))
             (#3%flonum->fixnum x))))
 )
diff --git a/s/mkgc.ss b/s/mkgc.ss
index da253d998..a30a2da9e 100644
--- a/s/mkgc.ss
+++ b/s/mkgc.ss
@@ -1072,7 +1072,7 @@
           [sweep
            (S_set_code_obj "gc" (RELOC_TYPE entry) _ a obj item_off)]
           [vfasl-sweep
-           (S_set_code_obj "vfasl" (abs-for-vfasl (RELOC_TYPE entry)) _ a obj item_off)]
+           (S_set_code_obj "vfasl" (abs_reloc_variant (RELOC_TYPE entry)) _ a obj item_off)]
           [else]))))
 
     (case-mode
@@ -1142,11 +1142,6 @@
    [(vfasl-copy vfasl-sweep) e]
    [else]))
 
-(define-trace-macro (abs-for-vfasl e)
-  (case-mode
-   [vfasl-sweep reloc_abs]
-   [else e]))
-
 (define-trace-macro (when-mark e)
   (case-mode
    [(mark) e]
diff --git a/s/mkheader.ss b/s/mkheader.ss
index e2eab56a5..c8d8c1bb2 100644
--- a/s/mkheader.ss
+++ b/s/mkheader.ss
@@ -656,8 +656,8 @@
             (pr "                        \"cmp r12, #0\\n\\t\"\\~%")
             (pr "                        \"bne 1f\\n\\t\"\\~%")
             (pr "                        \"mov r12, #1\\n\\t\"\\~%")
-            (pr "                        \"strex r11, r12, [%0]\\n\\t\"\\~%")
-            (pr "                        \"cmp r11, #0\\n\\t\"\\~%")
+            (pr "                        \"strex r7, r12, [%0]\\n\\t\"\\~%")
+            (pr "                        \"cmp r7, #0\\n\\t\"\\~%")
             (pr "                        \"beq 2f\\n\\t\"\\~%")
             (pr "                        \"1:\\n\\t\"\\~%")
             (pr "                        \"ldr r12, [%0, #0]\\n\\t\"\\~%")
@@ -667,7 +667,7 @@
             (pr "                        \"2:\\n\\t\"\\~%")
             (pr "                        :                \\~%")
             (pr "                        : \"r\" (addr)\\~%")
-            (pr "                        : \"cc\", \"memory\", \"r12\", \"r11\")~%")
+            (pr "                        : \"cc\", \"memory\", \"r12\", \"r7\")~%")
 
             (nl)
             (pr "#define UNLOCK(addr)     \\~%")
@@ -683,14 +683,14 @@
             (pr "                        \"0:\\n\\t\"\\~%")
             (pr "                        \"ldrex r12, [%1, #0]\\n\\t\"\\~%")
             (pr "                        \"add r12, r12, #1\\n\\t\"\\~%")
-            (pr "                        \"strex r11, r12, [%1]\\n\\t\"\\~%")
-            (pr "                        \"cmp r11, #0\\n\\t\"\\~%")
+            (pr "                        \"strex r7, r12, [%1]\\n\\t\"\\~%")
+            (pr "                        \"cmp r7, #0\\n\\t\"\\~%")
             (pr "                        \"bne 0b\\n\\t\"\\~%")
             (pr "                        \"cmp r12, #0\\n\\t\"\\~%")
             (pr "                        \"moveq %0, #1\\n\\t\"\\~%")
             (pr "                        : \"=&r\" (ret)\\~%")
             (pr "                        : \"r\" (addr)\\~%")
-            (pr "                        : \"cc\", \"memory\", \"r12\", \"r11\")~%")
+            (pr "                        : \"cc\", \"memory\", \"r12\", \"r7\")~%")
 
             (nl)
             (pr "#define LOCKED_DECR(addr, ret) \\~%")
@@ -698,14 +698,14 @@
             (pr "                        \"0:\\n\\t\"\\~%")
             (pr "                        \"ldrex r12, [%1, #0]\\n\\t\"\\~%")
             (pr "                        \"sub r12, r12, #1\\n\\t\"\\~%")
-            (pr "                        \"strex r11, r12, [%1]\\n\\t\"\\~%")
-            (pr "                        \"cmp r11, #0\\n\\t\"\\~%")
+            (pr "                        \"strex r7, r12, [%1]\\n\\t\"\\~%")
+            (pr "                        \"cmp r7, #0\\n\\t\"\\~%")
             (pr "                        \"bne 0b\\n\\t\"\\~%")
             (pr "                        \"cmp r12, #0\\n\\t\"\\~%")
             (pr "                        \"moveq %0, #1\\n\\t\"\\~%")
             (pr "                        : \"=&r\" (ret)\\~%")
             (pr "                        : \"r\" (addr)\\~%")
-            (pr "                        : \"cc\", \"memory\", \"r12\", \"r11\")~%")]
+            (pr "                        : \"cc\", \"memory\", \"r12\", \"r7\")~%")]
           [(rv64)
            (pr "#define INITLOCK(addr) \\~%")
            (pr "  __asm__ __volatile__ (\"sw zero, 0(%0)\\n\\t\"\\~%")
diff --git a/s/np-languages.ss b/s/np-languages.ss
index 1e46e69e4..eb2c322cd 100644
--- a/s/np-languages.ss
+++ b/s/np-languages.ss
@@ -542,10 +542,6 @@
   (declare-primitive inc-profile-counter effect #f)
   (declare-primitive invoke-prelude effect #f)
   (declare-primitive keep-live effect #f)
-  (declare-primitive load-double effect #f)
-  (declare-primitive load-double->single effect #f)
-  (declare-primitive load-single effect #f)
-  (declare-primitive load-single->double effect #f)
   (declare-primitive locked-decr! effect #f)
   (declare-primitive locked-incr! effect #f)
   (declare-primitive pause effect #f)
@@ -558,12 +554,12 @@
   (declare-primitive save-flrv effect #f)
   (declare-primitive save-lr effect #f) ; ppc
   (declare-primitive store effect #f)
-  (declare-primitive store-double effect #f)
-  (declare-primitive store-single effect #f)
-  (declare-primitive store-single->double effect #f)
+  (declare-primitive store-single effect #f); not required by cpnanopass
+  (declare-primitive store-double->single effect #f)
   (declare-primitive store-with-update effect #f) ; ppc
   (declare-primitive unactivate-thread effect #f) ; threaded version only
   (declare-primitive vpush-multiple effect #f) ; arm
+  (declare-primitive vpop-multiple effect #f) ; arm
   (declare-primitive cas effect #f)
 
   (declare-primitive < pred #t)
@@ -615,7 +611,6 @@
   (declare-primitive sll value #t)
   (declare-primitive srl value #t)
   (declare-primitive sra value #t)
-  (declare-primitive trunc value #t)
   (declare-primitive zext8 value #t)
   (declare-primitive zext16 value #t)
   (declare-primitive zext32 value #t) ; 64-bit only
@@ -627,6 +622,12 @@
   (declare-primitive fp/ value #t)
   (declare-primitive fpt value #t)
   (declare-primitive fpsqrt value #t) ; not implemented for some ppc32 (so we don't use it)
+  (declare-primitive fptrunc value #t)
+  (declare-primitive double->single value #t) ; not required by cpnanopass
+  (declare-primitive single->double value #t) ; not required by cpnanopass
+
+  (declare-primitive load-single value #t) ; not required by cpnanopass
+  (declare-primitive load-single->double value #t)
 
   (declare-primitive fpcastto value #t) ; 64-bit only
   (declare-primitive fpcastto/hi value #t) ; 32-bit only
diff --git a/s/primdata.ss b/s/primdata.ss
index a0f61bebb..dc0fe5148 100644
--- a/s/primdata.ss
+++ b/s/primdata.ss
@@ -2098,6 +2098,7 @@
   ($ftd-atomic-category [flags single-valued])
   ($ftd-compound? [sig [(sub-ptr) -> (boolean)]] [flags discard])
   ($ftd-size [flags single-valued])
+  ($ftd-unsigned? [flags single-valued])
   ($ftd->members [flags single-valued])
   ($ftype-guardian-oops [flags])
   ($ftype-pointer? [sig [(ptr) -> (boolean)]] [flags pure unrestricted mifoldable])
diff --git a/s/rv64.ss b/s/rv64.ss
index d32889b87..33b40f7ab 100644
--- a/s/rv64.ss
+++ b/s/rv64.ss
@@ -62,60 +62,227 @@
 
 ;; Mapping of scheme specific task registers to registers of the CPU
 (define-registers
-;; Use saved registers r18-r21 for %tc, %sfp, %ap and %trap
   (reserved
     ;; Three or more cols for each definition
     ;reg   alias ...         callee-save reg-mdinfo
-    [%tc   %x9 %s1           #t          9] ;; thread context
-    [%sfp  %x8 %s0  %fp      #t          8] ;; scheme frame pointer
-    [%ap   %x10 %a0 %Carg1 %Cretval   #f         10] ;;  
-    [%trap %x11 %a1 %Carg2 %Cretval1  #f         11] ;; tracks when scheme should check for interrupts
-    [%real-zero %x0          #f          0]);; hardwired zero - can't call it %zero
+    [%tc   %x9 %s1           #t          9 uptr] ;; thread context
+    [%sfp  %x8 %s0  %fp      #t          8 uptr] ;; scheme frame pointer
+    [%ap   %x10 %a0 %Carg1 %Cretval   #f         10 uptr] ;;  
+    [%trap %x11 %a1 %Carg2 %Cretval1  #f         11 uptr] ;; tracks when scheme should check for interrupts
+    [%real-zero %x0          #f          0 uptr]);; hardwired zero - can't call it %zero
   (allocable
-    [%ac0  %x12 %a2 %Carg3   #f         12] ;; argument count
-    [%xp   %x13 %a3 %Carg4   #f         13] ;; used during alloc for the computed alloc spot
-    [%ts   %x14 %a4 %Carg5   #f         14] ;; special temps
-    [%td   %x15 %a5 %Carg6   #f         15] ;; special temps
-    [%cp   %x16 %a6 %Carg7   #f         16] ;; closure pointer
+    [%ac0  %x12 %a2 %Carg3   #f         12 uptr] ;; argument count
+    [%xp   %x13 %a3 %Carg4   #f         13 uptr] ;; used during alloc for the computed alloc spot
+    [%ts   %x14 %a4 %Carg5   #f         14 uptr] ;; special temps
+    [%td   %x15 %a5 %Carg6   #f         15 uptr] ;; special temps
+    [%cp   %x16 %a6 %Carg7   #f         16 uptr] ;; closure pointer
     ;; Extra registers - length should match asm-arg-reg-max
-    [      %x1  %ra %lr               #f  1]
-    [      %x3  %gp                   #f  3]
-    [      %x4  %tp                   #f  4]
-    [      %x5  %t0                   #f  5]
-    [      %x6  %t1                   #f  6]
-    [      %x7  %t2                   #f  7]
-    [      %x17 %a7 %Carg8            #f 17]
-    [      %x18 %s2                   #t 18]
-    [      %x19 %s3                   #t 19]
-    [      %x20 %s4                   #t 20]
-    [      %x21 %s5                   #t 21]
-    [      %x22 %s6                   #t 22]
-    [      %x23 %s7                   #t 23]
-    [      %x24 %s8                   #t 24]
-    [      %x25 %s9                   #t 25]
-    [      %x26 %s10                  #t 26]
-    [      %x27 %s11                  #t 27]
-    [      %x28 %t3                   #f 28]
-    [      %x29 %t4                   #f 29]
-    [      %x30 %t5                   #f 30]
-    [      %x31 %t6                   #f 31]
-  )
+    [      %x1  %ra %lr               #f  1 uptr]
+    [      %x3  %gp                   #f  3 uptr]
+    [      %x4  %tp                   #f  4 uptr]
+    [      %x5  %t0                   #f  5 uptr]
+    [      %x6  %t1                   #f  6 uptr]
+    [      %x7  %t2                   #f  7 uptr]
+    [      %x17 %a7 %Carg8            #f 17 uptr]
+    [      %x18 %s2                   #t 18 uptr]
+    [      %x19 %s3                   #t 19 uptr]
+    [      %x20 %s4                   #t 20 uptr]
+    [      %x21 %s5                   #t 21 uptr]
+    [      %x22 %s6                   #t 22 uptr]
+    [      %x23 %s7                   #t 23 uptr]
+    [      %x24 %s8                   #t 24 uptr]
+    [      %x25 %s9                   #t 25 uptr]
+    [      %x26 %s10                  #t 26 uptr]
+    [      %x27 %s11                  #t 27 uptr]
+    [      %x28 %t3                   #f 28 uptr]
+    [      %x29 %t4                   #f 29 uptr]
+    [      %x30 %t5                   #f 30 uptr]
+    [      %x31 %t6                   #f 31 uptr]
+    [                     %f18 %fs2    #t 51 fp]
+    [                     %f19 %fs3    #t 52 fp]
+    )
   (machine-dependent
-   [%sp   %x2        #f         2]
-   [%pc              #f        32]
-   [%flreg1          #f        33]
-   [      %f10 %fa0 %Cfpretval       #f 43]
-   [      %f11 %fa1 %Cfpretval1      #f 44]
+   [%sp                  %x2          #t  2 uptr]
+   ;; There is really not a specific number for the pc reg
+   ;; so we fake it and call it 32
+   [%pc                               #f 32 uptr]
+   ;; Floating point registers
+   [                     %f0  %ft0    #f 33 fp]
+   [                     %f1  %ft1    #f 34 fp]
+   [                     %f2  %ft2    #f 35 fp]
+   [                     %f3  %ft3    #f 36 fp]
+   [                     %f4  %ft4    #f 37 fp]
+   [                     %f5  %ft5    #f 38 fp]
+   [                     %f6  %ft6    #f 39 fp]
+   [                     %f7  %ft7    #f 40 fp]
+   [                     %f8  %fs0    #t 41 fp]
+   [                     %f9  %fs1    #t 42 fp]
+   [%Cfparg1 %Cfpretval  %f10 %fa0    #f 43 fp]
+   [%Cfparg2 %Cfpretval1 %f11 %fa1    #f 44 fp]
+   [%Cfparg3             %f12 %fa2    #f 45 fp]
+   [%Cfparg4             %f13 %fa3    #f 46 fp]
+   [%Cfparg5             %f14 %fa4    #f 47 fp]
+   [%Cfparg6             %f15 %fa5    #f 48 fp]
+   [%Cfparg7             %f16 %fa6    #f 49 fp]
+   [%Cfparg8             %f17 %fa7    #f 50 fp]
+   ;; f18 and f19 are in the allocable section
+   [                     %f20 %fs4    #t 53 fp]
+   [                     %f21 %fs5    #t 54 fp]
+   [                     %f22 %fs6    #t 55 fp]
+   [                     %f23 %fs7    #t 56 fp]
+   [                     %f24 %fs8    #t 57 fp]
+   [                     %f25 %fs9    #t 58 fp]
+   [                     %f26 %fs10   #t 59 fp]
+   [                     %f27 %fs11   #t 60 fp]
+   [                     %f28 %ft8    #f 61 fp]
+   [                     %f29 %ft9    #f 62 fp]
+   [                     %f30 %ft10   #f 63 fp]
+   [                     %f31 %ft11   #f 64 fp]
    ))
 
 ;;; SECTION 2: instructions
-(module (md-handle-jump) ; also sets primitive handlers
+(module (md-handle-jump) ; also sets primitive handlers        
   (import asm-module)
 
-)
+  (define-syntax (seq x)
+    (syntax-case x ()
+      [(_ e ... ex)
+       (with-syntax ([(t ...) (generate-temporaries #'(e ...))])
+         #'(let ([t e] ...)
+             (with-values ex
+               (case-lambda
+                 [(x*) (cons* t ... x*)]
+                 [(x* p) (values (cons* t ... x*) p)]))))]))
+  
+  (define lmem? mref?)
+  
+  (define (mem? x)
+    (or (lmem? x) (literal@? x)))
+
+  (define (set-ur=mref ur mref)
+    (mref->mref mref
+      (lambda (mref)
+        (build-set! ,ur ,mref))))
+  
+  (define (lvalue->ur x k)
+    (if (mref? x)
+        (let ([u (make-tmp 'u)])
+          (seq
+           (set-ur=mref u x)
+           (k u)))
+        (k x)))
+  
+  (define (mref->mref a k)
+    (define (return x0 x1 imm type)
+      (k
+       (with-output-language (L15d Triv)
+         `(mref ,x0 ,x1 ,imm ,type))))
+    (nanopass-case (L15c Triv) a
+      [(mref ,lvalue0 ,lvalue1 ,imm ,type)
+       (lvalue->ur lvalue0
+         (lambda (x0)
+           (lvalue->ur lvalue1
+             (lambda (x1)
+               (return x0 x1 imm type)))))]))
+       
+  (define (mem->mem a k)
+    (cond
+     [(literal@? a)
+      (let ([u (make-tmp 'u)])
+        (seq
+         (build-set! ,u ,(literal@->literal a))
+         (k
+          (with-output-language (L15d Lvalue)
+            `(mref ,u ,%zero 0 ptr)))))]
+     [else (mref->mref a k)]))
+  
+  (define (md-handle-jump t)
+    (with-output-language (L15d Tail)
+             
+      (nanopass-case (L15c Triv) t
+        [,lvalue
+         (if (mem? lvalue)
+             (mem->mem lvalue (lambda (e) (values '() `(jump ,e))))
+             (values '() `(jump ,lvalue)))]
+        [(literal ,info) (values '() `(jump (literal ,info)))]
+        [(label-ref ,l ,offset) (values '() `(jump (label-ref ,l ,offset)))]))))
 
 ;;; SECTION 3: assembler
-(module asm-module ()
+(module asm-module (
+                    asm-c-return
+                    asm-conditional-jump
+                    asm-direct-jump
+                    asm-enter
+                    asm-foreign-call
+                    asm-foreign-callable
+                    asm-fpmove
+                    asm-indirect-jump
+                    asm-jump
+                    asm-library-jump
+                    asm-literal-jump
+                    asm-move
+                    asm-return
+                    asm-return-address
+                    asm-rp-compact-header
+                    asm-rp-header
+                    asm-size
+                    )
 
-)
+  (define asm-enter values)
   
+  (define (asm-c-return info) 
+    (sorry! 'asm-c-return "unimplemented: asm-c-return"))
+
+  (define (asm-direct-jump l offset)
+    (sorry! 'asm-direct-jump "unimplemented: asm-direct-jump"))
+
+  (define-who (asm-indirect-jump src)
+    (sorry! who "unimplemented: asm-indirect-jump"))
+
+  (define (asm-library-jump l)
+    (sorry! 'asm-library-jump "unimplemented: asm-library-jump"))
+
+  (define (asm-return) 
+    (sorry! 'asm-return "unimplemented: asm-return"))
+
+  (define-who (asm-fpmove code* dest src)
+    (sorry! who "unimplemented: asm-fpmove"))
+
+  (define (asm-literal-jump info)
+    (sorry! 'asm-literal-jump "unimplemented: asm-literal-jump"))
+
+  (define-who (asm-move code* dest src)
+    (sorry! who "unimplemented: asm-move"))
+
+  (define (asm-rp-compact-header code* err? fs lpm func code-size)
+    (sorry! 'asm-rp-compact-header "unimplemented: asm-rp-compact-header"))
+
+  (define (asm-rp-header code* mrvl fs lpm func code-size)
+    (sorry! 'asm-rp-header "unimplemented: asm-rp-header"))
+
+  (define (asm-size x)
+    (case (car x)
+      [(asm rv64-abs rv64-jump rv64-call) 0]
+      [else 8]))
+
+  (define-who (asm-jump l next-addr)
+    (sorry! who "unimplemented: asm-jump"))
+
+  (define-who (asm-conditional-jump info l1 l2 next-addr)
+    (sorry! who "unimplemented: asm-conditional-jump"))
+
+  (define-who (asm-return-address dest l incr-offset next-addr)
+    (sorry! who "unimplemented: asm-return-address"))
+
+  (module (
+           asm-foreign-call
+           asm-foreign-callable
+           )
+      
+    (define-who asm-foreign-call
+      (with-output-language (L13 Effect)
+        (let () #f)))
+    
+    (define-who asm-foreign-callable
+      (with-output-language (L13 Effect)
+        (let () #f)))))
diff --git a/s/rv64le.def b/s/rv64le.def
index 699f9e4d7..a1bf1cbe7 100644
--- a/s/rv64le.def
+++ b/s/rv64le.def
@@ -29,6 +29,12 @@
 (define-constant max-integer-alignment 8)
 (define-constant asm-arg-reg-max 21)
 (define-constant asm-arg-reg-cnt 8)
+
+;; What's this for?
+;; Sounds like the maximum number of available fp regs
+;; but it's set to 2 in all other archs... doing the same here
+;; for now.
+(define-constant asm-fpreg-max 2) 
 (define-constant typedef-ptr "void *")
 (define-constant typedef-iptr "long")
 (define-constant typedef-uptr "unsigned long")
diff --git a/s/syntax.ss b/s/syntax.ss
index 4c185b439..425f30410 100644
--- a/s/syntax.ss
+++ b/s/syntax.ss
@@ -8920,39 +8920,40 @@
     (define squawk
       (lambda (x)
         (syntax-error x (format "invalid ~s convention" who))))
-    (let loop ([conv* conv*] [accum '()] [keep-accum '()])
+    (let loop ([conv* conv*] [selected #f] [accum '()] [keep-accum '()])
       (cond
         [(null? conv*) (datum->syntax #'filter-conv keep-accum)]
         [else
          (let* ([orig-c (car conv*)]
-                [c (syntax->datum orig-c)]
-                [c (cond
-                     [(not c) #f]
-                     [(eq? c '__collect_safe) 'adjust-active]
-                     [else
-                      (case ($target-machine)
-                        [(i3nt ti3nt)
-                         (case c
-                           [(__stdcall) 'i3nt-stdcall]
-                           [(__cdecl) #f]
-                           [(__com) 'i3nt-com]
-                           [else (squawk orig-c)])]
-                        [(ppcnt)
-                         (case c
-                           [(__stdcall __cdecl) #f]
-                           [else (squawk orig-c)])]
-                        [else (squawk orig-c)])])])
-           (when (member c accum)
-             (syntax-error orig-c (format "redundant ~s convention" who)))
-           (unless (or (null? accum)
-                       (eq? c 'adjust-active)
-                       (and (eq? 'adjust-active (car accum))
-                            (null? (cdr accum))))
-             (syntax-error orig-c (format "conflicting ~s convention" who)))
-           (loop (cdr conv*) (cons c accum)
-                 (if c
-                     (cons c keep-accum)
-                     keep-accum)))]))))
+                [c (syntax->datum orig-c)])
+           (let-values ([(c select?)
+                         (cond
+                           [(not c) (values #f #f)]
+                           [(eq? c '__collect_safe) (values 'adjust-active #f)]
+                           [(eq? c '__varargs) (values 'varargs #f)]
+                           [else
+                            (values
+                             (case ($target-machine)
+                               [(i3nt ti3nt)
+                                (case c
+                                  [(__stdcall) 'i3nt-stdcall]
+                                  [(__cdecl) #f]
+                                  [(__com) 'i3nt-com]
+                                  [else (squawk orig-c)])]
+                               [(ppcnt)
+                                (case c
+                                  [(__stdcall __cdecl) #f]
+                                  [else (squawk orig-c)])]
+                               [else (squawk orig-c)])
+                             #t)])])
+             (when (member c accum)
+               (syntax-error orig-c (format "redundant ~s convention" who)))
+             (when (and select? selected)
+               (syntax-error orig-c (format "conflicting ~s convention" who)))
+             (loop (cdr conv*) (if select? c selected) (cons c accum)
+                   (if c
+                       (cons c keep-accum)
+                       keep-accum))))]))))
 
 (define $make-foreign-procedure
   (lambda (who conv* foreign-name ?foreign-addr type* result-type)
@@ -8960,6 +8961,9 @@
       (define (check-strings-allowed)
         (when (memq 'adjust-active (syntax->datum conv*))
           ($oops who "string argument not allowed with __collect_safe procedure")))
+      (define (check-floats-allowed)
+        (when (memq 'varargs (syntax->datum conv*))
+          ($oops who "float argument not allowed for __varargs procedure")))
       (with-syntax ([conv* conv*]
                     [foreign-name foreign-name]
                     [?foreign-addr ?foreign-addr]
@@ -9056,6 +9060,9 @@
                                                         ($fp-string->utf32 x 'big)
                                                         (err ($moi) x)))))
                                        (u32*))]
+                                   [(single-float)
+                                    (check-floats-allowed)
+                                    #f]
                                    [else #f])
                                  (if (or ($ftd? type) ($ftd-as-box? type))
                                      (let ([ftd (if ($ftd? type) type (unbox type))])
@@ -9151,6 +9158,9 @@
       (define (check-strings-allowed)
         (when (memq 'adjust-active (syntax->datum conv*))
           ($oops who "string result not allowed with __collect_safe callable")))
+      (define (check-floats-allowed)
+        (when (memq 'varargs (syntax->datum conv*))
+          ($oops who "float argument not allowed for __varargs procedure")))
       (with-syntax ([conv* conv*] [?proc ?proc])
         (with-syntax ([((actual (t ...) (arg ...)) ...)
                        (map
@@ -9240,6 +9250,9 @@
                                     #`((mod x #x100000000000000)
                                        (x)
                                        (unsigned-64)))]
+                                 [(single-float)
+                                  (check-floats-allowed)
+                                  #f]
                                  [else #f])
                                (with-syntax ([(x) (generate-temporaries #'(*))])
                                  #`(x (x) (#,(datum->syntax #'foreign-callable type))))))
diff --git a/s/trv64le.def b/s/trv64le.def
index 3b0b4312c..cf580cbc6 100644
--- a/s/trv64le.def
+++ b/s/trv64le.def
@@ -29,6 +29,12 @@
 (define-constant max-integer-alignment 8)
 (define-constant asm-arg-reg-max 21)
 (define-constant asm-arg-reg-cnt 8)
+
+;; What's this for?
+;; Sounds like the maximum number of available fp regs
+;; but it's set to 2 in all other archs... doing the same here
+;; for now.
+(define-constant asm-fpreg-max 2) 
 (define-constant typedef-ptr "void *")
 (define-constant typedef-iptr "long")
 (define-constant typedef-uptr "unsigned long")
diff --git a/s/x86.ss b/s/x86.ss
index 25986f6dc..666be8659 100644
--- a/s/x86.ss
+++ b/s/x86.ss
@@ -38,8 +38,7 @@
     [%fptmp1   #f 0 fp]
     [%fptmp2   #f 1 fp]
     [%sp       #t 4 uptr]
-    #;[%esi      #f 6])
-  (reify-support %ts))
+    #;[%esi      #f 6]))
 
 ;;; SECTION 2: instructions
 (module (md-handle-jump) ; also sets primitive handlers
@@ -811,17 +810,16 @@
   (define-instruction effect (flds)
     [(op (z mem)) `(asm ,info ,asm-flds ,z)])
 
-  (define-instruction effect (load-single->double load-double->single)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-cvt op (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction value (load-single->double)
+    [(op (x fpur) (y fpmem))
+     `(set! ,(make-live-info) ,x (asm ,info ,(asm-fl-cvt 'single->double) ,y))])
 
-  (define-instruction effect (store-single store-double)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-store op (info-loadfl-flreg info)) ,x ,y ,z)])
-
-  (define-instruction effect (load-double load-single)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-load op (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction effect (store-double->single)
+    [(op (x fpmem) (y fpmem fpur))
+     (let ([u (make-tmp 'u 'fp)])
+       (seq
+        `(set! ,(make-live-info) ,u (asm ,null-info ,(asm-fl-cvt 'double->single) ,y))
+        `(asm ,info ,asm-store-single ,x ,u)))])
 
   (define-instruction value (fpt)
     [(op (x fpur) (y ur)) `(set! ,(make-live-info) ,x (asm ,info ,asm-fpt ,y))])
@@ -855,8 +853,8 @@
   (define-instruction effect inc-profile-counter 
     [(op (x ur mem) (y imm32 ur)) `(asm ,info ,asm-inc-profile-counter ,x ,y)])
 
-  (define-instruction value (trunc)
-    [(op (z ur) (x ur)) `(set! ,(make-live-info) ,z (asm ,info ,asm-trunc ,x))])
+  (define-instruction value (fptrunc)
+    [(op (z ur) (x fpmem fpur)) `(set! ,(make-live-info) ,z (asm ,info ,asm-fptrunc ,x))])
 
   ;; no kills since we expect to be called when all necessary state has already been saved
   (define-instruction value get-tc
@@ -1029,7 +1027,7 @@
                      asm-direct-jump asm-return-address asm-jump asm-conditional-jump asm-data-label
                      asm-rp-header asm-rp-compact-header
                      asm-lea1 asm-lea2 asm-indirect-call asm-fstpl asm-fstps asm-fldl asm-flds asm-condition-code
-                     asm-fl-cvt asm-fl-store asm-fl-load asm-fpt asm-trunc asm-div
+                     asm-fl-cvt asm-store-single asm-fpt asm-fptrunc asm-div
                      asm-exchange asm-pause asm-locked-incr asm-locked-decr asm-locked-cmpxchg
                      asm-fpop-2 asm-fpmove asm-fpmovefrom asm-fpcastfrom asm-fpcastto asm-fpsqrt asm-c-simple-call
                      asm-save-flrv asm-restore-flrv asm-return asm-c-return asm-size
@@ -1820,28 +1818,17 @@
         (emit flds src code*))))
 
   (define asm-fl-cvt
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([src (build-mem-opnd base index offset)])
-          (case op
-            [(load-single->double) (emit sse.cvtss2sd src (cons 'reg flreg) code*)]
-            [(load-double->single) (emit sse.cvtsd2ss src (cons 'reg flreg) code*)])))))
-
-  (define asm-fl-store
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([dest (build-mem-opnd base index offset)])
+    (lambda (op)
+      (lambda (code* dest-reg src)
+        (Trivit (src)
           (case op
-            [(store-single) (emit sse.movss (cons 'reg flreg) dest code*)]
-            [(store-double) (emit sse.movsd (cons 'reg flreg) dest code*)])))))
+            [(single->double) (emit sse.cvtss2sd src (cons 'reg dest-reg) code*)]
+            [(double->single) (emit sse.cvtsd2ss src (cons 'reg dest-reg) code*)])))))
 
-  (define asm-fl-load
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([src (build-mem-opnd base index offset)])
-          (case op
-            [(load-single) (emit sse.movss src (cons 'reg flreg) code*)]
-            [(load-double) (emit sse.movsd src (cons 'reg flreg) code*)])))))
+  (define asm-store-single
+    (lambda (code* dest flreg)
+      (Trivit (dest)
+        (emit sse.movss (cons 'reg flreg) dest code*))))
 
   (define asm-fpt
     (lambda (code* dest src)
@@ -1907,11 +1894,10 @@
                (emit sse.psrlq (cons 'reg %fptmp1) shift
                  (emit sse.movd (cons 'reg %fptmp1) dest code*)))])))))
 
-  (define asm-trunc
-    (lambda (code* dest flonumreg)
-      (Trivit (dest)
-        (let ([src `(disp ,(constant flonum-data-disp) ,flonumreg)])
-          (emit sse.cvttsd2si src dest code*)))))
+  (define asm-fptrunc
+    (lambda (code* dest src)
+      (Trivit (dest src)
+        (emit sse.cvttsd2si src dest code*))))
 
   (define asm-load
     (lambda (type)
@@ -2323,7 +2309,8 @@
 
   (define asm-direct-jump
     (lambda (l offset)
-      (emit bra (make-funcrel 'literal l offset) '())))
+      (let ([offset (adjust-return-point-offset offset l)])
+        (emit bra (make-funcrel 'literal l offset) '()))))
 
   (define asm-literal-jump
     (lambda (info)
@@ -2552,15 +2539,13 @@
       (letrec ([load-double-stack
                 (lambda (offset)
                   (lambda (x) ; requires var
-                    (%seq
-                      (inline ,(make-info-loadfl %fptmp1) ,%load-double ,x ,%zero ,(%constant flonum-data-disp))
-                      (inline ,(make-info-loadfl %fptmp1) ,%store-double ,%sp ,%zero (immediate ,offset)))))]
+                    `(set! ,(%mref ,%sp ,%zero ,offset fp)
+                           ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                [load-single-stack
                 (lambda (offset)
                   (lambda (x) ; requires var
-                    (%seq
-                      (inline ,(make-info-loadfl %fptmp1) ,%load-double->single ,x ,%zero ,(%constant flonum-data-disp))
-                      (inline ,(make-info-loadfl %fptmp1) ,%store-single ,%sp ,%zero (immediate ,offset)))))]
+                    (%inline store-double->single ,(%mref ,%sp ,%zero ,offset fp)
+                             ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                [load-stack
                  (lambda (offset)
                    (lambda (rhs) ; requires rhs
@@ -2852,15 +2837,13 @@
         (define load-double-stack
           (lambda (offset)
             (lambda (x) ; requires var
-              (%seq
-                (inline ,(make-info-loadfl %fptmp1) ,%load-double ,%sp ,%zero (immediate ,offset))
-                (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+              `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                     ,(%mref ,%sp ,%zero ,offset fp)))))
         (define load-single-stack
           (lambda (offset)
             (lambda (x) ; requires var
-              (%seq
-                (inline ,(make-info-loadfl %fptmp1) ,%load-single->double ,%sp ,%zero (immediate ,offset))
-                (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+              `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                     ,(%inline load-single->double ,(%mref ,%sp ,%zero ,offset fp))))))
         (define load-stack
           (lambda (type offset)
             (lambda (lvalue) ; requires lvalue
diff --git a/s/x86_64.ss b/s/x86_64.ss
index c62724fb7..08d122ed5 100644
--- a/s/x86_64.ss
+++ b/s/x86_64.ss
@@ -43,8 +43,7 @@
       [%Cfparg2            #f  1 fp]
       [%fptmp1             #f  4 fp]  ; xmm 0-5 are caller-save
       [%fptmp2             #f  5 fp]  ; xmm 6-15 are callee-save
-      [%sp                 #t  4 uptr])
-    (reify-support %ts))
+      [%sp                 #t  4 uptr]))
   (define-registers
     (reserved
       [%tc  %r14           #t 14 uptr]
@@ -77,8 +76,7 @@
       [%Cfparg8            #f  7 fp]
       [%fptmp1             #f  8 fp]
       [%fptmp2             #f  9 fp]
-      [%sp                 #t  4 uptr])
-    (reify-support %ts)))
+      [%sp                 #t  4 uptr])))
 
 ;;; SECTION 2: instructions
 (module (md-handle-jump) ; also sets primitive handlers
@@ -866,26 +864,32 @@
                  `(set! ,(make-live-info) ,u (asm ,null-info ,(asm-lea2 0) ,y ,z))
                  `(asm ,info ,(asm-store (info-load-type info)) ,x ,u (immediate 0) ,w))))))])
 
-  (define-instruction effect (load-single->double load-double->single)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-cvt op (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction value (load-single->double)
+    [(op (x fpur) (y fpmem))
+     `(set! ,(make-live-info) ,x (asm ,info ,(asm-fl-cvt 'single->double) ,y))])
 
-  (define-instruction effect (store-single->double)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-store-single->double (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction value (single->double double->single)
+    [(op (x fpur) (y fpmem fpur))
+     `(set! ,(make-live-info) ,x (asm ,info ,(asm-fl-cvt op) ,y))])
 
-  (define-instruction effect (store-single store-double)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-store op (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction effect (store-double->single)
+    [(op (x fpmem) (y fpmem fpur))
+     (let ([u (make-tmp 'u 'fp)])
+       (seq
+        `(set! ,(make-live-info) ,u (asm ,null-info ,(asm-fl-cvt 'double->single) ,y))
+        `(asm ,info ,asm-store-single ,x ,u)))])
+
+  (define-instruction effect (store-single)
+    [(op (x fpmem) (y fpur))
+     `(asm ,info ,asm-store-single ,x ,y)])
 
-  (define-instruction effect (load-double load-single)
-    [(op (x ur) (y ur) (z imm32))
-     `(asm ,info ,(asm-fl-load op (info-loadfl-flreg info)) ,x ,y ,z)])
+  (define-instruction value (load-single)
+    [(op (x fpur) (y fpmem))
+     `(set! ,(make-live-info) ,x (asm ,info ,asm-load-single ,y))])
 
   (define-instruction value (get-double)
-    [(op (z ur))
-     `(set! ,(make-live-info) ,z
-        (asm ,info ,(asm-get-double (info-loadfl-flreg info))))])
+    [(op (z ur) (y fpur))
+     `(set! ,(make-live-info) ,z (asm ,info ,asm-get-double ,y))])
 
   (define-instruction value (fpt)
     [(op (x fpur) (y ur)) `(set! ,(make-live-info) ,x (asm ,info ,asm-fpt ,y))])
@@ -917,8 +921,8 @@
   (define-instruction effect inc-profile-counter 
     [(op (x ur mem) (y imm32 ur)) `(asm ,info ,asm-inc-profile-counter ,x ,y)])
 
-  (define-instruction value (trunc)
-    [(op (z ur) (x ur)) `(set! ,(make-live-info) ,z (asm ,info ,asm-trunc ,x))])
+  (define-instruction value (fptrunc)
+    [(op (z ur) (x fpmem fpur)) `(set! ,(make-live-info) ,z (asm ,info ,asm-fptrunc ,x))])
 
   (define-instruction value get-tc
     [(op (z ur))
@@ -1114,7 +1118,7 @@
                      asm-direct-jump asm-return-address asm-jump asm-conditional-jump asm-data-label
                      asm-rp-header asm-rp-compact-header
                      asm-lea1 asm-lea2 asm-indirect-call asm-condition-code
-                     asm-fl-cvt asm-fl-store asm-fl-load asm-fpt asm-trunc asm-div asm-popcount
+                     asm-fl-cvt asm-store-single asm-load-single asm-fpt asm-fptrunc asm-div asm-popcount
                      asm-exchange asm-pause asm-locked-incr asm-locked-decr asm-locked-cmpxchg
                      asm-fpsqrt asm-fpop-2 asm-fpmove asm-fpcast
                      asm-c-simple-call
@@ -2004,12 +2008,12 @@
             [else (sorry! who "unexpected op ~s" op)])))))
 
   (define asm-fl-cvt
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([src (build-mem-opnd base index offset)])
+    (lambda (op)
+      (lambda (code* dest-reg src)
+        (Trivit (src)
           (case op
-            [(load-single->double) (emit sse.cvtss2sd src (cons 'reg flreg) code*)]
-            [(load-double->single) (emit sse.cvtsd2ss src (cons 'reg flreg) code*)])))))
+            [(single->double) (emit sse.cvtss2sd src (cons 'reg dest-reg) code*)]
+            [(double->single) (emit sse.cvtsd2ss src (cons 'reg dest-reg) code*)])))))
 
   (define asm-store-single->double
     (lambda (flreg)
@@ -2018,26 +2022,19 @@
           (emit sse.cvtss2sd flreg flreg
             (emit sse.movsd flreg dest code*))))))
 
-  (define asm-fl-store
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([dest (build-mem-opnd base index offset)])
-          (case op
-            [(store-single) (emit sse.movss (cons 'reg flreg) dest code*)]
-            [(store-double) (emit sse.movsd (cons 'reg flreg) dest code*)])))))
+  (define asm-store-single
+    (lambda (code* dest flreg)
+      (Trivit (dest)
+        (emit sse.movss (cons 'reg flreg) dest code*))))
 
-  (define asm-fl-load
-    (lambda (op flreg)
-      (lambda (code* base index offset)
-        (let ([src (build-mem-opnd base index offset)])
-          (case op
-            [(load-single) (emit sse.movss src (cons 'reg flreg) code*)]
-            [(load-double) (emit sse.movsd src (cons 'reg flreg) code*)])))))
+  (define asm-load-single
+    (lambda (code* flreg src)
+      (Trivit (src)
+        (emit sse.movss src (cons 'reg flreg) code*))))
 
   (define asm-get-double
-    (lambda (flreg)
-      (lambda (code* dst)
-        (emit sse.movd (cons 'reg flreg) (cons 'reg dst) code*))))
+    (lambda (code* dst flreg)
+      (emit sse.movd (cons 'reg flreg) (cons 'reg dst) code*)))
 
   (define asm-fpt
     (lambda (code* dest src)
@@ -2082,11 +2079,10 @@
       (Trivit (dest src)
         (emit sse.movd src dest code*))))
 
-  (define asm-trunc
-    (lambda (code* dest flonumreg)
-      (Trivit (dest)
-        (let ([src `(disp ,(constant flonum-data-disp) ,flonumreg)])
-          (emit sse.cvttsd2si src dest code*)))))
+  (define asm-fptrunc
+    (lambda (code* dest src)
+      (Trivit (dest src)
+        (emit sse.cvttsd2si src dest code*))))
 
   (define asm-load
     (lambda (type)
@@ -2523,7 +2519,8 @@
 
   (define asm-direct-jump
     (lambda (l offset)
-      (asm-helper-jump '() (make-funcrel 'x86_64-jump l offset))))
+      (let ([offset (adjust-return-point-offset offset l)])
+        (asm-helper-jump '() (make-funcrel 'x86_64-jump l offset)))))
 
   (define asm-literal-jump
     (lambda (info)
@@ -2543,10 +2540,11 @@
         (or (cond
               [(local-label-offset l) =>
                (lambda (offset)
-                 (let ([disp (fx- next-addr (fx- offset incr-offset))])
-                   (and (signed-32? disp)
-                        (Trivit (dest)
-                          (emit lea `(riprel ,disp) dest '())))))]
+                 (let ([incr-offset (adjust-return-point-offset incr-offset l)])
+                   (let ([disp (fx- next-addr (fx- offset incr-offset))])
+                     (and (signed-32? disp)
+                          (Trivit (dest)
+                            (emit lea `(riprel ,disp) dest '()))))))]
                [else #f])
             (asm-move '() dest (with-output-language (L16 Triv) `(label-ref ,l ,incr-offset)))))))
 
@@ -2817,17 +2815,15 @@
 
     (module (push-registers pop-registers push-registers-size)
       (define (move-registers regs load?)
-        (define vfp (make-vfp))
-        (define (fp-reg? reg)
-          (let loop ([i (fx- (vector-length vfp) 1)])
-            (or (eq? reg (vector-ref vfp i))
-                (and (fx> i 0) (loop (fx- i 1))))))
+        (define (fp-reg? reg) (eq? (reg-type reg) 'fp))
         (with-output-language (L13 Effect)
           (let loop ([regs regs] [offset 0])
             (let* ([reg (car regs)]
                    [e (cond
                        [(fp-reg? reg)
-                        `(inline ,(make-info-loadfl reg) ,(if load? %load-double %store-double) ,%sp ,%zero (immediate ,offset))]
+                        (if load?
+                            `(set! ,reg ,(%mref ,%sp ,%zero ,offset fp))
+                            `(set! ,(%mref ,%sp ,%zero ,offset fp) ,reg))]
                        [load? `(set! ,reg ,(%mref ,%sp ,offset))]
                        [else `(set! ,(%mref ,%sp ,offset) ,reg)])]
                    [regs (cdr regs)])
@@ -2861,15 +2857,13 @@
         (letrec ([load-double-stack
                   (lambda (offset)
                     (lambda (x) ; requires var
-                      (%seq
-                        (inline ,(make-info-loadfl %fptmp1) ,%load-double ,x ,%zero ,(%constant flonum-data-disp))
-                        (inline ,(make-info-loadfl %fptmp1) ,%store-double ,%sp ,%zero (immediate ,offset)))))]
+                      `(set! ,(%mref ,%sp ,%zero ,offset fp)
+                             ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                  [load-single-stack
                   (lambda (offset)
                     (lambda (x) ; requires var
-                      (%seq
-                        (inline ,(make-info-loadfl %fptmp1) ,%load-double->single ,x ,%zero ,(%constant flonum-data-disp))
-                        (inline ,(make-info-loadfl %fptmp1) ,%store-single ,%sp ,%zero (immediate ,offset)))))]
+                      (%inline store-double->single ,(%mref ,%sp ,%zero ,offset fp)
+                               ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                  [load-int-stack
                   (lambda (offset)
                     (lambda (rhs) ; requires rhs
@@ -2877,17 +2871,18 @@
                  [load-double-reg
                   (lambda (fpreg)
                     (lambda (x) ; requires var
-                      `(inline ,(make-info-loadfl fpreg) ,%load-double ,x ,%zero ,(%constant flonum-data-disp))))]
+                      `(set! ,fpreg ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))]
                  [load-double-reg2
                   (lambda (fpreg ireg)
                     (lambda (x) ; requires var
                       (%seq
-                       (inline ,(make-info-loadfl fpreg) ,%load-double ,x ,%zero ,(%constant flonum-data-disp))
-                       (set! ,ireg (inline ,(make-info-loadfl fpreg) ,%get-double)))))]
+                        (set! ,fpreg ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))
+                        ;; To support the varargs convention, copy the value into a GP register
+                        (set! ,ireg ,(%inline get-double ,fpreg)))))]
                  [load-single-reg
                   (lambda (fpreg)
                     (lambda (x) ; requires var
-                      `(inline ,(make-info-loadfl fpreg) ,%load-double->single ,x ,%zero ,(%constant flonum-data-disp))))]
+                      `(set! ,fpreg ,(%inline double->single ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)))))]
                  [load-int-reg
                   (lambda (type ireg)
                     (lambda (x)
@@ -2934,10 +2929,10 @@
                           (cond
                            [(fx= size 4)
                             ;; Must be the last element
-                            `(inline ,(make-info-loadfl (vector-ref vfp ifp)) ,%load-single ,x ,%zero (immediate ,x-offset))]
+                            `(set! ,(vector-ref vfp ifp) ,(%inline load-single ,(%mref ,x ,%zero ,x-offset fp)))]
                            [else
                             `(seq
-                              (inline ,(make-info-loadfl (vector-ref vfp ifp)) ,%load-double ,x ,%zero (immediate ,x-offset))
+                              (set! ,(vector-ref vfp ifp) ,(%mref ,x ,%zero ,x-offset fp))
                               ,(loop (fx- size 8) iint (fx+ ifp 1) (cdr classes) (fx+ x-offset 8)))])]
                          ;; Remaining cases are integers:
                          [(>= size 8)
@@ -3138,8 +3133,8 @@
                 `(seq
                   ,(loop (cdr classes) (fx+ offset 8) iregs (cdr fpregs) (fx- size 8))
                   ,(case size
-                     [(4) `(inline ,(make-info-loadfl (car fpregs)) ,%store-single ,%rcx ,%zero (immediate ,offset))]
-                     [else `(inline ,(make-info-loadfl (car fpregs)) ,%store-double ,%rcx ,%zero (immediate ,offset))]))]
+                     [(4) (%inline store-single ,(%mref ,%rcx ,%zero ,offset fp) ,(car fpregs))]
+                     [else `(set! ,(%mref ,%rcx ,%zero ,offset fp) ,(car fpregs))]))]
                [else
                 `(seq
                   ,(loop (cdr classes) (fx+ offset 8) (cdr iregs) fpregs (fx- size 8))
@@ -3215,6 +3210,7 @@
                    [result-classes (classify-type result-type)]
                    [result-size (classified-size result-type)]
                    [fill-result-here? (result-fits-in-registers? result-classes)]
+                   [result-reg* (get-result-regs fill-result-here? result-type result-classes)]
                    [adjust-active? (if-feature pthreads (memq 'adjust-active conv*) #f)])
               (with-values (do-args (if fill-result-here? (cdr arg-type*) arg-type*) (make-vint) (make-vfp))
                 (lambda (frame-size nfp locs live* fp-live*)
@@ -3225,17 +3221,17 @@
                           (let* ([t (if adjust-active? %deact t0)] ; need a register if `adjust-active?`
                                  [c-call
                                   (add-deactivate adjust-active? t0 (append fp-live* live*)
-                                   (get-result-regs fill-result-here? result-type result-classes)
+                                   result-reg*
                                    (if-feature windows
                                      (%seq
                                        (set! ,%sp ,(%inline - ,%sp (immediate 32)))
-                                       (inline ,(make-info-kill*-live* (reg-list %rax %rdx) live*) ,%c-call ,t)
+                                       (inline ,(make-info-kill*-live* result-reg* (append fp-live* live*)) ,%c-call ,t)
                                        (set! ,%sp ,(%inline + ,%sp (immediate 32))))
                                      (%seq
                                        ;; System V ABI varargs functions require count of fp regs used in %al register.
                                        ;; since we don't know if the callee is a varargs function, we always set it.
                                        (set! ,%rax (immediate ,nfp))
-                                       (inline ,(make-info-kill*-live* (reg-list %rax %rdx) (cons %rax live*)) ,%c-call ,t))))])
+                                       (inline ,(make-info-kill*-live* result-reg* (cons %rax (append fp-live* live*))) ,%c-call ,t))))])
                             (cond
                              [fill-result-here?
                               (add-fill-result c-call (fx- frame-size (constant ptr-bytes)) result-classes result-size)]
@@ -3243,12 +3239,10 @@
                         (nanopass-case (Ltype Type) result-type
                           [(fp-double-float)
                            (lambda (lvalue)
-                             `(inline ,(make-info-loadfl %Cfpretval) ,%store-double ,lvalue ,%zero
-                                ,(%constant flonum-data-disp)))]
+                             `(set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp) ,%Cfpretval))]
                           [(fp-single-float)
                            (lambda (lvalue)
-                             `(inline ,(make-info-loadfl %Cfpretval) ,%store-single->double ,lvalue ,%zero
-                                ,(%constant flonum-data-disp)))]
+                             `(set! ,(%mref ,lvalue ,%zero ,(constant flonum-data-disp) fp) ,(%inline single->double ,%Cfpretval)))]
                           [(fp-integer ,bits)
                            (case bits
                              [(8) (lambda (lvalue) `(set! ,lvalue ,(%inline sext8 ,%rax)))]
@@ -3322,15 +3316,13 @@
           (define load-double-stack
             (lambda (offset)
               (lambda (x) ; requires var
-                (%seq
-                  (inline ,(make-info-loadfl %fptmp1) ,%load-double ,%sp ,%zero (immediate ,offset))
-                  (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+                `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                       ,(%mref ,%sp ,%zero ,offset fp)))))
           (define load-single-stack
             (lambda (offset)
               (lambda (x) ; requires var
-                (%seq
-                  (inline ,(make-info-loadfl %fptmp1) ,%load-single->double ,%sp ,%zero (immediate ,offset))
-                  (inline ,(make-info-loadfl %fptmp1) ,%store-double ,x ,%zero ,(%constant flonum-data-disp))))))
+                `(set! ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)
+                       ,(%inline load-single->double ,(%mref ,%sp ,%zero ,offset fp))))))
           (define load-int-stack
             (lambda (type offset)
               (lambda (lvalue)
@@ -3370,15 +3362,13 @@
                         [(fp-double-float)
                          (if (< i 4)
                              (%seq
-                               (inline ,(make-info-loadfl (vector-ref vfp i)) ,%store-double
-                                 ,%sp ,%zero (immediate ,isp))
+                               (set! ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp i))
                                ,(f (cdr types) (fx+ i 1) (fx+ isp 8)))
                              (f (cdr types) i isp))]
                         [(fp-single-float)
                          (if (< i 4)
                              (%seq
-                               (inline ,(make-info-loadfl (vector-ref vfp i)) ,%store-single
-                                 ,%sp ,%zero (immediate ,isp))
+                               ,(%inline store-single ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp i))
                                ,(f (cdr types) (fx+ i 1) (fx+ isp 8)))
                              (f (cdr types) i isp))]
                         [(fp-ftd& ,ftd)
@@ -3393,8 +3383,7 @@
                                     (eq? 'float (caar ($ftd->members ftd))))
                                ;; float or double
                                `(seq
-                                 (inline ,(make-info-loadfl (vector-ref vfp i)) ,%store-double
-                                         ,%sp ,%zero (immediate ,isp))
+                                 (set! ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp i))
                                  ,(f (cdr types) (fx+ i 1) (fx+ isp 8)))]
                               [else
                                ;; integer
@@ -3428,15 +3417,13 @@
                         [(fp-double-float)
                          (if (< ifp 8)
                              (%seq
-                               (inline ,(make-info-loadfl (vector-ref vfp ifp)) ,%store-double
-                                 ,%sp ,%zero (immediate ,isp))
+                               (set! ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp ifp))
                                ,(f (cdr types) iint (fx+ ifp 1) (fx+ isp 8)))
                              (f (cdr types) iint ifp isp))]
                         [(fp-single-float)
                          (if (< ifp 8)
                              (%seq
-                               (inline ,(make-info-loadfl (vector-ref vfp ifp)) ,%store-single
-                                 ,%sp ,%zero (immediate ,isp))
+                               ,(%inline store-single ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp ifp))
                                ,(f (cdr types) iint (fx+ ifp 1) (fx+ isp 8)))
                              (f (cdr types) iint ifp isp))]
                         [(fp-ftd& ,ftd)
@@ -3455,8 +3442,7 @@
                                  (f (cdr types) iint ifp isp)]
                                 [(eq? (car classes) 'sse)
                                  `(seq
-                                   (inline ,(make-info-loadfl (vector-ref vfp ifp)) ,%store-double
-                                           ,%sp ,%zero (immediate ,isp))
+                                   (set! ,(%mref ,%sp ,%zero ,isp fp) ,(vector-ref vfp ifp))
                                    ,(reg-loop (cdr classes) iint (fx+ ifp 1) (+ isp 8)))]
                                 [else
                                  `(seq
@@ -3576,7 +3562,7 @@
                            (fx+ offset 8)
                            int*
                            (cdr fp*)
-                           (cons `(inline ,(make-info-loadfl (car fp*)) ,%load-double ,%sp ,%zero (immediate ,offset))
+                           (cons `(set! ,(car fp*) ,(%mref ,%sp ,%zero ,offset fp))
                                  accum)
                            live*
                            (cons (car fp*) fp-live*))]))]
@@ -3589,13 +3575,13 @@
               [(fp-double-float)
                (values
                 (lambda (x)
-                  `(inline ,(make-info-loadfl %Cfpretval) ,%load-double ,x ,%zero ,(%constant flonum-data-disp)))
+                  `(set! ,%Cfpretval ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp)))
                 '()
                 (list %Cfpretval))]
               [(fp-single-float)
                (values
                 (lambda (x)
-                  `(inline ,(make-info-loadfl %Cfpretval) ,%load-double->single ,x ,%zero ,(%constant flonum-data-disp)))
+                  `(set! ,%Cfpretval ,(%inline double->single ,(%mref ,x ,%zero ,(constant flonum-data-disp) fp))))
                 '()
                 (list %Cfpretval))]
               [(fp-void)
@@ -3710,5 +3696,5 @@
                              (set! ,%rbp ,(%inline pop))
                              (set! ,%rbx ,(%inline pop))
                              (set! ,%sp ,(%inline + ,%sp (immediate 136)))))
-                        (asm-c-return ,null-info ,callee-save-regs ... ,result-regs ...))))))))))))))
+                        (asm-c-return ,null-info ,callee-save-regs ... ,result-regs ... ,result-fp-regs ...))))))))))))))
   )