From 0dae7b28e4f73d7700b3ef677fdb387b903bbd52 Mon Sep 17 00:00:00 2001
From: Jared Wasinger <j-wasinger@hotmail.com>
Date: Tue, 4 Aug 2020 20:38:26 -0700
Subject: [PATCH] add poemm C impl of f6m_mul

---
 src/{ => evm}/v1/benchmark.yul |   0
 src/{ => evm}/v1/test.yul      |   0
 src/{ => evm}/v2/benchmark.yul |   0
 src/{ => evm}/v2/test.yul      |   0
 src/native/CMakeLists.txt      |   4 +
 src/native/bigint.h            | 528 +++++++++++++++++++++++++++++++++
 src/native/f6m_mul.c           | 313 +++++++++++++++++++
 7 files changed, 845 insertions(+)
 rename src/{ => evm}/v1/benchmark.yul (100%)
 rename src/{ => evm}/v1/test.yul (100%)
 rename src/{ => evm}/v2/benchmark.yul (100%)
 rename src/{ => evm}/v2/test.yul (100%)
 create mode 100644 src/native/CMakeLists.txt
 create mode 100644 src/native/bigint.h
 create mode 100644 src/native/f6m_mul.c

diff --git a/src/v1/benchmark.yul b/src/evm/v1/benchmark.yul
similarity index 100%
rename from src/v1/benchmark.yul
rename to src/evm/v1/benchmark.yul
diff --git a/src/v1/test.yul b/src/evm/v1/test.yul
similarity index 100%
rename from src/v1/test.yul
rename to src/evm/v1/test.yul
diff --git a/src/v2/benchmark.yul b/src/evm/v2/benchmark.yul
similarity index 100%
rename from src/v2/benchmark.yul
rename to src/evm/v2/benchmark.yul
diff --git a/src/v2/test.yul b/src/evm/v2/test.yul
similarity index 100%
rename from src/v2/test.yul
rename to src/evm/v2/test.yul
diff --git a/src/native/CMakeLists.txt b/src/native/CMakeLists.txt
new file mode 100644
index 0000000..6ad999a
--- /dev/null
+++ b/src/native/CMakeLists.txt
@@ -0,0 +1,4 @@
+cmake_minimum_required (VERSION 3.5)  # CMake >= 4.0 rejects policy versions below 3.5
+project (f6m_mul_native C)            # C-only project, so no C++ compiler is probed
+
+add_executable(f6m_mul_native f6m_mul.c)
diff --git a/src/native/bigint.h b/src/native/bigint.h
new file mode 100644
index 0000000..8d1690a
--- /dev/null
+++ b/src/native/bigint.h
@@ -0,0 +1,528 @@
+#ifndef BIGINT_H
+#define BIGINT_H
+
+#if !WASM	// WASM undefined or 0: take the fixed-width integer types from the standard header
+#include <stdint.h>
+#else		// WASM set: <stdint.h> is not included, so the fixed-width types are declared by hand
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+typedef unsigned __int128 uint128_t;
+#endif
+
+#define uint128_t __uint128_t	// from here on uint128_t (e.g. produced by TYPE(128)) expands to the compiler-provided __uint128_t
+
+/*
+This header-only library has three parameters:
+BIGINT_BITS - the number of bits of the big integer
+LIMB_BITS - the number of bits in each limb, must correspond to a uint*_t type
+LIMB_BITS_OVERFLOW - the number of bits output by multiplication, i.e. 2*LIMB_BITS, must correspond to a uint*_t type
+
+To use this library, define a limb size and include it:
+  #define BIGINT_BITS 256
+  #define LIMB_BITS 64
+  #define LIMB_BITS_OVERFLOW 128
+  #include "bigint.h"
+  #undef BIGINT_BITS
+  #undef LIMB_BITS
+  #undef LIMB_BITS_OVERFLOW
+ 
+And if you need other sizes, #undef BIGINT_H first (otherwise the include guard skips the second #include), then define them:
+  #define BIGINT_BITS 512
+  #define LIMB_BITS 32
+  #define LIMB_BITS_OVERFLOW 64
+  #include "bigint.h"
+  #undef BIGINT_BITS
+  #undef LIMB_BITS
+  #undef LIMB_BITS_OVERFLOW
+
+Now you can use functions like:
+  mulmodmont256_64bitlimbs(out,x,y,m,inv);
+  sub512_32bitlimbs(out,a,b);
+
+Warning: LIMB_BITS corresponds to the uint*_t type, and multiplication requires double the bits, for example 64-bit limbs require type uint128_t, which may be unavailable on some targets like Wasm.
+*/
+
+
+// Macros used in this file; they expand using the BIGINT_BITS, LIMB_BITS and LIMB_BITS_OVERFLOW supplied by the includer:
+
+// define types UINT and UINT2, where UINT2 is for overflow of operations on UINT; for multiplication it should be double the number of bits
+// UINT is the limb type, uint*_t where * is the number of bits per limb, eg uint32_t
+// UINT2 is also needed for multiplication UINT x UINT -> UINT2, e.g. uint32_t x uint32_t -> uint64_t or uint64_t x uint64_t -> uint128_t
+#define TYPE_(num) uint##num##_t	// token pasting, e.g. TYPE_(64) -> uint64_t
+#define TYPE(num) TYPE_(num)	// extra level so a macro argument such as LIMB_BITS is expanded before it is pasted
+#define UINT TYPE(LIMB_BITS)
+#define UINT2 TYPE(LIMB_BITS_OVERFLOW)
+
+// define NUM_LIMBS to be the number of limbs (integer division of BIGINT_BITS by LIMB_BITS)
+// eg UINT=uint32_t with NUM_LIMBS=8 limbs is for 256-bit
+// eg UINT=uint64_t with NUM_LIMBS=8 limbs is for 512-bit
+#define NUM_LIMBS (BIGINT_BITS/LIMB_BITS)
+
+// define the function name by concatenating the base name with the two size parameters
+// eg BIGINT_BITS=256, LIMB_BITS=64: FUNCNAME(myname) is replaced with myname256_64bitlimbs
+#define FUNCNAME__(name,A,B) name##A##_##B##bitlimbs	// does the actual pasting
+#define FUNCNAME_(name,A,B) FUNCNAME__(name,A,B)	// extra level so BIGINT_BITS and LIMB_BITS are expanded to their values first
+#define FUNCNAME(name) FUNCNAME_(name,BIGINT_BITS,LIMB_BITS)
+
+
+
+// limb-wise addition out = x+y, wrapping at NUM_LIMBS limbs; returns the carry out of the top limb (0 or 1)
+// algorithm 14.7, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+//   except the final carry of step 3 is returned rather than stored, since out has no extra limb
+UINT FUNCNAME(add)(UINT* const out, const UINT* const x, const UINT* const y){
+  UINT carry=0;
+  #pragma unroll
+  for (int limb=0; limb<NUM_LIMBS; limb++){
+    const UINT with_carry = x[limb]+carry;	// wrapped iff the result is smaller than the carry just added
+    out[limb] = with_carry+y[limb];		// wrapped iff the result is smaller than with_carry
+    carry = (with_carry<carry) | (out[limb]<with_carry);
+  }
+  return carry;
+}
+
+// limb-wise subtraction out = x-y; returns the borrow out of the top limb (1 when x<y, otherwise 0)
+// algorithm 14.9, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+// for x<y the limbs wrap, so out holds the two's complement of the difference; the borrow is kept as +1 and subtracted, where the book adds c=-1
+UINT FUNCNAME(sub)(UINT* const out, const UINT* const x, const UINT* const y){
+  UINT borrow=0;
+  #pragma unroll
+  for (int limb=0; limb<NUM_LIMBS; limb++){
+    const UINT minuend = x[limb]-borrow;
+    borrow = (x[limb]<borrow) | (minuend<y[limb]);	// either of the two subtractions wrapped
+    out[limb] = minuend-y[limb];
+  }
+  return borrow;
+}
+
+
+// returns 1 if x<y and 0 otherwise; limbs are compared from most to least significant
+uint8_t FUNCNAME(less_than)(const UINT* const x, const UINT* const y){
+  int limb = NUM_LIMBS;
+  while (limb-- > 0){
+    // the first limb that differs decides the comparison
+    if (x[limb]!=y[limb])
+      return (uint8_t)(x[limb]<y[limb]);
+  }
+  // every limb matched, so x==y and x<y is false
+  return 0;
+}
+
+// returns 1 if x<=y and 0 otherwise; limbs are compared from most to least significant
+uint8_t FUNCNAME(less_than_or_equal)(const UINT* const x, const UINT* const y){
+  int limb = NUM_LIMBS;
+  while (limb-- > 0){
+    // the first limb that differs decides the comparison
+    if (x[limb]!=y[limb])
+      return (uint8_t)(x[limb]<y[limb]);
+  }
+  // every limb matched, so x==y and x<=y is true
+  return 1;
+}
+
+
+// computes quotient q = x/y and remainder r = x%y, i.e. x = q*y + r
+// algorithm 14.20, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+// it works, but the implementation is naive (repeated subtraction, one loop iteration per unit of quotient), see notes
+// for y==0 (division undefined) or y>x the result is q=0, r=x; q and r should not overlap x or y
+void FUNCNAME(div)(UINT* const q, UINT* const r, const UINT* const x, const UINT* const y){
+
+  for (int i=0; i<NUM_LIMBS; i++){
+    q[i]=0;
+  }
+
+  // book has n and t given, we compute these
+  int n = 0;  // idx of most significant nonzero limb of x (0 if x==0)
+  int ytop = -1;  // idx of most significant nonzero limb of y, searched over ALL limbs (-1 if y==0)
+  for (int i=NUM_LIMBS-1;i>=0;i--){
+    if (x[i] != 0) { n=i; break; }
+  }
+  for (int i=NUM_LIMBS-1;i>=0;i--){
+    if (y[i] != 0) { ytop=i; break; }
+  }
+
+  // not in the textbook
+  // special case for y=1, this hack is needed for now
+  if( ytop==0 && y[0]==1 ){
+    for (int i=0;i<NUM_LIMBS;i++){
+      r[i]=0;
+      q[i]=x[i];
+    }
+    return;
+  }
+
+  // save input x from getting clobbered below
+  // note that x_ it will end up as remainder
+  UINT *x_ = r;
+  for (int i=0; i<NUM_LIMBS; i++){
+    x_[i]=x[i];
+  }
+
+  /* WIP
+  // step 1 in book
+  for (int j=0;j<n-t;j++)
+    q[i]=0;
+
+  // step 2 in book
+  // first get y*b^{n-t} by shifting y up by n-t limbs
+  UINT y_n_t[NUM_LIMBS];
+  for (int i=NUM_LIMBS;i>t;i--)
+    y_n_t[i] = 0;
+  for (int i=t;i>n-t;i--)
+    y_n_t[i] = y[i+n-t];
+  for (int i=n-t;i>=0;i--)
+    y_n_t[i] = 0;
+  // now the while subtract loop
+  while (FUNCNAME(less_than_or_equal)(y_n_t,x)){
+    q[n-t]+=1;
+    FUNCNAME(sub(x_,x,y_n_t));
+  }
+
+  // step 3 in book
+  // TODO
+  */
+  // not in the textbook: y==0 would never terminate below, and a y with limbs above limb n (so y>x)
+  // would be truncated by the limb 0..n loops below; in both cases q=0 and r=x are already in place
+  if (ytop<0 || ytop>n) return;
+  // THIS IS A NAIVE IMPLEMENTATION OF WHAT IS IN THE BOOK
+  // naive loop: while( y<=x_ ) { q++; x_=x_-y }
+
+  // leq = (y<=x_); limbs above n are zero in both numbers, so only limbs 0..n are compared
+  UINT leq = 1;
+  for (int i=n;i>=0;i--){
+    if (y[i]>x_[i]){ leq = 0; break;}
+    else if (y[i]<x_[i]){ leq = 1; break;}
+  }
+
+  while (leq){
+
+    // q = q + 1
+    for(int i=0;i<=n;i++){
+      q[i]+=1;
+      if(q[i]!=0)
+	break;
+    }
+
+    // x_ = x_ - y
+    UINT c=0;
+    for (int i=0; i<=n; i++){
+      UINT temp = x_[i]-c;
+      c = (temp<y[i] || x_[i]<c) ? 1:0;
+      x_[i] = temp-y[i];
+    }
+
+    // leq = (y<=x_); if all limbs are equal leq keeps its value 1, which is the right answer
+    for (int i=n;i>=0;i--){
+      if (y[i]>x_[i]){ leq = 0; break;}
+      else if (y[i]<x_[i]){ leq = 1; break;}
+    }
+
+  }
+
+}
+
+// schoolbook multiplication out = x*y, algorithm 14.12, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+// both operands have NUM_LIMBS limbs here (the book allows different lengths; NUM_LIMBS corresponds to n+1 in the book)
+// out receives 2*NUM_LIMBS limbs
+// the product is accumulated in a local array and copied to out at the end
+void FUNCNAME(mul)(UINT* const out, const UINT* const x, const UINT* const y){
+  UINT acc[NUM_LIMBS*2];
+  for (int k=0; k<2*NUM_LIMBS; k++)
+    acc[k]=0;
+  #pragma unroll
+  for (int row=0; row<NUM_LIMBS; row++){
+    UINT carry = 0;
+    #pragma unroll
+    for (int col=0; col<NUM_LIMBS; col++){
+      // limb product + accumulator limb + carry cannot exceed the range of UINT2
+      UINT2 t = (UINT2)x[col]*y[row];
+      t += acc[row+col];
+      t += carry;
+      acc[row+col] = (UINT)t;			// low half stays in this limb
+      carry = (UINT)(t >> LIMB_BITS);		// high half moves on to the next limb
+    }
+    acc[row+NUM_LIMBS] = carry;
+  }
+  for (int k=0; k<2*NUM_LIMBS; k++)
+    out[k]=acc[k];
+}
+
+// squaring out = x*x, algorithm 14.16, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+// NUM_LIMBS is t (number of limbs) in the book, and the base is 2^LIMB_BITS (limb type UINT, usually uint32_t or uint64_t)
+// output out should have double the limbs of input x; the result is built in the local array w and copied to out at the end
+void FUNCNAME(square)(UINT* const out, const UINT* const x){
+  UINT w[NUM_LIMBS*2];
+  for (int i=0; i< 2*NUM_LIMBS; i++)
+    w[i]=0;
+  for (int i=0; i<NUM_LIMBS; i++){
+    UINT2 uv = (UINT2)(x[i])*x[i] + w[2*i];	// diagonal term x[i]^2 plus what is already accumulated
+    UINT u = (UINT)(uv >> LIMB_BITS);	// uv / base
+    UINT v = (UINT)uv;			// uv % base
+    w[2*i] = v;
+    UINT c = u;
+    for (int j=i+1; j<NUM_LIMBS; j++){
+      UINT2 xixj = (UINT2)(x[i])*x[j];
+      UINT2 partial_sum = xixj + c + w[i+j];	// cannot exceed the range of UINT2
+      uv = xixj + partial_sum;			// off-diagonal term counted twice; this addition may wrap
+      u = (UINT)(uv >> LIMB_BITS);	// uv / base
+      v = (UINT)uv;			// uv % base
+      w[i+j] = v;
+      c = u;
+      // uv wrapped past UINT2: the lost bit is worth one unit of limb i+j+2, so add it there and ripple through all-ones limbs
+      // author's note: this passes some tests, but needs review
+      if (uv<partial_sum){
+        int k=2;
+        while ( i+j+k<NUM_LIMBS*2 && w[i+j+k]==(UINT)0-1 ){ // note (UINT)0-1 is the all-ones limb, e.g. 0xffffffff for 32-bit limbs
+          w[i+j+k]=0;
+          k++;
+        }
+        if (i+j+k<NUM_LIMBS*2)
+          w[i+j+k]+=1;
+      }
+    }
+    // author's note: this passes some tests, but not sure if += is correct
+    w[i+NUM_LIMBS] += u;	// NOTE(review): no carry is propagated if this addition wraps - confirm it cannot
+  }
+  for (int i=0; i< 2*NUM_LIMBS; i++)
+    out[i]=w[i];
+}
+
+
+
+////////////////////////
+// Modular arithmetic //
+////////////////////////
+
+
+// out = x+y (mod m), fully reduced only when x,y < m
+// 14.27, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+void FUNCNAME(addmod)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m){
+  const UINT overflowed = FUNCNAME(add)(out,x,y);
+  // textbook 14.27: modular addition is a plain addition plus one subtraction of m iff x+y>=m (the sum wrapped, or m<=out)
+  const int needs_reduction = overflowed || FUNCNAME(less_than_or_equal)(m,out);
+  if (needs_reduction)
+    FUNCNAME(sub)(out, out, m);
+  // the case x+y-m>m is not handled, because our crypto application keeps x,y<m
+}
+
+// out = x-y (mod m), valid for x,y < m
+// uses fact 14.27, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+void FUNCNAME(submod)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m){
+  // a borrow out of the top limb means x<y: out then holds the wrapped (two's complement) difference
+  if (FUNCNAME(sub)(out,x,y) != 0){
+    // adding m wraps around once more and leaves m-(y-x)
+    FUNCNAME(add)(out, m, out);
+  }
+  // the case x-y>m is not handled, because our crypto application keeps x,y<m
+}
+
+
+// Montgomery reduction of the double-width value T into out (NUM_LIMBS limbs); used on the product of two Montgomery-form values
+// algorithm 14.32, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+// T has 2*NUM_LIMBS limbs, otherwise pad most significant bits with zeros; inv is presumably the book's m' = -m^(-1) mod 2^LIMB_BITS - confirm with callers
+void FUNCNAME(montreduce)(UINT* const out, const UINT* const T, const UINT* const m, const UINT inv){
+
+  UINT A[NUM_LIMBS*2+1];	// working copy of T plus one extra limb for the final carry
+  for (int i=0; i<2*NUM_LIMBS; i++)
+    A[i] = T[i];
+  A[NUM_LIMBS*2]=0;
+  for (int i=0; i<NUM_LIMBS; i++){
+    UINT ui = A[i]*inv;		// multiplier for this round, taken mod 2^LIMB_BITS by the limb type
+    UINT carry=0;
+    int j;
+    // add ui*m*b^i to A in a loop, since m is NUM_LIMBS long
+    for (j=0; j<NUM_LIMBS; j++){
+      UINT2 sum = (UINT2)ui*m[j] + A[i+j] + carry;
+      A[i+j] = (UINT)sum;		// sum % b
+      carry = (UINT)(sum >> LIMB_BITS);	// sum / b
+    }
+    // carry may be nonzero, so keep carrying into the higher limbs (j==NUM_LIMBS here), up to and including the extra limb
+    int k=0;
+    while (carry && i+j+k<2*NUM_LIMBS+1){
+      UINT2 sum = (UINT2)(A[i+j+k])+carry;
+      A[i+j+k] = (UINT)sum;		// sum % b
+      carry = (UINT)(sum >> LIMB_BITS);	// sum / b
+      k+=1;
+    }
+  }
+
+  // instead of right shift by NUM_LIMBS limbs, we just read the upper half of A
+  for (int i=0; i<NUM_LIMBS; i++)
+    out[i] = A[i+NUM_LIMBS];
+
+  // final subtraction of m, only if the extra limb is set or m<=out
+  if (A[NUM_LIMBS*2] || FUNCNAME(less_than_or_equal)(m,out)){
+    FUNCNAME(sub)(out, out, m);
+  }
+}
+
+// Montgomery squaring as two separate passes: full square (algorithm 14.16), then Montgomery reduction (algorithm 14.32)
+// this might be faster than algorithm 14.36, as described in remark 14.40
+void FUNCNAME(montsquare)(UINT* const out, const UINT* const x, const UINT* const m, const UINT inv){
+  UINT wide[NUM_LIMBS*2];	// double-width x*x
+  FUNCNAME(square)(wide, x);
+  FUNCNAME(montreduce)(out, wide, m, inv);
+}
+
+// Montgomery multiplication as two separate passes: full product (algorithm 14.12), then Montgomery reduction (algorithm 14.32)
+// this might be slower than algorithm 14.36, which interleaves these steps
+// Known as the Separated Operand Scanning (SOS) Method
+void FUNCNAME(mulmodmontSOS)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){
+  UINT wide[NUM_LIMBS*2];	// double-width x*y
+  FUNCNAME(mul)(wide, x, y);
+  FUNCNAME(montreduce)(out, wide, m, inv);
+}
+
+// Montgomery multiplication with interleaved reduction, algorithm 14.36, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf
+void FUNCNAME(mulmodmontHAC)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){
+  UINT A[NUM_LIMBS*2+1];	// accumulator; round i works on limbs i..i+NUM_LIMBS instead of shifting A right
+  for (int i=0;i<NUM_LIMBS*2+1;i++)
+    A[i]=0;
+  #pragma unroll	// this unroll increases binary size by a lot
+  for (int i=0; i<NUM_LIMBS; i++){
+    UINT ui = (A[i]+x[i]*y[0])*inv;	// multiplier of m for this round, computed mod 2^LIMB_BITS
+    UINT carry = 0;
+    #pragma unroll
+    for (int j=0; j<NUM_LIMBS; j++){
+      UINT2 xiyj = (UINT2)x[i]*y[j];
+      UINT2 uimj = (UINT2)ui*m[j];
+      UINT2 partial_sum = xiyj+carry;		// cannot exceed the range of UINT2
+      UINT2 sum = uimj+A[i+j]+partial_sum;	// adding the two double-width terms may wrap
+      A[i+j] = (UINT)sum;
+      carry = (UINT)(sum>>LIMB_BITS);
+      // if the sum wrapped past UINT2, the lost bit is worth one unit of limb i+j+2:
+      if (sum<partial_sum){
+        int k=2;
+        while ( i+j+k<NUM_LIMBS*2 && A[i+j+k]==(UINT)0-1 ){ // note (UINT)0-1 is the all-ones limb, e.g. 0xffffffff for 32-bit limbs
+	  // this is rare, need limb to be all 1's
+          A[i+j+k]=0;
+          k++;
+        }
+        if (i+j+k<NUM_LIMBS*2+1){
+          A[i+j+k]+=1;
+	}
+      }
+      //printf("%d %d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",i,j,x[i],x[i]*y[0],ui,xiyj,uimj,partial_sum,sum,A[i+j],carry);
+    }
+    A[i+NUM_LIMBS]+=carry;	// NOTE(review): no carry is propagated if this addition wraps - confirm it cannot
+  }
+
+  // instead of right shift by NUM_LIMBS limbs, we just read the upper half of A
+  for (int i=0; i<NUM_LIMBS;i++)
+    out[i] = A[i+NUM_LIMBS];
+
+  // final subtraction of m, only if the extra limb is set or m<=out
+  if (A[NUM_LIMBS*2]>0 || FUNCNAME(less_than_or_equal)(m,out))
+    FUNCNAME(sub)(out, out, m);
+}
+
+// Montgomery multiplication in the style of the FIOS method (as the function name says) from paper Çetin K. Koç; Tolga Acar; Burton S. Kaliski, Jr. (June 1996). "Analyzing and Comparing Montgomery Multiplication Algorithms". IEEE Micro. 16 (3): 26–33.
+void FUNCNAME(mulmodmontFIOS)(UINT* const out, const UINT* const a, const UINT* const b, const UINT* const mod, const UINT inv){
+  UINT t[NUM_LIMBS+2];	// accumulator with two spare limbs for carries
+  for (int i=0;i<NUM_LIMBS+2;i++)
+    t[i]=0;
+  #pragma unroll	// this unroll increases binary size by a lot
+  for (int i=0; i<NUM_LIMBS; i++){
+    UINT carry = 0;
+    UINT2 sum = 0;
+    sum = (UINT2)(t[0])+(UINT2)(a[0])*b[i];
+    carry = (UINT)(sum>>LIMB_BITS);
+    int k=1;
+    while (carry && k<=NUM_LIMBS+1){	// push the carry into t[1..] right away
+      UINT2 temp = (UINT2)t[k] + carry;
+      t[k]=(UINT)temp;
+      carry = (UINT)(temp >> LIMB_BITS);
+      k++;
+    }
+    UINT m = ((UINT)sum)*inv;	// multiplier of mod for this round, computed mod 2^LIMB_BITS
+    sum = (UINT)sum + (UINT2)m*mod[0]; // lower limb of sum should be zero; it is dropped, only the carry is kept
+    carry = (UINT)(sum >> LIMB_BITS);
+    #pragma unroll
+    for (int j=1; j<NUM_LIMBS; j++){
+      sum = (UINT2)t[j] + (UINT2)a[j]*b[i] + carry;
+      carry = (UINT)(sum >> LIMB_BITS);
+      k=j+1;
+      while (carry && k<=NUM_LIMBS+1){	// push the carry into t[j+1..] right away
+        UINT2 temp = (UINT2)t[k] + carry;
+        t[k]=(UINT)temp;
+        carry = (UINT)(temp >> LIMB_BITS);
+        k++;
+      }
+      sum = (UINT)sum + (UINT2)m*mod[j];
+      carry = (UINT)(sum>>LIMB_BITS);
+      t[j-1] = (UINT)sum;	// storing one limb lower performs the division by the base
+    }
+    sum = (UINT2)t[NUM_LIMBS] + carry;
+    carry = (UINT)(sum >> LIMB_BITS);
+    t[NUM_LIMBS-1] = (UINT)sum;
+    t[NUM_LIMBS] = t[NUM_LIMBS+1]+carry;	// the two spare limbs are shifted down as well
+    t[NUM_LIMBS+1] = 0;
+  }
+
+  // output the low NUM_LIMBS limbs of t
+  for (int i=0; i<NUM_LIMBS;i++)
+    out[i] = t[i];
+
+  // final subtraction of mod, only if the spare limb is set or mod<=out
+  if (t[NUM_LIMBS]>0 || FUNCNAME(less_than_or_equal)(mod,out))
+    FUNCNAME(sub)(out, out, mod);
+}
+
+// default Montgomery multiplication; mulmodmontCIOS forwards to this function and its comment names the method and the paper
+void FUNCNAME(mulmodmont)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){
+  UINT A[NUM_LIMBS+2];	// accumulator with two spare limbs; out is written only after the loops, from this local copy
+  for (int i=0;i<NUM_LIMBS+2;i++)
+    A[i]=0;
+  #pragma unroll	// this unroll increases binary size by a lot
+  for (int i=0; i<NUM_LIMBS; i++){
+    UINT carry = 0;
+    UINT2 sum = 0;
+    #pragma unroll
+    for (int j=0; j<NUM_LIMBS; j++){	// first pass: A += x[i]*y
+      sum = (UINT2)A[j] + (UINT2)x[i]*y[j] + carry;
+      carry = (UINT)(sum>>LIMB_BITS);
+      A[j] = (UINT)sum;
+    }
+    sum = (UINT2)(A[NUM_LIMBS]) + carry;
+    carry = (UINT)(sum>>LIMB_BITS);
+    A[NUM_LIMBS] = (UINT) sum;
+    A[NUM_LIMBS+1] = carry;
+    UINT A0inv = A[0]*inv;	// multiplier of m for this round, computed mod 2^LIMB_BITS
+    sum = (UINT2)(A[0]) + (UINT2)A0inv*m[0];	// the low limb of this sum is dropped, only the carry is kept
+    carry = (UINT)(sum>>LIMB_BITS);
+    #pragma unroll
+    for (int j=1; j<NUM_LIMBS; j++){	// second pass: A += A0inv*m, stored one limb lower (division by the base)
+      sum = (UINT2)(A[j]) + (UINT2)A0inv*m[j] + carry;
+      carry = (UINT)(sum>>LIMB_BITS);
+      A[j-1] = (UINT)sum;
+    }
+    sum = (UINT2)(A[NUM_LIMBS])+carry;
+    carry = (UINT)(sum>>LIMB_BITS);
+    A[NUM_LIMBS-1]=(UINT)sum;
+    A[NUM_LIMBS]=A[NUM_LIMBS+1]+carry;
+  }
+
+  // copy the low NUM_LIMBS limbs to out
+  for (int i=0; i<NUM_LIMBS;i++)
+    out[i] = A[i];
+
+  // final subtraction of m, only if the spare limb is set or m<=out
+  if (A[NUM_LIMBS]>0 || FUNCNAME(less_than_or_equal)(m,out))
+    FUNCNAME(sub)(out, out, m);
+}
+
+// Uses CIOS method for montgomery multiplication, based on algorithm from (but using notation of above mulmodmont) Çetin K. Koç; Tolga Acar; Burton S. Kaliski, Jr. (June 1996). "Analyzing and Comparing Montgomery Multiplication Algorithms". IEEE Micro. 16 (3): 26–33.
+// Known as the Coarsely Integrated Operand Scanning (CIOS) method; this is only an alias that forwards to mulmodmont
+void FUNCNAME(mulmodmontCIOS)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){
+  FUNCNAME(mulmodmont)(out, x, y, m, inv);
+}
+
+// like mulmodmont, but with the modulus pointer and inv hard-coded; the values below are placeholders to be replaced before use
+void FUNCNAME(mulmodmont_3args_)(UINT* const out, const UINT* const x, const UINT* const y){
+  UINT* m = (UINT*)4444444;    // hard-code m or address to m here (as written this is an arbitrary address that mulmodmont would read from)
+  UINT inv = 6666666;  // hard-code inv here
+  FUNCNAME(mulmodmont)(out, x, y, m, inv);
+}
+
+#endif
diff --git a/src/native/f6m_mul.c b/src/native/f6m_mul.c
new file mode 100644
index 0000000..e8915c8
--- /dev/null
+++ b/src/native/f6m_mul.c
@@ -0,0 +1,313 @@
+/*
+This is a direct translation of: https://github.com/ewasm/evm384_f6m_mul/blob/master/src/v2/test.yul
+
+To execute:
+git clone https://github.com/poemm/bigint_experiments.git
+cd bigint_experiments
+git clone https://gist.github.com/4ad8279ea1693c13a16b134970d67101.git f6m_mul_gist
+cd f6m_mul_gist
+gcc f6m_mul.c -o f6m_mul -O4 -march=native
+./f6m_mul
+*/
+
+
+#include<stdio.h>
+
+#define BIGINT_BITS 384
+#define LIMB_BITS 64
+#define LIMB_BITS_OVERFLOW 128
+#include "bigint.h"
+
+void mulNR2(uint8_t* x0, uint8_t* x1, uint8_t* r0, uint8_t* r1, uint8_t* modulus){
+  uint64_t* const p = (uint64_t*)modulus;	// r = x*(1+u) for x = x0 + x1*u with u*u = -1, i.e. (x0-x1) + (x0+x1)*u
+  FUNCNAME(submod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)x1,p);	// r0 = x0-x1
+  FUNCNAME(addmod)((uint64_t*)r1,(uint64_t*)x0,(uint64_t*)x1,p);	// r1 = x0+x1
+}
+
+void f2m_add(uint8_t* x0, uint8_t* x1, uint8_t* y0, uint8_t* y1, uint8_t* r0, uint8_t* r1, uint8_t* modulus, uint8_t* arena){
+  uint64_t* const p = (uint64_t*)modulus;	// component-wise r = x+y (mod p); arena is accepted but not used
+  FUNCNAME(addmod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)y0,p);	// r0 = x0+y0
+  FUNCNAME(addmod)((uint64_t*)r1,(uint64_t*)x1,(uint64_t*)y1,p);	// r1 = x1+y1
+}
+
+void f2m_sub(uint8_t* x0, uint8_t* x1, uint8_t* y0, uint8_t* y1, uint8_t* r0, uint8_t* r1, uint8_t* modulus, uint8_t* arena){
+  uint64_t* const p = (uint64_t*)modulus;	// component-wise r = x-y (mod p); arena is accepted but not used
+  FUNCNAME(submod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)y0,p);	// r0 = x0-y0
+  FUNCNAME(submod)((uint64_t*)r1,(uint64_t*)x1,(uint64_t*)y1,p);	// r1 = x1-y1
+}
+
+void f2m_mul(uint8_t* x, uint8_t* y, uint8_t* r, uint8_t* modulus, uint64_t inv, uint8_t* mem){
+  // r = x*y for two-component elements (components at +0 and +64) using three Montgomery multiplications; r must not overlap x or y
+  uint8_t* tmp = mem+64;	// scratch limbs at mem+64
+  uint8_t* tmp2 = tmp+64;	// scratch limbs at mem+128
+  uint64_t zero[NUM_LIMBS] = {0};	// local constant 0, so nothing depends on the caller supplying zero-filled scratch at mem+192
+ 
+  FUNCNAME(mulmodmont)((uint64_t*)tmp2,(uint64_t*)(x+64),(uint64_t*)(y+64),(uint64_t*)modulus,inv);	// tmp2 = x1*y1
+  FUNCNAME(submod)((uint64_t*)r,zero,(uint64_t*)tmp2,(uint64_t*)modulus);	// r0 = -x1*y1
+  FUNCNAME(mulmodmont)((uint64_t*)tmp,(uint64_t*)x,(uint64_t*)y,(uint64_t*)modulus,inv);	// tmp = x0*y0
+  FUNCNAME(addmod)((uint64_t*)r,(uint64_t*)r,(uint64_t*)tmp,(uint64_t*)modulus);	// r0 = x0*y0 - x1*y1
+
+  // tmp2 still holds x1*y1 from above, so that product is not recomputed here
+  FUNCNAME(addmod)((uint64_t*)tmp2,(uint64_t*)tmp,(uint64_t*)tmp2,(uint64_t*)modulus);	// tmp2 = x0*y0 + x1*y1
+  FUNCNAME(addmod)((uint64_t*)tmp,(uint64_t*)y,(uint64_t*)(y+64),(uint64_t*)modulus);	// tmp = y0+y1
+  FUNCNAME(addmod)((uint64_t*)(r+64),(uint64_t*)x,(uint64_t*)(x+64),(uint64_t*)modulus);	// r1 = x0+x1
+  FUNCNAME(mulmodmont)((uint64_t*)(r+64),(uint64_t*)(r+64),(uint64_t*)tmp,(uint64_t*)modulus,inv);	// r1 = (x0+x1)*(y0+y1)
+  FUNCNAME(submod)((uint64_t*)(r+64),(uint64_t*)(r+64),(uint64_t*)tmp2,(uint64_t*)modulus);	// r1 = x0*y1 + x1*y0
+
+}
+
+void f6m_mul_r2(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r2, uint8_t* modulus, uint64_t inv, uint8_t* mem){
+  // r2 = (a+c)*(A+C) - (aA+cC) + bB, where a,b,c / A,B,C are the 128-byte slots at +0,+128,+256 of abc / ABC
+  uint8_t* t0 = mem;
+  uint8_t* t1 = t0+128;
+  uint8_t* t2 = t1+128;
+  uint8_t* scratch = t2+128;	// remaining work area, handed down to the helpers
+
+  f2m_add(abc,abc+64,abc+256,abc+320,t0,t0+64,modulus,scratch);	// t0 = a+c
+  f2m_add(ABC,ABC+64,ABC+256,ABC+320,t1,t1+64,modulus,scratch);	// t1 = A+C
+
+  f2m_mul(t0,t1,t2,modulus,inv,scratch);				// t2 = t0*t1
+  f2m_add(aA,aA+64,cC,cC+64,t0,t0+64,modulus,scratch);		// t0 = aA+cC
+  f2m_sub(t2,t2+64,t0,t0+64,t1,t1+64,modulus,scratch);		// t1 = t2-t0
+  f2m_add(t1,t1+64,bB,bB+64,r2,r2+64,modulus,scratch);		// r2 = t1+bB
+}
+
+void f6m_mul_r1(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r1, uint8_t* modulus, uint64_t inv, uint8_t* mem){
+  // r1 = (A+B)*(a+b) - (aA+bB) + mulNR2(cC), where a,b / A,B are the 128-byte slots at +0,+128 of abc / ABC
+  uint8_t* t0 = mem;
+  uint8_t* t1 = t0+128;
+  uint8_t* t2 = t1+128;
+  uint8_t* scratch = t2+128;	// remaining work area, handed down to the helpers
+
+  f2m_add(abc,abc+64,abc+128,abc+192,t0,t0+64,modulus,scratch);	// t0 = a+b
+  f2m_add(ABC,ABC+64,ABC+128,ABC+192,t1,t1+64,modulus,scratch);	// t1 = A+B
+
+  f2m_mul(t1,t0,t2,modulus,inv,scratch);				// t2 = t1*t0
+  f2m_add(aA,aA+64,bB,bB+64,t0,t0+64,modulus,scratch);		// t0 = aA+bB
+  f2m_sub(t2,t2+64,t0,t0+64,t1,t1+64,modulus,scratch);		// t1 = t2-t0
+  mulNR2(cC,cC+64,t0,t0+64,modulus);				// t0 = mulNR2(cC)
+  f2m_add(t1,t1+64,t0,t0+64,r1,r1+64,modulus,scratch);		// r1 = t1+t0
+
+}
+
+void f6m_mul_r0(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r0, uint8_t* modulus, uint64_t inv, uint8_t* mem){
+  // r0 = mulNR2((b+c)*(B+C) - (bB+cC)) + aA, where b,c / B,C are the 128-byte slots at +128,+256 of abc / ABC
+  uint8_t* t0 = mem;
+  uint8_t* t1 = t0+128;
+  uint8_t* t2 = t1+128;
+  uint8_t* scratch = t2+128;	// remaining work area, handed down to the helpers
+
+  f2m_add(abc+128,abc+192,abc+256,abc+320,t0,t0+64,modulus,scratch);	// t0 = b+c
+  f2m_add(ABC+128,ABC+192,ABC+256,ABC+320,t1,t1+64,modulus,scratch);	// t1 = B+C
+
+  f2m_mul(t0,t1,t2,modulus,inv,scratch);				// t2 = t0*t1
+  f2m_add(bB,bB+64,cC,cC+64,t0,t0+64,modulus,scratch);		// t0 = bB+cC
+  f2m_sub(t2,t2+64,t0,t0+64,t1,t1+64,modulus,scratch);		// t1 = t2-t0
+  mulNR2(t1,t1+64,t2,t2+64,modulus);				// t2 = mulNR2(t1)
+  f2m_add(t2,t2+64,aA,aA+64,r0,r0+64,modulus,scratch);		// r0 = t2+aA
+}
+
+void f6m_mul(uint8_t* abc, uint8_t* ABC, uint8_t* r, uint8_t* modulus, uint64_t inv, uint8_t* mem){
+  // multiplies the three-slot elements abc and ABC into r (slots of 128 bytes): three slot-wise products first, then one helper per output slot
+  uint8_t* prod0 = mem;		// a*A
+  uint8_t* prod1 = mem+128;	// b*B
+  uint8_t* prod2 = mem+256;	// c*C
+  uint8_t* scratch = mem+384;	// remaining work area, handed down to the helpers
+
+  f2m_mul(abc,ABC,prod0,modulus,inv,scratch);
+  f2m_mul(abc+128,ABC+128,prod1,modulus,inv,scratch);
+  f2m_mul(abc+256,ABC+256,prod2,modulus,inv,scratch);
+
+  f6m_mul_r2(abc,ABC,prod0,prod1,prod2,r+256,modulus,inv,scratch);
+  f6m_mul_r1(abc,ABC,prod0,prod1,prod2,r+128,modulus,inv,scratch);
+  f6m_mul_r0(abc,ABC,prod0,prod1,prod2,r,modulus,inv,scratch);
+}
+
+void test_f6m_mul(){
+  printf("f6m_mul test\n");	// runs f6m_mul on fixed inputs and reports the first limb that differs from the expected output
+  uint64_t buffer[] = {
+	  //a
+	          0x8f2990f3e598f5b1, 0xb8f480a3c388306b, 0xc023fac151c0104d, 0x13ec3aa181599402,
+	          0x72d1c8c528a1ce3b, 0xcaa280a8e735aa0d, 0x0000000000000000, 0x0000000000000000,
+		  0x992d7a27906d4cd5, 0x30b23a7e8c48c077, 0x8f8653fbc3332d63, 0xdb24339d8bc65d7e,
+		  0xe83b6e91c6550f5a, 0xceab102e88e91809, 0x0000000000000000, 0x0000000000000000,
+          //b
+	          0x7299907146816f08, 0xc4c6a394e91374ed, 0x6ff3618a57358cfb, 0x124ee6ab4c560e5c,
+	          0xac40700b41e2ee86, 0x74680728f0c5a618, 0x0000000000000000, 0x0000000000000000,
+		  0x0fd77f62b39eb952, 0xa0f8d21cec1f93b1, 0xd62dd7923aa86882, 0xddf7dd4d3532b0b7,
+		  0xede8f3fc89fa4a79, 0x574067e2d9a9d200, 0x0000000000000000, 0x0000000000000000,
+          //c
+	          0x7a69de46b13d8cb4, 0xc4833224aaf9ef7e, 0xa6a48975ab35c6e1, 0x23b8539ab84c381a,
+	          0x2533401a73c4e79f, 0x47d714899d01ac13, 0x0000000000000000, 0x0000000000000000,
+		  0xa9fa0b0d8156c36a, 0x1a9ddacb73ef278f, 0x4d149b560e88789f, 0x2bfeb9f708b6cc2f,
+		  0x988927bfe0186d5b, 0xf9cb40cb07f21b18, 0x0000000000000000, 0x0000000000000000,
+          //A
+	          0xecd347c808af644c, 0x7a3a971a556576f4, 0x34e302b6b490004f, 0xb418a4a7da330a67,
+	          0x43adeca931169b8b, 0x92e91df73ae1e115, 0x0000000000000000, 0x0000000000000000,
+		  0x12a2829e11e843d7, 0x64d5e3b80e75432d, 0x93f69b23ad79c38d, 0x43ebbc9bd2b17b9e,
+		  0x903033351357b036, 0x02624762e5ad360d, 0x0000000000000000, 0x0000000000000000,
+          //B
+	          0xd7f9857dce663301, 0xf393f9fac66f5c49, 0x168494e0d20797a6, 0xc4f96327ed4fa47d,
+	          0xd36d0078d217a712, 0x407d35046871d40f, 0x0000000000000000, 0x0000000000000000,
+	          0x2f1b767f6c1ec190, 0xeb76a0bce7906ad2, 0xe4a7548d03e8aa74, 0x5e34e1bf49d83ad6,
+		  0x4c04f57fb4d31039, 0xcb4cf01987fda213, 0x0000000000000000, 0x0000000000000000,
+          //C
+	          0x7b3f8da2f2ae4788, 0x5890b0d433a3eeed, 0x2f9f37cbcfc444e4, 0xf1d880390fcdb765,
+	          0x18d558857be01b2b, 0x10a8010bcdc6d606, 0x0000000000000000, 0x0000000000000000,
+		  0x319c02f6132c8a78, 0x6377868b5825ada9, 0xa5fe303e9ae3b03c, 0xe56e90734a17ce97,
+		  0x0c88b321012cf8da, 0xbb58211e3d50f610, 0x0000000000000000, 0x0000000000000000,
+          //r_0
+	          0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+          //r_1
+	          0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+          //r_2
+	          0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+     //bls12_mod
+                  0xabaafffffffffeb9, 0xffff53b1feffab1e, 0x24f6b0f6a0d23067, 0xbf1285f3844b7764,
+ 	          0xd7ac4b43b6a71b4b, 0x9ae67f39ea11011a, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+     //mem: zeroed scratch, 32 rows = 1024 bytes; the nested f2m_mul calls made by f6m_mul address scratch up to mem+1008
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                 };
+ 
+  uint8_t* a = (uint8_t*)buffer;
+  uint8_t* b = a+128;
+  uint8_t* c = b+128;
+  uint8_t* A = c+128;
+  uint8_t* B = A+128;
+  uint8_t* C = B+128;
+
+  uint8_t* r_0 = C+128;
+  uint8_t* r_1 = r_0+128;
+  uint8_t* r_2 = r_1+128;
+
+  uint8_t* bls12_mod = r_2+128;
+  uint64_t bls12_r_inv = 0x89f3fffcfffcfffd;
+  f6m_mul(a,A,r_0,bls12_mod,bls12_r_inv,bls12_mod+128);
+
+  uint64_t* output = (uint64_t*)r_0;
+  uint64_t expected[] = {
+           //r_0
+		  0xf4f3f4e0a35068ea, 0xac665aee2e71f682, 0xaecd20923b420023, 0xb6d5420ba01ea982,
+		  0x87c314107a998a65, 0x0ab3247ef39c920e, 0x0000000000000000, 0x0000000000000000,
+                  0x2c9620d993a22bad, 0xe623d165a9f4aa64, 0x8af87cb7292b7821, 0xc0fcd0adcd14ba65,
+		  0x5da54df2ad93262e, 0x24fc62bcd97e7208, 0x0000000000000000, 0x0000000000000000,
+	  //r_1
+	          0xead1838e6c5e1685, 0x43093c87eaeb576f, 0x940670026292dcb7, 0xa812600f4fb20a28,
+		  0x1be71ce1ef79f675, 0xe4a283b73906ca17, 0x0000000000000000, 0x0000000000000000,
+		  0x9c8b2c76405445b2, 0x0dd7635d562309f6, 0x9c2c87601d9055a5, 0xe10df2ea1d28237f,
+		  0xafd0d32f7e8c19d4, 0xcd5a3d1ef65b120b, 0x0000000000000000, 0x0000000000000000,
+	  //r_2
+	          0x40591ef0c74dbec9, 0x83b7bef145a87957, 0xc1e09049dbc85fbb, 0x3e9bb1174892ee83,
+		  0x294ef8c4a5954fff, 0xbff4ca6aca74c718, 0x0000000000000000, 0x0000000000000000,
+		  0x9b242b8f1c5d63bb, 0x525121bd68eda084, 0xab7e6d015052d5ad, 0xeb79ddb24091d2a8,
+		  0xe5b1da00212d0e6c, 0x11f01d2379011308, 0x0000000000000000, 0x0000000000000000,
+                };
+
+  for (int i=0; i<48; i++){
+    if (output[i]!=expected[i]){
+      printf("ERROR %i %llx %llx\n",i, (unsigned long long)output[i], (unsigned long long)expected[i]);	// %llx plus cast is correct whatever type uint64_t is
+      break;
+    }
+  }
+}
+
+
+void test_f2m_mul(){
+  printf("f2m_mul test\n");	// runs f2m_mul on fixed inputs and reports the first limb that differs from the expected output
+  uint64_t bls12_r_inv = 0x89f3fffcfffcfffd;
+  uint64_t buffer[] = {
+	  //bls12_mod
+                  0xabaafffffffffeb9, 0xffff53b1feffab1e, 0x24f6b0f6a0d23067, 0xbf1285f3844b7764,
+ 	          0xd7ac4b43b6a71b4b, 0x9ae67f39ea11011a, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+          //x
+	          0x8f2990f3e598f5b1, 0xb8f480a3c388306b, 0xc023fac151c0104d, 0x13ec3aa181599402,
+		  0x72d1c8c528a1ce3b, 0xcaa280a8e735aa0d, 0x0000000000000000, 0x0000000000000000,
+		  0x992d7a27906d4cd5, 0x30b23a7e8c48c077, 0x8f8653fbc3332d63, 0xdb24339d8bc65d7e,
+		  0xe83b6e91c6550f5a, 0xceab102e88e91809, 0x0000000000000000, 0x0000000000000000,
+          //y     
+	          0xecd347c808af644c, 0x7a3a971a556576f4, 0x34e302b6b490004f, 0xb418a4a7da330a67,
+		  0x43adeca931169b8b, 0x92e91df73ae1e115, 0x0000000000000000, 0x0000000000000000,
+		  0x12a2829e11e843d7, 0x64d5e3b80e75432d, 0x93f69b23ad79c38d, 0x43ebbc9bd2b17b9e,
+		  0x903033351357b036, 0x02624762e5ad360d, 0x0000000000000000, 0x0000000000000000,
+          //r     
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                  0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000,
+                };
+
+  uint8_t* bls12_mod = (uint8_t*)buffer;
+  uint8_t* x = bls12_mod+128;
+  uint8_t* y = x+128;
+  uint8_t* r = y+128;
+  uint64_t scratch[32] = {0};	// 256 zeroed bytes of work area for f2m_mul; r+128 is one past the end of buffer and must not be used
+  f2m_mul(x,y,r,bls12_mod,bls12_r_inv,(uint8_t*)scratch);
+
+  uint64_t* output = (uint64_t*)r;
+  uint64_t expected[] = {
+           //r
+		  0x1a984f235709ab39, 0x41e22b5e67d5ba89, 0x2ce9242e227c0c6b, 0xb38aa1ace4d4b64a,
+	          0xaba753d350d98f4c, 0x05570f525d67a901, 0x0000000000000000, 0x0000000000000000,
+		  0xb1297e4e9ca0c757, 0xdfe693ea0d2f5216, 0xdaeaa4ad06964e2f, 0x7c242200049d386d,
+		  0x860b25d4718a2c42, 0x40fb89c90abe4e10, 0x0000000000000000, 0x0000000000000000,
+                };
+
+  for (int i=0; i<16; i++){
+    if (output[i]!=expected[i]){
+      printf("ERROR %i %llx %llx\n",i, (unsigned long long)output[i], (unsigned long long)expected[i]);	// %llx plus cast is correct whatever type uint64_t is
+      break;
+    }
+  }
+
+
+}
+
+int main(int argc, char** argv){	// argc and argv are not used; runs both self-tests in order
+  test_f6m_mul();	// doesn't pass (original author's note)
+  test_f2m_mul();	// doesn't pass (original author's note)
+  return 0;	// the tests return void, so the exit status is 0 even if one of them printed an ERROR line
+}