From 0dae7b28e4f73d7700b3ef677fdb387b903bbd52 Mon Sep 17 00:00:00 2001 From: Jared Wasinger Date: Tue, 4 Aug 2020 20:38:26 -0700 Subject: [PATCH] add poemm C impl of f6m_mul --- src/{ => evm}/v1/benchmark.yul | 0 src/{ => evm}/v1/test.yul | 0 src/{ => evm}/v2/benchmark.yul | 0 src/{ => evm}/v2/test.yul | 0 src/native/CMakeLists.txt | 4 + src/native/bigint.h | 528 +++++++++++++++++++++++++++++++++ src/native/f6m_mul.c | 313 +++++++++++++++++++ 7 files changed, 845 insertions(+) rename src/{ => evm}/v1/benchmark.yul (100%) rename src/{ => evm}/v1/test.yul (100%) rename src/{ => evm}/v2/benchmark.yul (100%) rename src/{ => evm}/v2/test.yul (100%) create mode 100644 src/native/CMakeLists.txt create mode 100644 src/native/bigint.h create mode 100644 src/native/f6m_mul.c diff --git a/src/v1/benchmark.yul b/src/evm/v1/benchmark.yul similarity index 100% rename from src/v1/benchmark.yul rename to src/evm/v1/benchmark.yul diff --git a/src/v1/test.yul b/src/evm/v1/test.yul similarity index 100% rename from src/v1/test.yul rename to src/evm/v1/test.yul diff --git a/src/v2/benchmark.yul b/src/evm/v2/benchmark.yul similarity index 100% rename from src/v2/benchmark.yul rename to src/evm/v2/benchmark.yul diff --git a/src/v2/test.yul b/src/evm/v2/test.yul similarity index 100% rename from src/v2/test.yul rename to src/evm/v2/test.yul diff --git a/src/native/CMakeLists.txt b/src/native/CMakeLists.txt new file mode 100644 index 0000000..6ad999a --- /dev/null +++ b/src/native/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required (VERSION 3.0) +project (f6m_mul_native) + +add_executable(f6m_mul_native f6m_mul.c) diff --git a/src/native/bigint.h b/src/native/bigint.h new file mode 100644 index 0000000..8d1690a --- /dev/null +++ b/src/native/bigint.h @@ -0,0 +1,528 @@ +#ifndef BIGINT_H +#define BIGINT_H + +#if !WASM +#include +#else +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef unsigned 
__int128 uint128_t; +#endif + +#define uint128_t __uint128_t + +/* +This header-only library has three parameters: +BIGINT_BITS - the number of bits of the big integer +LIMB_BITS - the number of bits in each limb, must correspond to a uint*_t type +LIMB_BITS_OVERFLOW - the number of bits output by multiplication, i.e. 2*LIMB_BITS, must correspond to a uint*_t type + +To use this library, define a limb size and include it: + #define BIGINT_BITS 256 + #define LIMB_BITS 64 + #define LIMB_BITS_OVERFLOW 128 + #include "bigint.h" + #undef BIGINT_BITS + #undef LIMB_BITS + #undef LIMB_BITS_OVERFLOW + +And if you need other sizes, #undef BIGINT_H first (otherwise the include guard turns the second #include into a no-op) and define them: + #define BIGINT_BITS 512 + #define LIMB_BITS 32 + #define LIMB_BITS_OVERFLOW 64 + #include "bigint.h" + #undef BIGINT_BITS + #undef LIMB_BITS + #undef LIMB_BITS_OVERFLOW + +Now you can use functions like: + mulmodmont256_64bitlimbs(out,x,y,m,inv); + sub512_32bitlimbs(out,a,b); + +Warning: LIMB_BITS corresponds to the uint*_t type, and multiplication requires double the bits, for example 64-bit limbs require type uint128_t, which may be unavailable on some targets like Wasm. +*/ + + +// Define macros used in this file: + +// define types UINT and UINT2, where UINT2 is for overflow of operations on UINT; for multiplication it should be double the number of bits +// UINT is the limb type, uint*_t where * is the number of bits per limb, eg uint32_t +// UINT2 is also needed for multiplication UINTxUINT->UINT2, e.g. 
uint32_txuint32_t->uint64_t or uint64_txuint64_t->uint128_t +#define TYPE_(num) uint##num##_t +#define TYPE(num) TYPE_(num) +#define UINT TYPE(LIMB_BITS) +#define UINT2 TYPE(LIMB_BITS_OVERFLOW) + +// define NUM_LIMBS to be the number of limbs +// eg UINT=uint32_t with NUM_LIMBS=8 limbs is for 256-bit +// eg UINT=uint64_t with NUM_LIMBS=8 limbs is for 512-bit +#define NUM_LIMBS (BIGINT_BITS/LIMB_BITS) + +// define the function name, use concatenation +// eg BIGINT_BITS=256, LIMB_BITS=64: FUNCNAME(myname) is replaced with myname256_64bitlimbs +#define FUNCNAME__(name,A,B) name##A##_##B##bitlimbs +#define FUNCNAME_(name,A,B) FUNCNAME__(name,A,B) +#define FUNCNAME(name) FUNCNAME_(name,BIGINT_BITS,LIMB_BITS) + + + +// add two numbers using two's complement for overflow, returning an overflow bit +// algorithm 14.7, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf +// except we ignore the final carry in step 3 since we assume that there is no extra limb +UINT FUNCNAME(add)(UINT* const out, const UINT* const x, const UINT* const y){ + UINT c=0; + #pragma unroll + for (int i=0; i=y, but actually it computes the 2's complement for x=0;i--){ + if (x[i]>y[i]) + return 0; + else if (x[i]=0;i--){ + if (x[i]>y[i]) + return 0; + else if (x[i]=0;i--){ + if (x[i] != 0) { n=i; break; } + } + for (int i=n;i>=0;i--){ + if (y[i] != 0) { t=n-i; break; } + } + + // not in the textbook + // special case for y=1, this hack is needed for now + if( n-t==0 && y[0]==1 ){ + for (int i=0;it;i--) + y_n_t[i] = 0; + for (int i=t;i>n-t;i--) + y_n_t[i] = y[i+n-t]; + for (int i=n-t;i>=0;i--) + y_n_t[i] = 0; + // now the while subtract loop + while (FUNCNAME(less_than_or_equal)(y_n_t,x)){ + q[n-t]+=1; + FUNCNAME(sub(x_,x,y_n_t)); + } + + // step 3 in book + // TODO + */ + + + + // THIS IS A NAIVE IMPLEMENTATION OF WHAT IS IN THE BOOK + // naive loop: while( y=0;i--){ + if (y[i]>x_[i]){ leq = 0; break;} + else if (y[i]=0;i--){ + if (y[i]>x_[i]){ leq = 0; break;} + else if 
(y[i]> LIMB_BITS; + UINT v = (UINT)uv; + w[i+j] = v; + c = (UINT)u; + } + w[i+NUM_LIMBS] = c; + } + for (int i=0; i< 2*NUM_LIMBS; i++) + out[i]=w[i]; +} + +// algorithm 14.16, Handbook of Applied Cryptography, http://cacr.uwaterloo.ca/hac/about/chap14.pdf +// NUM_LIMBS is t (number of limbs) in the book, and the base is UINT*, usually uint32_t or uint64_t +// output out should have double the limbs of input x +void FUNCNAME(square)(UINT* const out, const UINT* const x){ + UINT w[NUM_LIMBS*2]; + for (int i=0; i< 2*NUM_LIMBS; i++) + w[i]=0; + for (int i=0; i> LIMB_BITS); // / base + UINT v = (UINT)uv; // % base + w[2*i] = v; + UINT c = u; + for (int j=i+1; j> LIMB_BITS); // / base + v = (UINT)uv; // % base + w[i+j] = v; + c = u; + // may have more overflow, so keep carrying + // this passes sume tests, but needs review + if (uv=m + if (carry || FUNCNAME(less_than_or_equal)(m,out)){ + FUNCNAME(sub)(out, out, m); + } + // note: we don't consider the case x+y-m>m. Because, for our crypto application, we assume x,ym. Because, for our crypto application, we assume x,y> LIMB_BITS); // / b + } + // carry may be nonzero, so keep carrying + int k=0; + while (carry && i+j+k<2*NUM_LIMBS+1){ + UINT2 sum = (UINT2)(A[i+j+k])+carry; + A[i+j+k] = (UINT)sum; // % b + carry = (UINT)(sum >> LIMB_BITS); // / b + k+=1; + } + } + + // instead of right shift, we just get the correct values + for (int i=0; i>LIMB_BITS); + // if there was overflow in the sum beyond the carry: + if (sum0 || FUNCNAME(less_than_or_equal)(m,out)) + FUNCNAME(sub)(out, out, m); +} + +// From paper Çetin K. Koç; Tolga Acar; Burton S. Kaliski, Jr. (June 1996). "Analyzing and Comparing Montgomery Multiplication Algorithms". IEEE Micro. 16 (3): 26–33. 
+void FUNCNAME(mulmodmontFIOS)(UINT* const out, const UINT* const a, const UINT* const b, const UINT* const mod, const UINT inv){ + UINT t[NUM_LIMBS+2]; + for (int i=0;i>LIMB_BITS); + int k=1; + while (carry && k<=NUM_LIMBS+1){ + UINT2 temp = (UINT2)t[k] + carry; + t[k]=(UINT)temp; + carry = (UINT)(temp >> LIMB_BITS); + k++; + } + UINT m = ((UINT)sum)*inv; + sum = (UINT)sum + (UINT2)m*mod[0]; // lower limb of sum should be zero + carry = (UINT)(sum >> LIMB_BITS); + #pragma unroll + for (int j=1; j> LIMB_BITS); + k=j+1; + while (carry && k<=NUM_LIMBS+1){ + UINT2 temp = (UINT2)t[k] + carry; + t[k]=(UINT)temp; + carry = (UINT)(temp >> LIMB_BITS); + k++; + } + sum = (UINT)sum + (UINT2)m*mod[j]; + carry = (UINT)(sum>>LIMB_BITS); + t[j-1] = (UINT)sum; + } + sum = (UINT2)t[NUM_LIMBS] + carry; + carry = (UINT)(sum >> LIMB_BITS); + t[NUM_LIMBS-1] = (UINT)sum; + t[NUM_LIMBS] = t[NUM_LIMBS+1]+carry; + t[NUM_LIMBS+1] = 0; + } + + // output correct values + for (int i=0; i0 || FUNCNAME(less_than_or_equal)(mod,out)) + FUNCNAME(sub)(out, out, mod); +} + +// see description for mulmodmontCIOS +void FUNCNAME(mulmodmont)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){ + UINT A[NUM_LIMBS+2]; + for (int i=0;i>LIMB_BITS); + A[j] = (UINT)sum; + } + sum = (UINT2)(A[NUM_LIMBS]) + carry; + carry = (UINT)(sum>>LIMB_BITS); + A[NUM_LIMBS] = (UINT) sum; + A[NUM_LIMBS+1] = carry; + UINT A0inv = A[0]*inv; + sum = (UINT2)(A[0]) + (UINT2)A0inv*m[0]; + carry = (UINT)(sum>>LIMB_BITS); + #pragma unroll + for (int j=1; j>LIMB_BITS); + A[j-1] = (UINT)sum; + } + sum = (UINT2)(A[NUM_LIMBS])+carry; + carry = (UINT)(sum>>LIMB_BITS); + A[NUM_LIMBS-1]=(UINT)sum; + A[NUM_LIMBS]=A[NUM_LIMBS+1]+carry; + } + + // copy to out + for (int i=0; i0 || FUNCNAME(less_than_or_equal)(m,out)) + FUNCNAME(sub)(out, out, m); +} + +// Uses CIOS method for montgomery multiplication, based on algorithm from (but using notation of above mulmodmont) Çetin K. 
Koç; Tolga Acar; Burton S. Kaliski, Jr. (June 1996). "Analyzing and Comparing Montgomery Multiplication Algorithms". IEEE Micro. 16 (3): 26–33. +// Known as the Coarsely Integrated Operand Scanning (CIOS) +void FUNCNAME(mulmodmontCIOS)(UINT* const out, const UINT* const x, const UINT* const y, const UINT* const m, const UINT inv){ + FUNCNAME(mulmodmont)(out, x, y, m, inv); +} + +// like mulmodmont, but with two of the args hard-coded +void FUNCNAME(mulmodmont_3args_)(UINT* const out, const UINT* const x, const UINT* const y){ + UINT* m = (UINT*)4444444; // hard-code m or address to m here + UINT inv = 6666666; // hard-code inv here + FUNCNAME(mulmodmont)(out, x, y, m, inv); +} + +#endif diff --git a/src/native/f6m_mul.c b/src/native/f6m_mul.c new file mode 100644 index 0000000..e8915c8 --- /dev/null +++ b/src/native/f6m_mul.c @@ -0,0 +1,313 @@ +/* +This is a direct translation of: https://github.com/ewasm/evm384_f6m_mul/blob/master/src/v2/test.yul + +To execute: +git clone https://github.com/poemm/bigint_experiments.git +cd bigint_experiments +git clone https://gist.github.com/4ad8279ea1693c13a16b134970d67101.git f6m_mul_gist +cd f6m_mul_gist +gcc f6m_mul.c -o f6m_mul -O4 -march=native +./f6m_mul +*/ + + +#include + +#define BIGINT_BITS 384 +#define LIMB_BITS 64 +#define LIMB_BITS_OVERFLOW 128 +#include "bigint.h" + +void mulNR2(uint8_t* x0, uint8_t* x1, uint8_t* r0, uint8_t* r1, uint8_t* modulus){ + //printf("mulNR2()\n"); + FUNCNAME(submod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)x1,(uint64_t*)modulus); + FUNCNAME(addmod)((uint64_t*)r1,(uint64_t*)x0,(uint64_t*)x1,(uint64_t*)modulus); +} + +void f2m_add(uint8_t* x0, uint8_t* x1, uint8_t* y0, uint8_t* y1, uint8_t* r0, uint8_t* r1, uint8_t* modulus, uint8_t* arena){ + //printf("f2m_add()\n"); + FUNCNAME(addmod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)y0,(uint64_t*)modulus); + FUNCNAME(addmod)((uint64_t*)r1,(uint64_t*)x1,(uint64_t*)y1,(uint64_t*)modulus); +} + +void f2m_sub(uint8_t* x0, uint8_t* x1, uint8_t* y0, 
uint8_t* y1, uint8_t* r0, uint8_t* r1, uint8_t* modulus, uint8_t* arena){ + //printf("f2m_sub()\n"); + FUNCNAME(submod)((uint64_t*)r0,(uint64_t*)x0,(uint64_t*)y0,(uint64_t*)modulus); + FUNCNAME(submod)((uint64_t*)r1,(uint64_t*)x1,(uint64_t*)y1,(uint64_t*)modulus); +} + +void f2m_mul(uint8_t* x, uint8_t* y, uint8_t* r, uint8_t* modulus, uint64_t inv, uint8_t* mem){ + //printf("f2m_mul()\n"); + uint8_t* tmp = mem+64; + uint8_t* tmp2 = tmp+64; + uint8_t* zero = tmp2+64; + + FUNCNAME(mulmodmont)((uint64_t*)tmp2,(uint64_t*)(x+64),(uint64_t*)(y+64),(uint64_t*)modulus,inv); + FUNCNAME(submod)((uint64_t*)r,(uint64_t*)zero,(uint64_t*)tmp2,(uint64_t*)modulus); + FUNCNAME(mulmodmont)((uint64_t*)tmp,(uint64_t*)x,(uint64_t*)y,(uint64_t*)modulus,inv); + FUNCNAME(addmod)((uint64_t*)r,(uint64_t*)r,(uint64_t*)tmp,(uint64_t*)modulus); + + //FUNCNAME(mulmodmont)((uint64_t*)tmp2,(uint64_t*)(x+64),(uint64_t*)(y+64),(uint64_t*)modulus,inv); // why is this repeated from above? + FUNCNAME(addmod)((uint64_t*)tmp2,(uint64_t*)tmp,(uint64_t*)tmp2,(uint64_t*)modulus); + FUNCNAME(addmod)((uint64_t*)tmp,(uint64_t*)y,(uint64_t*)(y+64),(uint64_t*)modulus); + FUNCNAME(addmod)((uint64_t*)(r+64),(uint64_t*)x,(uint64_t*)(x+64),(uint64_t*)modulus); + FUNCNAME(mulmodmont)((uint64_t*)(r+64),(uint64_t*)(r+64),(uint64_t*)tmp,(uint64_t*)modulus,inv); + FUNCNAME(submod)((uint64_t*)(r+64),(uint64_t*)(r+64),(uint64_t*)tmp2,(uint64_t*)modulus); + +} + +void f6m_mul_r2(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r2, uint8_t* modulus, uint64_t inv, uint8_t* mem){ + //printf("f6m_mul_r2()\n"); + uint8_t* tmp1 = mem; + uint8_t* tmp2 = mem+128; + uint8_t* tmp3 = tmp2+128; + uint8_t* arena = tmp3+128; + + f2m_add(abc,abc+64,abc+256,abc+320,tmp1,tmp1+64,modulus,arena); + f2m_add(ABC,ABC+64,ABC+256,ABC+320,tmp2,tmp2+64,modulus,arena); + + f2m_mul(tmp1,tmp2,tmp3,modulus,inv,arena); + f2m_add(aA,aA+64,cC,cC+64,tmp1,tmp1+64,modulus,arena); + 
f2m_sub(tmp3,tmp3+64,tmp1,tmp1+64,tmp2,tmp2+64,modulus,arena); + f2m_add(tmp2,tmp2+64,bB,bB+64,r2,r2+64,modulus,arena); +} + +void f6m_mul_r1(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r1, uint8_t* modulus, uint64_t inv, uint8_t* mem){ + //printf("f6m_mul_r1()\n"); + uint8_t* tmp1 = mem; + uint8_t* tmp2 = mem+128; + uint8_t* tmp3 = tmp2+128; + uint8_t* arena = tmp3+128; + + f2m_add(abc,abc+64,abc+128,abc+192,tmp1,tmp1+64,modulus,arena); + f2m_add(ABC,ABC+64,ABC+128,ABC+192,tmp2,tmp2+64,modulus,arena); + + f2m_mul(tmp2,tmp1,tmp3,modulus,inv,arena); + f2m_add(aA,aA+64,bB,bB+64,tmp1,tmp1+64,modulus,arena); + f2m_sub(tmp3,tmp3+64,tmp1,tmp1+64,tmp2,tmp2+64,modulus,arena); + mulNR2(cC,cC+64,tmp1,tmp1+64,modulus); + f2m_add(tmp2,tmp2+64,tmp1,tmp1+64,r1,r1+64,modulus,arena); + +} + +void f6m_mul_r0(uint8_t* abc, uint8_t* ABC, uint8_t* aA, uint8_t* bB, uint8_t* cC, uint8_t* r0, uint8_t* modulus, uint64_t inv, uint8_t* mem){ + //printf("f6m_mul_r0()\n"); + uint8_t* tmp1 = mem; + uint8_t* tmp2 = mem+128; + uint8_t* tmp3 = tmp2+128; + uint8_t* arena = tmp3+128; + + f2m_add(abc+128,abc+192,abc+256,abc+320,tmp1,tmp1+64,modulus,arena); + f2m_add(ABC+128,ABC+192,ABC+256,ABC+320,tmp2,tmp2+64,modulus,arena); + + f2m_mul(tmp1,tmp2,tmp3,modulus,inv,arena); + f2m_add(bB,bB+64,cC,cC+64,tmp1,tmp1+64,modulus,arena); + f2m_sub(tmp3,tmp3+64,tmp1,tmp1+64,tmp2,tmp2+64,modulus,arena); + mulNR2(tmp2,tmp2+64,tmp3,tmp3+64,modulus); + f2m_add(tmp3,tmp3+64,aA,aA+64,r0,r0+64,modulus,arena); +} + +void f6m_mul(uint8_t* abc, uint8_t* ABC, uint8_t* r, uint8_t* modulus, uint64_t inv, uint8_t* mem){ + //printf("f6m_mul()\n"); + uint8_t* aA = mem; + uint8_t* bB = aA+128; + uint8_t* cC = bB+128; + uint8_t* arena = cC+128; + + f2m_mul(abc,ABC,aA,modulus,inv,arena); + f2m_mul(abc+128,ABC+128,bB,modulus,inv,arena); + f2m_mul(abc+256,ABC+256,cC,modulus,inv,arena); + + f6m_mul_r2(abc,ABC,aA,bB,cC,r+256,modulus,inv,arena); + f6m_mul_r1(abc,ABC,aA,bB,cC,r+128,modulus,inv,arena); + 
f6m_mul_r0(abc,ABC,aA,bB,cC,r,modulus,inv,arena); +} + +void test_f6m_mul(){ + printf("f6m_mul test\n"); + uint64_t buffer[] = { + //a + 0x8f2990f3e598f5b1, 0xb8f480a3c388306b, 0xc023fac151c0104d, 0x13ec3aa181599402, + 0x72d1c8c528a1ce3b, 0xcaa280a8e735aa0d, 0x0000000000000000, 0x0000000000000000, + 0x992d7a27906d4cd5, 0x30b23a7e8c48c077, 0x8f8653fbc3332d63, 0xdb24339d8bc65d7e, + 0xe83b6e91c6550f5a, 0xceab102e88e91809, 0x0000000000000000, 0x0000000000000000, + //b + 0x7299907146816f08, 0xc4c6a394e91374ed, 0x6ff3618a57358cfb, 0x124ee6ab4c560e5c, + 0xac40700b41e2ee86, 0x74680728f0c5a618, 0x0000000000000000, 0x0000000000000000, + 0x0fd77f62b39eb952, 0xa0f8d21cec1f93b1, 0xd62dd7923aa86882, 0xddf7dd4d3532b0b7, + 0xede8f3fc89fa4a79, 0x574067e2d9a9d200, 0x0000000000000000, 0x0000000000000000, + //c + 0x7a69de46b13d8cb4, 0xc4833224aaf9ef7e, 0xa6a48975ab35c6e1, 0x23b8539ab84c381a, + 0x2533401a73c4e79f, 0x47d714899d01ac13, 0x0000000000000000, 0x0000000000000000, + 0xa9fa0b0d8156c36a, 0x1a9ddacb73ef278f, 0x4d149b560e88789f, 0x2bfeb9f708b6cc2f, + 0x988927bfe0186d5b, 0xf9cb40cb07f21b18, 0x0000000000000000, 0x0000000000000000, + //A + 0xecd347c808af644c, 0x7a3a971a556576f4, 0x34e302b6b490004f, 0xb418a4a7da330a67, + 0x43adeca931169b8b, 0x92e91df73ae1e115, 0x0000000000000000, 0x0000000000000000, + 0x12a2829e11e843d7, 0x64d5e3b80e75432d, 0x93f69b23ad79c38d, 0x43ebbc9bd2b17b9e, + 0x903033351357b036, 0x02624762e5ad360d, 0x0000000000000000, 0x0000000000000000, + //B + 0xd7f9857dce663301, 0xf393f9fac66f5c49, 0x168494e0d20797a6, 0xc4f96327ed4fa47d, + 0xd36d0078d217a712, 0x407d35046871d40f, 0x0000000000000000, 0x0000000000000000, + 0x2f1b767f6c1ec190, 0xeb76a0bce7906ad2, 0xe4a7548d03e8aa74, 0x5e34e1bf49d83ad6, + 0x4c04f57fb4d31039, 0xcb4cf01987fda213, 0x0000000000000000, 0x0000000000000000, + //C + 0x7b3f8da2f2ae4788, 0x5890b0d433a3eeed, 0x2f9f37cbcfc444e4, 0xf1d880390fcdb765, + 0x18d558857be01b2b, 0x10a8010bcdc6d606, 0x0000000000000000, 0x0000000000000000, + 0x319c02f6132c8a78, 
0x6377868b5825ada9, 0xa5fe303e9ae3b03c, 0xe56e90734a17ce97, + 0x0c88b321012cf8da, 0xbb58211e3d50f610, 0x0000000000000000, 0x0000000000000000, + //r_0 + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + //r_1 + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + //r_2 + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + //bls12_mod + 0xabaafffffffffeb9, 0xffff53b1feffab1e, 0x24f6b0f6a0d23067, 0xbf1285f3844b7764, + 0xd7ac4b43b6a71b4b, 0x9ae67f39ea11011a, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + //mem + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + }; + + uint8_t* a = 
(uint8_t*)buffer; + uint8_t* b = a+128; + uint8_t* c = b+128; + uint8_t* A = c+128; + uint8_t* B = A+128; + uint8_t* C = B+128; + + uint8_t* r_0 = C+128; + uint8_t* r_1 = r_0+128; + uint8_t* r_2 = r_1+128; + + uint8_t* bls12_mod = r_2+128; + uint64_t bls12_r_inv = 0x89f3fffcfffcfffd; + + f6m_mul(a,A,r_0,bls12_mod,bls12_r_inv,bls12_mod+128); + + uint64_t* output = (uint64_t*)r_0; + uint64_t expected[] = { + //r_0 + 0xf4f3f4e0a35068ea, 0xac665aee2e71f682, 0xaecd20923b420023, 0xb6d5420ba01ea982, + 0x87c314107a998a65, 0x0ab3247ef39c920e, 0x0000000000000000, 0x0000000000000000, + 0x2c9620d993a22bad, 0xe623d165a9f4aa64, 0x8af87cb7292b7821, 0xc0fcd0adcd14ba65, + 0x5da54df2ad93262e, 0x24fc62bcd97e7208, 0x0000000000000000, 0x0000000000000000, + //r_1 + 0xead1838e6c5e1685, 0x43093c87eaeb576f, 0x940670026292dcb7, 0xa812600f4fb20a28, + 0x1be71ce1ef79f675, 0xe4a283b73906ca17, 0x0000000000000000, 0x0000000000000000, + 0x9c8b2c76405445b2, 0x0dd7635d562309f6, 0x9c2c87601d9055a5, 0xe10df2ea1d28237f, + 0xafd0d32f7e8c19d4, 0xcd5a3d1ef65b120b, 0x0000000000000000, 0x0000000000000000, + //r_2 + 0x40591ef0c74dbec9, 0x83b7bef145a87957, 0xc1e09049dbc85fbb, 0x3e9bb1174892ee83, + 0x294ef8c4a5954fff, 0xbff4ca6aca74c718, 0x0000000000000000, 0x0000000000000000, + 0x9b242b8f1c5d63bb, 0x525121bd68eda084, 0xab7e6d015052d5ad, 0xeb79ddb24091d2a8, + 0xe5b1da00212d0e6c, 0x11f01d2379011308, 0x0000000000000000, 0x0000000000000000, + }; + + for (int i=0; i<48; i++){ + if (output[i]!=expected[i]){ + printf("ERROR %i %lx %lx\n",i, output[i], expected[i]); + break; + } + } + +} + + +void test_f2m_mul(){ + printf("f2m_mul test\n"); + uint64_t bls12_r_inv = 0x89f3fffcfffcfffd; + uint64_t buffer[] = { + //bls12_mod + 0xabaafffffffffeb9, 0xffff53b1feffab1e, 0x24f6b0f6a0d23067, 0xbf1285f3844b7764, + 0xd7ac4b43b6a71b4b, 0x9ae67f39ea11011a, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 
0x0000000000000000, 0x0000000000000000, + //x + 0x8f2990f3e598f5b1, 0xb8f480a3c388306b, 0xc023fac151c0104d, 0x13ec3aa181599402, + 0x72d1c8c528a1ce3b, 0xcaa280a8e735aa0d, 0x0000000000000000, 0x0000000000000000, + 0x992d7a27906d4cd5, 0x30b23a7e8c48c077, 0x8f8653fbc3332d63, 0xdb24339d8bc65d7e, + 0xe83b6e91c6550f5a, 0xceab102e88e91809, 0x0000000000000000, 0x0000000000000000, + //y + 0xecd347c808af644c, 0x7a3a971a556576f4, 0x34e302b6b490004f, 0xb418a4a7da330a67, + 0x43adeca931169b8b, 0x92e91df73ae1e115, 0x0000000000000000, 0x0000000000000000, + 0x12a2829e11e843d7, 0x64d5e3b80e75432d, 0x93f69b23ad79c38d, 0x43ebbc9bd2b17b9e, + 0x903033351357b036, 0x02624762e5ad360d, 0x0000000000000000, 0x0000000000000000, + //r + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + }; + + uint8_t* bls12_mod = (uint8_t*)buffer; + uint8_t* x = bls12_mod+128; + uint8_t* y = x+128; + uint8_t* r = y+128; + + f2m_mul(x,y,r,bls12_mod,bls12_r_inv,r+128); + + uint64_t* output = (uint64_t*)r; + uint64_t expected[] = { + //r + 0x1a984f235709ab39, 0x41e22b5e67d5ba89, 0x2ce9242e227c0c6b, 0xb38aa1ace4d4b64a, + 0xaba753d350d98f4c, 0x05570f525d67a901, 0x0000000000000000, 0x0000000000000000, + 0xb1297e4e9ca0c757, 0xdfe693ea0d2f5216, 0xdaeaa4ad06964e2f, 0x7c242200049d386d, + 0x860b25d4718a2c42, 0x40fb89c90abe4e10, 0x0000000000000000, 0x0000000000000000, + }; + + for (int i=0; i<16; i++){ + if (output[i]!=expected[i]){ + printf("ERROR %i %lx %lx\n",i, output[i], expected[i]); + break; + } + } + + +} + +int main(int argc, char** argv){ + test_f6m_mul(); // doesn't pass + test_f2m_mul(); // doesn't pass + return 0; +}