From 943c07066e737a935b3b7f345610cdb5077d7633 Mon Sep 17 00:00:00 2001 From: John Doe Date: Fri, 9 Sep 2022 02:47:49 -0400 Subject: [PATCH] initial stuff --- .vscode/settings.json | 13 + assembly.o | Bin 0 -> 43592 bytes blindsig.c | 182 ++ blst/aggregate.c | 674 +++++ blst/asm/add_mod_256-armv8.pl | 412 +++ blst/asm/add_mod_256-x86_64.pl | 547 ++++ blst/asm/add_mod_384-armv8.pl | 872 ++++++ blst/asm/add_mod_384-x86_64.pl | 1430 +++++++++ blst/asm/add_mod_384x384-x86_64.pl | 260 ++ blst/asm/arm-xlate.pl | 381 +++ blst/asm/ct_inverse_mod_256-armv8.pl | 586 ++++ blst/asm/ct_inverse_mod_256-x86_64.pl | 837 ++++++ blst/asm/ct_inverse_mod_384-armv8.pl | 610 ++++ blst/asm/ct_is_square_mod_384-armv8.pl | 398 +++ blst/asm/ct_is_square_mod_384-x86_64.pl | 494 ++++ blst/asm/ctq_inverse_mod_384-x86_64.pl | 886 ++++++ blst/asm/ctx_inverse_mod_384-x86_64.pl | 995 +++++++ blst/asm/div3w-armv8.pl | 122 + blst/asm/div3w-x86_64.pl | 184 ++ blst/asm/mul_mont_256-armv8.pl | 409 +++ blst/asm/mul_mont_384-armv8.pl | 2015 +++++++++++++ blst/asm/mulq_mont_256-x86_64.pl | 513 ++++ blst/asm/mulq_mont_384-x86_64.pl | 2675 +++++++++++++++++ blst/asm/mulx_mont_256-x86_64.pl | 486 +++ blst/asm/mulx_mont_384-x86_64.pl | 2384 +++++++++++++++ blst/asm/sha256-armv8.pl | 541 ++++ blst/asm/sha256-portable-x86_64.pl | 337 +++ blst/asm/sha256-x86_64.pl | 789 +++++ blst/asm/x86_64-xlate.pl | 1781 +++++++++++ blst/assembly.S | 123 + blst/blst.h | 480 +++ blst/blst_aux.h | 79 + blst/bulk_addition.c | 168 ++ blst/client_min_pk.c | 17 + blst/client_min_sig.c | 17 + blst/consts.c | 36 + blst/consts.h | 30 + blst/e1.c | 558 ++++ blst/e2.c | 632 ++++ blst/ec_mult.h | 289 ++ blst/ec_ops.h | 787 +++++ blst/elf/add_mod_256-armv8.S | 379 +++ blst/elf/add_mod_256-x86_64.s | 572 ++++ blst/elf/add_mod_384-armv8.S | 931 ++++++ blst/elf/add_mod_384-x86_64.s | 1809 +++++++++++ blst/elf/add_mod_384x384-x86_64.s | 252 ++ blst/elf/ct_inverse_mod_256-armv8.S | 784 +++++ blst/elf/ct_inverse_mod_256-x86_64.s | 1185 ++++++++ blst/elf/ct_inverse_mod_384-armv8.S | 717 +++++ blst/elf/ct_is_square_mod_384-armv8.S | 324 ++ blst/elf/ct_is_square_mod_384-x86_64.s | 479 +++ blst/elf/ctq_inverse_mod_384-x86_64.s | 1195 ++++++++ blst/elf/ctx_inverse_mod_384-x86_64.s | 1574 ++++++++++ blst/elf/div3w-armv8.S | 88 + blst/elf/div3w-x86_64.s | 123 + blst/elf/mul_mont_256-armv8.S | 464 +++ blst/elf/mul_mont_384-armv8.S | 2372 +++++++++++++++ blst/elf/mulq_mont_256-x86_64.s | 714 +++++ blst/elf/mulq_mont_384-x86_64.s | 3620 +++++++++++++++++++++++ blst/elf/mulx_mont_256-x86_64.s | 627 ++++ blst/elf/mulx_mont_384-x86_64.s | 2968 +++++++++++++++++++ blst/elf/sha256-armv8.S | 1077 +++++++ blst/elf/sha256-portable-x86_64.s | 1754 +++++++++++ blst/elf/sha256-x86_64.s | 1446 +++++++++ blst/errors.h | 19 + blst/exp.c | 55 + blst/exports.c | 584 ++++ blst/fields.h | 211 ++ blst/fp12_tower.c | 771 +++++ blst/hash_to_field.c | 176 ++ blst/keygen.c | 182 ++ blst/map_to_g1.c | 559 ++++ blst/map_to_g2.c | 444 +++ blst/multi_scalar.c | 414 +++ blst/no_asm.h | 1287 ++++++++ blst/pairing.c | 443 +++ blst/point.h | 61 + blst/rb_tree.c | 145 + blst/recip-addchain.h | 489 +++ blst/recip.c | 139 + blst/server.c | 24 + blst/sha256.h | 140 + blst/sqrt-addchain.h | 489 +++ blst/sqrt.c | 261 ++ blst/vect.c | 176 ++ blst/vect.h | 483 +++ build.sh | 11 + ctm | Bin 0 -> 190000 bytes ctm.c | 404 +++ debugprint.c | 30 + debugprint.h | 13 + fstoken.c | 25 + fstoken.h | 20 + libblst.a | Bin 0 -> 228564 bytes main.c | 114 + nonblind.c | 131 + server.o | Bin 0 -> 178592 bytes set_token_path.sh | 3 + 
test | Bin 0 -> 180488 bytes 99 files changed, 58786 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 assembly.o create mode 100644 blindsig.c create mode 100644 blst/aggregate.c create mode 100755 blst/asm/add_mod_256-armv8.pl create mode 100755 blst/asm/add_mod_256-x86_64.pl create mode 100755 blst/asm/add_mod_384-armv8.pl create mode 100755 blst/asm/add_mod_384-x86_64.pl create mode 100755 blst/asm/add_mod_384x384-x86_64.pl create mode 100755 blst/asm/arm-xlate.pl create mode 100755 blst/asm/ct_inverse_mod_256-armv8.pl create mode 100755 blst/asm/ct_inverse_mod_256-x86_64.pl create mode 100755 blst/asm/ct_inverse_mod_384-armv8.pl create mode 100755 blst/asm/ct_is_square_mod_384-armv8.pl create mode 100755 blst/asm/ct_is_square_mod_384-x86_64.pl create mode 100755 blst/asm/ctq_inverse_mod_384-x86_64.pl create mode 100755 blst/asm/ctx_inverse_mod_384-x86_64.pl create mode 100755 blst/asm/div3w-armv8.pl create mode 100755 blst/asm/div3w-x86_64.pl create mode 100755 blst/asm/mul_mont_256-armv8.pl create mode 100755 blst/asm/mul_mont_384-armv8.pl create mode 100755 blst/asm/mulq_mont_256-x86_64.pl create mode 100755 blst/asm/mulq_mont_384-x86_64.pl create mode 100755 blst/asm/mulx_mont_256-x86_64.pl create mode 100755 blst/asm/mulx_mont_384-x86_64.pl create mode 100755 blst/asm/sha256-armv8.pl create mode 100755 blst/asm/sha256-portable-x86_64.pl create mode 100755 blst/asm/sha256-x86_64.pl create mode 100755 blst/asm/x86_64-xlate.pl create mode 100644 blst/assembly.S create mode 100644 blst/blst.h create mode 100644 blst/blst_aux.h create mode 100644 blst/bulk_addition.c create mode 100644 blst/client_min_pk.c create mode 100644 blst/client_min_sig.c create mode 100644 blst/consts.c create mode 100644 blst/consts.h create mode 100644 blst/e1.c create mode 100644 blst/e2.c create mode 100644 blst/ec_mult.h create mode 100644 blst/ec_ops.h create mode 100644 blst/elf/add_mod_256-armv8.S create mode 100644 blst/elf/add_mod_256-x86_64.s create mode 100644 blst/elf/add_mod_384-armv8.S create mode 100644 blst/elf/add_mod_384-x86_64.s create mode 100644 blst/elf/add_mod_384x384-x86_64.s create mode 100644 blst/elf/ct_inverse_mod_256-armv8.S create mode 100644 blst/elf/ct_inverse_mod_256-x86_64.s create mode 100644 blst/elf/ct_inverse_mod_384-armv8.S create mode 100644 blst/elf/ct_is_square_mod_384-armv8.S create mode 100644 blst/elf/ct_is_square_mod_384-x86_64.s create mode 100644 blst/elf/ctq_inverse_mod_384-x86_64.s create mode 100644 blst/elf/ctx_inverse_mod_384-x86_64.s create mode 100644 blst/elf/div3w-armv8.S create mode 100644 blst/elf/div3w-x86_64.s create mode 100644 blst/elf/mul_mont_256-armv8.S create mode 100644 blst/elf/mul_mont_384-armv8.S create mode 100644 blst/elf/mulq_mont_256-x86_64.s create mode 100644 blst/elf/mulq_mont_384-x86_64.s create mode 100644 blst/elf/mulx_mont_256-x86_64.s create mode 100644 blst/elf/mulx_mont_384-x86_64.s create mode 100644 blst/elf/sha256-armv8.S create mode 100644 blst/elf/sha256-portable-x86_64.s create mode 100644 blst/elf/sha256-x86_64.s create mode 100644 blst/errors.h create mode 100644 blst/exp.c create mode 100644 blst/exports.c create mode 100644 blst/fields.h create mode 100644 blst/fp12_tower.c create mode 100644 blst/hash_to_field.c create mode 100644 blst/keygen.c create mode 100644 blst/map_to_g1.c create mode 100644 blst/map_to_g2.c create mode 100644 blst/multi_scalar.c create mode 100644 blst/no_asm.h create mode 100644 blst/pairing.c create mode 100644 blst/point.h create mode 100644 blst/rb_tree.c create 
mode 100644 blst/recip-addchain.h create mode 100644 blst/recip.c create mode 100644 blst/server.c create mode 100644 blst/sha256.h create mode 100644 blst/sqrt-addchain.h create mode 100644 blst/sqrt.c create mode 100644 blst/vect.c create mode 100644 blst/vect.h create mode 100755 build.sh create mode 100755 ctm create mode 100644 ctm.c create mode 100644 debugprint.c create mode 100644 debugprint.h create mode 100644 fstoken.c create mode 100644 fstoken.h create mode 100644 libblst.a create mode 100644 main.c create mode 100644 nonblind.c create mode 100644 server.o create mode 100755 set_token_path.sh create mode 100755 test diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b3d8c2f --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,13 @@ +{ + "files.associations": { + "vector": "c", + "memory": "c", + "optional": "c", + "string_view": "c", + "string": "c", + "system_error": "c", + "thread": "c", + "typeindex": "c", + "variant": "c" + } +} \ No newline at end of file diff --git a/assembly.o b/assembly.o new file mode 100644 index 0000000000000000000000000000000000000000..5ade239cac5ef35dcc16b12f122a32d61f15bcb1 GIT binary patch literal 43592 zcmeHw3wTsjmTsK{Awil-I$k^Jc(u}0+sR;K3Tcczj#gon?4##ER0LF-gjWQGBqjmz z(GVN#>zOjG?exsp(>*=Rv`)XA3p3NxtuwY(cqJf@g!fZ|fEMxwF}#HPul=ZV>ZB?O z%JkRwez(4_a!#$Y_u6Z(z4m(TeRk$YjJq>5&7(e==MInkOF=!J7k**i2l;K#bG4_x z=hd74a>NhAw?47{eC$77-}xU0A6Yu~wiCH09(wJ2cfPWD;&s1y_1FEjX^$-U@;lE5 z{L8*O;K7FPyi<4aU*B*1Mc-XNX*ltd=H?kc+5i2K?;KnC_s!ll|2V*3bJM|Z{(51< zJ-dFjWWtUY&d$02zB^u>wKDL_)^B7kX_^1fyQi-1SUK{Czu0`=l+}G7Eh)L?`ToEE zMp;qekPEYJGV{Jv_Q06>{?q=e|6_B8tXlfq^EcJK@{jR%|LNYUul}+&dhgPo-#4?R zApT0bzweK?JpZkMpVTgTEa%$h*9x*`-g>?9B>TpHyuD=DzYn z&5-FwzxPr@&6L;YUNg6KVOB=Vi{~!A^YWpY<*#-A{Omg?=RUDAGeh(A>w8t7tNUkX zc?W)JfJe*7%*yV6^(EIw!5?6uq2C(rn>@2-%HzIbg*_lAX>^^V(9-s{h?rZ1hoc+!A1!Jleh zmAA~Z2X}Kyfg{vd<-NF!Kp#=n4c@Z%+JPuG@MZeB3O`pm!t4uXW`GN@C9BN`bpzsP5KJ6@=j^^W+L9=!_BdbMwyEn z=+7$e^LKo9=@Q8{3HVth>a!r0-}Y|JI^~^@`7nQ6&Z%XN3FEjG@_+rg2OkKH2|XAZ z8yd%&Kj_0294=-}*Zd8$Zuc{5j6a822Uycoe=ORME!ZE6R`iLE&Wes35RQ(_2}eg? 
z6ON9{4@WDm6Y~lE`iuDaOP8268wAw!#dj3Vt+5!JGEWQcCYG|MZ>=~-gqL~Nqx_Bd zR%4vYJX_J@4*Z(K3-7_*JQ}5g_@2ZUb6~#TAF@{BI@6zHp}_nGzb^nJ^PlyXMywTw zJZ!#TXG}XxsbIsqHRHqi$qDp zIvKLi^>x|wJR=lX8wn6i^zf=kU`wb7T&lg)hzH?J!=DwgFcS59vQP?@KqeTm7iIfW zr@;tRiD(#=+B4p$0@U}~OIhK-)`&)8F`P(1?@{LxXj>Q2;*kK!j2wtqL_33D$3xbN zh(AqHw+s#vmtRweTWB6e5B3VH$vB zQ9$Le_Hmfx@8WvmLP*=lWkEX^AuiOOITzBl2ofL)VH$;mC`5=0yYNtmLMT9^bs$o- z3lF#`U?8rcae(AdIUxw)KucKLOC=>Qcs;N?tR0MSNq}?_Ap-V6%{f zuyrD05jG(Sk-+AVEeRp8v}hBT1U$%`;Li_P=oCq~-!qLvsQ6g&0Ig3ONHpGz}n^k)kyKLJF>L(UP1g}D%*n=3+G2parUg%rZRghDt=9O$ADK5!_6Gf)T<{JzW7 z2%HC9G{V_0suBF3LnF|1hjO|y8lkv5jj+|J5zbv{jc^mmLOP8w$LEv=r%I3`Ayon< zo>U327g8EJ5<1}=NdsnovQF6N&~rV@n($XjCzN%k z6ZSZD!uczz6K?NDCzLv+z^N1D2uPg(?UT?6rH%9M@_3(L+j;2{$%D`gU9^I55RzR3 z;TBMfytBd^5R+|(Z~~Y<@$O=`6EoB{*TPgwD;?nmT9E26&LP__+cQv@;{>hJ3JY3kx{0YKXeFa7usSA} zFty1>Rv1`9%1M=Wm5^foaC0gxsW761(>9>6fx>PIlPS%mFqM){6cH9upj@3Wj6|nx z6oowm3_c>+LfZ^>v49ClPZ&OK^rXELrcRLQFmtwrOISB<*b-Dqb0(-Hu@xdKs3dPz zpmcb$N<@>4)s>S`GCfyZM#-F9c^SRwze7eHO8<(BgggV z<7f1yL8q2cKFEmnHKL=kjOf@L-P>wJC+6$kRcNJ;?p=+B2I$`R(AG72bKFPm>CFdm z?WZ??jLYaOz4;KX<3LhHzQD&yI=#Q^3(B<@!na;R96tN_sb$_9D8+>QQBr%mKB_nDwmR$1KBN?C2P8C|cz+ zqSXKoHKI>vvFO7&Ec#eJi`MyA^yy;5@)#Ba9p#{7Jjkl_8P;@wYfBAlZW*)22bndU zF{`$mSu2?}cl^6|phxlSFL_XLs-GFwOn*6RX`zHD3v|ZY`r~Sit7OqJe#W8(9yZM{ zjlZS9&wWZI1(M|h(jbZQ(qsH+-|z=rsH*8<^zKR{`m>io3bQuq)>76v(_f}rF~i!X zTkZ5KsB1Ajx81Ntq9>!v^yag7;u^#kG>cu(cLvY zw5O);>)P}k0O`%2L3(N+Ni}8o2JwXy)j*1B#wU?=0&Go5)|etIP5(uAP5XLk`p&LRzesAwIT+``)|9(}MOw7p(VYA#v&bKOlh zrfYg+cTN9UPfhDxoBkWpaL!&|m9FVE#Ikg~UMwG`?e#hu*yLUpx|+6iJPh5$viJ>Y zDB)IS9CE2#om9r_yP}RcG}*PJ+@gKWz$%!mZB5$i#m@HKGmv zVg|L_eCAH*PTgvT4mJk$H3p5!(ye92ps_i|AedBZxiP59#|GV9Y-%gb-1SE8Mk9B( zk-L}WE;XztF`wo`$5#Xy<`aWfhUFMqX zOOun;FQ>Av((Ey7GiyF~Ckw<_NgEl-EM_6ugFcnAz-Fec&;xCHpj{7a&;uLwz$QJg z*?`?x(C*O}Y`}LTzMJsf>|vb;7#ZzWZeI@QS~HmrDr9Ydamd1RT|=j~3D`z5?Hsnu zusQ^66PbDrTW(l!0o!a?kNQ~W5nWqB04Nmj6+=cuTjRi-)%IZLBpdWO82ouK7!!jm zS`Aw}3KnGEcxG*J^f<7EX)C+-Q4eg<16x^OBQYV+&H`K0^jcrA1-;*jl=(({!3A)J z(eKU=xy=;Z_%MkZ^vuZ#{@cL?{@cL;{(FT1w~%vl3;(^+u+E6zTlw!6CO8XokKasN z#smjxfa&)NX0=jzM%Ev{o0+K3tgSSevY2=AN|M?frkyb|GmN}@vW&d@bBw%)^Nqa8 zJ|pk3Vk57v6h2Xzo_mSqo@cqAvD~vP_Y5!?R)%4rf`tkeDp;ssp@LNh34R)b^e`m& zF-Y(|kl+#Hlf=_|LrVWIrF>qtL^{SGSt8xi=$1sc6oNe~NCatKK;)9WQxZ`U%rugi zJjebN`p{-%_A~O}UgyER&a1!(qXT*9KGPMfj{l^X9xY+M}H2}pVN2f%@?8ij~FGV&A@RpaKa3nGy|v1 zz$a$lw64WdsrfV_VgybaffG#om}v)-#6_)-BACSdm^dEl*avl7Zba|ShZTjF*lJ{N zGrZyV@ZGI@!`txv7*CE02WaNEJG_i(!ww}%8uj6$oXw;$9y zTL?7}F2{GbsqHtlBdla8E7`u_nC5-{8SDfh?M&p_TR5Ea&k_GFIGhzPAwuDffKSlr zMs%hh+@6XMI648234X%qnd&d25S;dJbdA4NSZtdD(^^T^z_iwLgJ89Bn_%r^1sjaP zno+1j{~s_4@5?a?AIdihCqX41H40}Gvjsam;KfRO*Q4icBthO0JIN=&^#Bj?y(5m2 z?N^F4sh3sUe@W_JKC_1Lb@eQ+kelvDZxBQe<6OcMZqS86XNKHn8OOoD6ORA31;Q_Q$2sbMXgJ6nTx#bW@}^!{HcRn+&dBTUK)6iw2CM zl}7X__*3t*XedW7`jkZ<%GZlJS#*Yv6&+_qr&!TxHt30jOW9`R?lf`_7`aEyf+KqS zQPVo3w;zK~%G%?)wS%=Iyg$U+kLcDXto^9o+;OMgy!TEs`?%ix0WN3t=6!eS*-Q22 z{kZJZn-Ab}SkFG4l)p*l54;SOJ#zYF@u4&xsf|ppq}9ouWd0<#pQ>?FiL;$1}Qp`8PTe8qzuQ$r&8&brKOnnEEAR1V?$1_ znBoxQuKekazLVIi6hA_L;`l=Zrj@xgG%umbzo9?>j6W*-2PF(!TLEkQXgNID@xALs zYX08OqSvbVr^8B6Ha8nl8uF~!8+fWUdk1S-iNU>OWPe~}!`KIyH?oB*HLlUN2#too zgfv^+rLPQLxh}IA9bUW!bAtZ}>RAXW>2|%Dfd9 zn3(xlxE^Lmj^lb7ku)qvjgMJR!VL7vI5uKzUHYHo_sZGV!5QejQHbCPI^8I|KZlgM zQTT8^@xUmY42wO(XB0jLqYW+;Y|x9g=tWzJE8vLWhIZV{JqtO|DQAG(JG4DwYd!Zc znN&UZbZV<=tIt0%`wISIs-@Hq21(O_$XKS(4%hfSq~`UF25@|k`l)2I5Ur}Khj z{-~dUJ&Q&Z5n~k7C^{OFIj$IPtYNj-#wm9v%U#8CH?Re59%FD{qYz6-g-CW4qJxE_ ze5`ORsNaV42vS2IX>3792FpH^uHj_;BJ2iME25*(5$LOE1(G_*iA9kUvzD3Jt!8$c 
znY~TVZq>8fSW`b1z_QP7rtOqw-=+=unZehmrfrXhWeW~~zK=lPF(V3J8jCa+9&?RX`HfXoRF8B z>c@gw3j0yzt;S};WBaj>>~S3#2V@fj9qs-u^&y8r9dRHm7M(3RPAHmX`+3) zr}l9b?PDXFXdgEMXB!~A{o}ad?c+wYPlbu)6MnL&Tsrb(v{HJ^3 zKdz$DR@EqOM5Av~qs0GhxZ(WAjUbf@b5i4~7H&i>D$Hx0QMGU*YEfZcD@)bFji^P1 zd98d^3pb(`73Q^yRW00zT2z?VDpR#^BWh9Mq#4L#_l4x1aQ`F)e||@a4Vy);$e-ZS zyW&=m(WuD44JYG&)xtGXl)S7c!HuAV%5(Y`sfxG}^iyF@{}NRTH=-65=Cxv~7H&i> zD$HxOsam)ZwWu(!)uC$PM%1Fhyw-753pb(`6{cEOP+O4Ll&XesBO3C_hQ&^PslTO% z6r*$bU`-9!cF__G0%!(e-u!;7>Gpo?woc|9c~SR{{BMGwoi6)HoDrW9g@1+|6BcS{ z$pQNog0$>_EFq(06VDW4`GG~}j2G!9_W<<-#5LUc2Vk`n*@lI&!aDeQ$hc9M z(54r)>qWTPs26S0i#E#zMp{&adx}oMJEcA$GeliNcF69P!vZS(n;gH#OAz|@GK31I zil0_pZvFq)`J1kPe=7OcboBiR#;?yEW=NINLa5mE)!;{Pg>Z~E5n(RDyKj2dUmWjC z8x5KsLFyQ5dQETpUw~$epWyo%P>wZ?MGWz_KOd;#kMOh*66t`VEE3B)L7Hx0O;47_ z|3Gz`Ze>klD7B8d@p~ybLy&Qc;&<|U+78h4Bi@E+y_mq(dr-?%37HM#BYg3H!-k>w zH@$@KxBfT?;*Th|{VF}l!xLKmr~dVJ`CsaP$NYWnFm;t#XcUXVjx~Sk8&NTnXhWzd zyZfxaJbu;&;oUQy5E~AqhM1@dF;R7XJUuK zVTxJq5$;_eeZn0KGsQ^O`S7Hl5Xk0uxKW;R|5z4mN+vjU{}WPgOp4wj=->P{N0;EeQa&qEr>@*ozXqD@!2RBpXT8 z!mcKa5^v1LZQwEA>mu7!O_*A%fKz!ity(g*b^(_ZsLd~?wnM-rA>A}4FBhusRksjY4m7d}r=_&pr(qqNkR!}0TW^$i2XiH zzr@zDO=?>cFGH%5wxT6%E$cvV$oKJ`F|}1Xvbeg|P8W(tcniQLH=aQ#fgKyw_-|8N zLk+9le?Th1YZG9!rM9h^+ByNo?lm;IiqHL4Q`-QgN!w0LZK z!6TvLI&d4|k!p!=u>-pOq}i;v@Oo$)q^Z5H=VCJ)0!RGfioMHLJk-Np#^nsdrXf*| z&Ta-??DvGXIy%TVQc{4~*HJ4mncqe20)5*W9 zW!6XW-*Z2dY(71L;h#<}XKnmWa&Z3>F09cM_Rl4QGq-o!N|W=yN$o?UZ;XEkOBA0( zhlo4S7-|M$t#!4ts7T5GK__!gt^nJkk$%e&xN-V4w20U!4)A1Q{{cpTre z_@eBsxU41T0FEX(l{xrLPGvqM4K*rY-+Qq%=pd5YUp_yv*n#(DocRsRQ|4Wa-M=x` zIf#otUb?^z&M8@ZOQ_T)Cun?WM_T6;SXj^~;U{f4tQ}^afMW-ackwDyJ7qtH*~LVZ z)jqM|u)dgyax$Dwz(EQ&EVV*f1u1A!mN_TN5FDhSNeMr1!$At#5^yZRc^9{v+6DVD zNU@vz%!Y#$%>Fh4nA*hz9Hhhm)^jn}h%0DPmRTms5FDhSNeN$W!$At#5^xJUuNH4N z#Xe!KAf|v6yGdI^OaUqAKA+PjW;|y;ngJ=$r`zyWj35O~$}+ZUBT~?$gxk7p3P?d) z0&aZ(QuZcv9H9d#c9ZKxlLQAT=za$QoZbyc!I*UDa0NM6(4;K0S=1#sNI{blzQu-v z6tpGa)*+B`(A2P=#kXP$QtT!<@3{3KQqcXlAjNj5Knlhr4s|R@L6fq~`vNDyK?<6b z@I5vhq@XPc=gi?&iP&McwcRcc8qn|Kf(GThC22gz#B;6>Qe1$F6c-o>3J`0-79&g; zi^yo=c(Y)CFZ6`IV3!BV;Bs_DBERa^f2SzAz;>1n3N!+Uh)>(mvDk$KpPw(1a>bRT zwF)08mW!l1Uxe()FtX#9tCbvl>gsxJF+OPm zm$)4%!j69e<>L9IEI!%zb6q<~#rRtRlQ3zOtf}_8kX(o+F`pq zwVw#_v?DNZ{FMl+1>vN#mr$`)pha~PJ|1rc`kP0Vs+T7E%av6d8Fuet%Es|bg&JCyTucGabR0{qqSCeGy$dP58f;X)R-YDaj!~^g~89&Jl!2|Gy_uRID z3EqI=t(-3Dwvl^13B1936(Z`?wWEp%$s5HA5ln+OHV@ttU4)IGsN3$kGDVa`^Dd@0 zH#l!>9@sXCcmoev@q%g5IxfBCdmPK!yQ&G zj@+;$c%#fPSU~*}aPX!DyvY|#JEn+`yiu&+#);&{=D{Yx8;bTpi`{i)8x3nWctg1@ z&KsKtwvi;>z}J_&v8^NVM)(tW<%w(;xdZgBlj6;`1aFjkki-L`N4f!$8wt7LJ?FBx z&E^4-ktl!6hJ!cADBa}~yn*!$(TkwW59-zhUx-!kz-hc<(|48Bi(UX8Xn@F zh`w-y;EmGTCgL_Cfy2R@59k+<3!zfTVWKcl@g|<&jdGxp<^s_p9VN+)gxv6++kRAn zH_CxZ(&>;JB7DH;&bK(TR&=ijY$JOxiN(|uS!-A!3|hYxfPz7Xa= z#Y^ibUc!~fONg6xJL9Ev?in`EEQFyH>!S|;NaW8G)|6sZtTZ6!OsVVqQz7rXT?M1(md>Rgq-Nf8>xD-6i4f* ze0nc>Vu$1o?@zoxfxN5oKjIGmL&%nu@IU15jr2lGlK&we*#3tExcm>{d~jJtpo=Kd z>3o>lSutM-4uMzBhpE|7=cIg76R~E(`Oxu-V&^2xJLP;h!%Z@<3FpJqKI5-FQ68I5 zQ%NN1OUHu)ob8b!fTfXe73G>%E4uG=JY)eo&Xtabgxhg084wZOlCo<}bhwDmV0a<$ zc8Dp;>u`EO#1!Rq*j^CIgDJx0As$Gaga=GfUWe29N%A`22hm9I!#~};wH**C-5CnBA;daDLOc8N5X*n5lq61(Gv>4nEcqPOX zrU>tbcpz~SoM=dtxBVPqO2WZ_%R?g}nWA`*aJ+13x7(A>h3uR1t4!-K znBsIUWC6Qjk`#Nb%T9|BQ$#X^B!gGu1b`_J4tN?+%fu9PUgESp9b$^|EKE&#I)W*J z6^RTDcVvhu%Cm4fIK&h*ew?@pR|f-;=KU`w*T>cV0@_D_s5#xrc$z)^JH(#;eU3f- z#$cp)5_|c_VlV$hxtAYnsogH4;~h?{$~=qB?kHC&^WqaIFHY;?*4zF@cqx48x9iRZ z%97{D--1igHG#voD9~0a+L~~g+Y%lHMi%`oZ(6(^fW=z8TJQ<&WGLlt7SKD2c%cvQ zpn9vX!OzT+4~=M%S%7WnGqUso%-&EAUOVOQ%oKb~uZ!pfr|^D*d_MsQ9B%}EXyzi$ 
zf@y|O!#g~c#vf5WNB8?BQu}w~c=BWjcD^ASUWL`FYAj&l3?k$|4)6sl|A{GEjIu-qk%tT3#tpuUaX10`=sVMtI9 zDrHl4s)sb2gG!=Alt)v_=87x_QVWWDWD_LyXhUdv;o>EzM^h4RH#LbEVrK_rVu5Ft z#IoQXP``~IZ3XHTh6MGXQljKF1sv2ns7#_>8Ak+Puz_ZPOb+Kg;(bXy+7PN*tac=* zM^h4RH$~JdQzYX&T|@p!o}1!bjA@J4xp5W@H*JJEp~V$GQ#+6#QJOZ{jx%;PtW(bL znd^VPW(zfKT`&fh7==$DQTtR5CDX9uO2*o{6Lk1Bq9(?f^5P2c*Xt- zP-e$jc7B)E=f^_hO~m>iCNum6FRlAwa$y05<_uPv#R7_&!=}i&@@4obcnxd|+=F&| zBAPM@OKC(}T~Z=$$^<0Aftmz5v81RbSkl@cQ11ci?M@VqB|@5KQi4tGG*>RP5((5a zCwLG?>vBr^oxvATy%(tWI#Jl6kX#d!{+I}gD3$|i^6j|SPpHwnETg@WWPMpfX*}EnGr}2 zpExnh7NkSy3)W}xSK`^Q+{*V_*R9Qq@KMDx_bpM!{~Dy+QYeWB8tc2E=6t-WZ_Qkq=>yA3Wk3Xu%%%nCxGrlHb7@L=k_nfRsGF zIPnk^Fugc2g9MqsC%i?xDlAyZ-xJbt_P==-m zWHgg!mzLvxJie9qP6vU(1c5)7=XZALW&dsnJStBP#T=*JN6`(i0o$x`XfLti2U)bS z+R>-t_%j4@HvDJl8vY?io|-eWduaH7bZt1p-SEE>cd=!>OONxrY4`$R!Dh2`J^m@3 zl9=3Mi$KmEm!)g?A2{;VJzmg5!-Kjuoa1izU34sEk8kXz;r=}|{A)TtGr7lqOCV>D z?@rh7zYvL5>K^})KTFT*K-Y$^cQ?Ei#HZ{rKfg9j!}&cl{3g9klicIi350`TVe5+V zEycGii38zug#3m`PD98`{8@Sq+}xFr3F0VNdtSeX&ZOkP&~6&Owugp)M@l!D1N+2j ztaiiqr)&5vqB>2F|DHcf-{ZgN+VErUhPQ+Glsyi1)3CpXhL_9ZQWJW2^eRHy0jZ}_wHJ-)YV!*ksYe+1%F_IOk`4G-<1;SK!786r$tw&M6Dm_ya$o4G4b zfuJw`3lbNyIoJ%Sj)vVxWpk0EX`j)p~2j82E8qhZ;gvB+32H*?pT z*h+)+18rNu%PyVUShhS;HgzSP&Z%3a2bx()vmRK=0!wwRRV5uLvU6gkEzh*eIg!MK z!$6Kh_c=e?IkN94f98rE_!~Ree{P>G8`y|LNu4LH;%HUZp{o1<)y{o<`#G{|SPI$4 zBSoIbfpJE<1Zf$-5JB@y2>Gv+2rt@A@dG_shE;a*XyG9g!unB4LEKBJ*T#$rouG z+K!Lsh|%Q|1d@vQjlfzYZ784@k@WFI6pr7Bg3pNG2%T~II$uX)II)e++)h5nQ_tO{ zV`qCVwiz9x5S7AH>EgRb8xDC=e&q^|pMxEa@pF$|*KwosIHOe5y9_#*+95BbIzxh>DM_uZUNfHoG|~MhbgCSxKA)4)IwK*8>Ojz{h&v zkW0FV3NB4Trrnbu6%!L;ua0b_SL*5DW4v;SL;ZQftt{{!Z+ML!SgQxt>4EibrcuLO z6gf9~!!~L-1;beMD>#-1yX)U*y#asP>CcWEbnhF7Zs6VkXM`TGSwMlQN{sSXF_IY` z_?QI_GVB%9nvxq@P%Z*9D=RNrLW1P%{?V4N+>+`Jxyf#C5hSez zX-AP2!EvEx$!7~b)@J`!Z$9R;R!8tUB>pFd-h9Mol$o2Vlv!Ga@8<4KHl<9>^QTiCXd1$arnCRKtN!8d zTB5jnj>?}lBH$=-c@ch z3u1WRhayaIY&-U8n7Mcv`71ClS(xewb~yI1;2+`lF1{9+^b+I`p$taS8A*=(M&kd2 zK6f~u+z6Q4uaML7bq^%KNKUWVubRsDU`eP)A($NN!x`qq_*Wx62I};WNf={Rc{%-8 z5}fk;RzCQ8C@(2@AjThYYEJz3M3dDAF*oHyT3VK%H2uR8l2k_!i+A5=0Rs0Y2VPGj zZjC{B->Lnmp`E62_;<3p@sG(t6FC-2=Pfv{(TXBVq@`w5NPYhCNK-%x=L`>o?lR?>$+05`kC8)_y$zc2Fw{Ag3B&uy1;q7YmnR>PgNP^RyF9^>eWOb8Op6Z3xKlL)l6DC)j_XB)1Rj zEwTvo%V>J$(z|?l0Jfp17030Ox3xkuLM^hUKE=AVOGh%Cn~P>B#?{O_0;gJ!Sc7Za zENNvW?-|;8y^|km4Vs}m!#ICBybIs8_#V}pLK)E0tT~=RFC#YHR17;o+W94JL$)#e z7bo(Q55k0;F-p!0Ly|NG3H|mifAKsGorgi4qL3)-O4H#aT2>Gd|8{B_aStB^B_15& z;6X&^89C}f57bDEU{Esr^Z%cB&!1}~pR}j%_h@r-w88=XvR;PiA)a0*pWP^#8OCoK zO8T@7#GlB(obP9Z2Ieow95K-MVxJKMi<_<*F|hOpeMbx|d#N8ASXut1zL`%C9B_5k zxb@+6;kDs4;bq~K9QRy~#p5ZLS8DG?d0*gwEt2ZpXbnrV8bNrXB;A6bnFU!ekCrRAg&W6 z>~?~6&#Umj;63Xj>mq9-Ya+`cDFvpCG=6C2Rr#dZcZZ8!g9CD(AwJJ>$TbW<=abf~eOZas04n9N_fSOe5pF6kd|`o^aG= zj?-Kn-&m+DTC<>tt`1gn*>SR?M~L$FN}wv$5yEG}W~5*Ebuf`onEu_GKoq%!IAd2tZE z3ndOk8^KFyUrrv2RQ71(z{bpseaWVP%UU0@BlvuyH#lNi#+ej2 zn8fiw?{H-LwPYkgwEv>SLB}riAg|C1nK^EGEk?CAiR0_IcjGWzw&j8>`=HbVK~%{h z!-LNk8P72iM~;-!An;9>uwK}O@W3V7ykzbO{r!f-5henAn$N-eU9(x>I4*Jgs8=|; z>Ryt=VOX7`&t%Crf-wpEE`K80njm3+Nw6O2%^2?{IEZSCB#vIlH`$wxR5W^0Q1cpX;S>GK`>(|!LiAFI3YEr@EB5=SNOd&be#SEK&DCUJb( zF;Cx=Fm7AAr;X$y>?SSZ@8wb%A#6AuBEF?bTL-uiuggs3Fn>MnK zQGLA}apBCUw~-Rg+hy6hp_8-MJD*{}IyB4%MBj>QcEy zvfTH(maA3ef_T;{%RQcjA1=CGaoNa$@vO|-vyvwg(K&!(I(&X7$C6+^2^;Ew4U@3P z9WW+guW$^)=a?X2O%7PCggJepg%ZZ+8Ol<-uS(b>j=FD1*jFVih-*y3N{E)8`rXzi z<6LsLd0TyObS8ZUIQ*)738U0JeTb%;By0(X2v|_U9+I%`E}0(5Fh?PRXer>N$}#}& zC=!KLta+V;$8HdC6_*Z|a1Z5#@KG@=$sFOSEaYI1ig(9I_`1VH(&>m)oz z9mGfZh#k1+?UoVLhM&Sc!9BNeut(9K0T96#Nw{j4`|yDOmxPO29?xJ|zVS{0Q9jae 
z36I?^;K~QMSHgq$2)ObCDkZ$b5O8HbYbD%wmw+o@=@|*Hr2iO(kFsg6NOjhltfiFvV>yd~it30HIU $AEtsgWrh$(MRR7{vhEzSqM1KXW_a-!e7p#(sq98kc4+gc!7BA`Aot+*9ZXVZ2EX% tG^7s1tpy4ssOLTm*@$~dBklWU|*y1fcqqZDS*-E2lx_vk%T|ap&rl8vV2U=yT>G) >>I~_qW~ee10e{Ja<8GPl*8V`~z;kujrBR ?XoV05P9Eo?Q|i yjj4N9Mb9&(G!z!#h)_@f2k<1V)1@`M0wv(0ax+(wGtk@MZi@|{&fkjlzOaDJodbU E9b+l0-*TNCgB|}IPEr~^1*M5^4$N&E#+GXUis$&&SO^G(#|E0Un1slDB$BH+;fL0 &tps6Rx12bzezFjOqcME5u&`JvsRUl2)L5B?@D+~!o%XRCo17SsV9FV;XhX8rJnpt 38!scykGYUfa)*(A9I2?O8xn;EdP##cRU~fKa=owRep>Bv`YAX36DvKg2x4Tc2dE= BMQokhfV;ec70OcR!TfymGDKA1VG7Eu_`YG^JQ6nxP(_u6#%KuJR>B$L&6oGV~}^= -)W+JSmJp=m6vdu=k%E@;fd!CVS&vXGG y+fauRCx*iz5siEPH@t1GX#M9ow)s-!Xp#>il0j*yi&sDFnHEWcuc~-D)H}>_&X$= `*-y0-l5+b?y0DlG`V6@ZEao6+!?bbHO#1~t{6TP(E3@kXFgGJOW7?I)fM&AXH+#* On%DsWbW`=5(RG^eyjL7qq=%pUB%3rnp#nM($uLHvudUS(=ckw{#7xlVn#g*PMSJx DxTDdM(m<9-GKVplT%}wD{woWCqb$ksBsAr1PBlj&vCkba$;TNNp4Ud_$`v#unyBCuVO2Iz Q9E;XeZ>q_Xs)OJ(dyEQ+J>hTqzhoKXU?=K74_3*PD7`_V8^f9hC6=+ZX25PbJ#HF &!J_aYHi)Ls%Z^VrgMsvID?EQ>Og^S|^s a5=jp=a-|$sy)J5O>@yv&kef8Nh3&0ly*$6L@yjLn;e>0HmUSW$pqcY(G=s$i`qJi ekR72eyFjfdqPrOF=hHR$TkU^dhUj~WRi-J7~WDbbH=R6^`0kY*ECerO`BB_7+Plk 7(UEH_3A6?pO`(VPD%4Hk5AeIxY@8a$QRNde1?e&ViN&Ze*Cj8xbowbJyzwFAOD)G yz=AUbitJ$f58P;adnQQOW{{>xZed=aeJ8yuHt;d1y}KTr3H(hWQ S6pzxRa_Ay!tqh`t9X8l3$EggS6%R+j5l_<;41z425Jt`0AcfnQOu*?OIHHi9# 3$F5#l`go-8#cJ$jZX?ZFSy_y88`ma1y}Ls5*J*>pUYiv6@O;QJiqF%%1d77f~!1c sSB?1l3^EIYtj`FljQb4Rzu=UEMWM>@bAF3D1zp_4S@1)in*%hG4hjH|rZF O?h1Yezbb_5Cp`v)9ML$&SA|x2~IsX=-9?6|l!wD=wc~>wm 0pd3buAoJ@q59H%E3oGwS3UKB$~r&azz?F6Ba1aMvG=2jqeTRRMVo{!HA&Wd*v>7^(XCvRtaR>Cy2Y xhyuz^kovz7Uw3_iyLnGF%U`9xz;l(js=qFFcyjR|RsG^$h#C`BMR|4C|37DEM?U}n literal 0 HcmV?d00001 diff --git a/blindsig.c b/blindsig.c new file mode 100644 index 0000000..219290b --- /dev/null +++ b/blindsig.c @@ -0,0 +1,182 @@ +// This is a (very rough) test of BLST blind signatures based on run.me from BLST's Python example code +// Do not trust this to be secure; it also doesn't do a lot of the sanity checking yet + +#include <stdio.h> +#include <string.h> +#include <time.h> +#include "blst/blst.h" + +const byte dst[] = "MY-DST"; +double time_taken; +clock_t t; + +byte signer_private_key[32]; +byte signer_public_key[96]; + +void printbytes(byte *toprint, int length){ + for(int i=0;i<length;i++){ + printf("%02x", toprint[i]); + } + printf("\n"); +} + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature verification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately...
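 + *
 + * Roughly, the batching idea referenced above (for the min-sig case handled
 + * in this function) is: instead of checking e(sig_i, G2) == e(H(msg_i), pk_i)
 + * for each i separately, the caller supplies a random |nbits|-wide scalar r_i
 + * per pair and a single relation
 + *     e(SUM r_i*sig_i, G2) == PROD e(r_i*H(msg_i), pk_i)
 + * is checked instead, which is why both the signature accumulator and the
 + * hashed message below are multiplied by |scalar| whenever it is supplied.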
+ */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const 
void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + POINTonE1 pk[1]; + + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, 
msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/blst/asm/add_mod_256-armv8.pl b/blst/asm/add_mod_256-armv8.pl new file mode 100755 index 0000000..34d9145 --- /dev/null +++ b/blst/asm/add_mod_256-armv8.pl @@ -0,0 +1,412 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..7)); +@a=map("x$_",(8..11)); +@b=map("x$_",(12..15)); +@t=map("x$_",(16,17,1..3)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + adds @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + adcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + + adds @a[0],@b[0],@b[0] + ldp @mod[0],@mod[1],[$b_ptr] + adcs @a[1],@b[1],@b[1] + ldp @mod[2],@mod[3],[$b_ptr,#16] + adcs @a[2],@b[2],@b[2] + adcs @a[3],@b[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs 
xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@t[3],lo + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_lshift_mod_256: + adds @a[0],@a[0],@a[0] + sub $b_ptr,$b_ptr,#1 + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adc @t[4],xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + + cbnz $b_ptr,.Loop_lshift_mod_256 + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + +.Loop_rshift: + adds @b[0],@a[0],@mod[0] + sub $b_ptr,$b_ptr,#1 + adcs @b[1],@a[1],@mod[1] + adcs @b[2],@a[2],@mod[2] + adcs @b[3],@a[3],@mod[3] + adc @t[4],xzr,xzr + tst @a[0],#1 + + csel @b[0],@b[0],@a[0],ne + csel @b[1],@b[1],@a[1],ne + csel @b[2],@b[2],@a[2],ne + csel @b[3],@b[3],@a[3],ne + csel @t[4],@t[4],xzr,ne + + extr @a[0],@b[1],@b[0],#1 + extr @a[1],@b[2],@b[1],#1 + extr @a[2],@b[3],@b[2],#1 + extr @a[3],@t[4],@b[3],#1 + + cbnz $b_ptr,.Loop_rshift + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @b[0],@mod[0],@a[0] + ldp @mod[2],@mod[3],[$n_ptr,#16] + orr @mod[0],@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr @mod[1],@a[2],@a[3] + sbcs @b[2],@mod[2],@a[2] + orr @t[4],@mod[0],@mod[1] + sbc @b[3],@mod[3],@a[3] + + cmp @t[4],#0 + csetm @t[4],ne + ands $b_ptr,$b_ptr,@t[4] + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[3],@a[3],@b[3],eq + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + + ldp @a[2],@a[3],[$a_ptr,#16] + subs @a[0],@a[0],@b[0] + ldp @b[2],@b[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + stp @a[0],@a[1],[$r_ptr] + adc @a[3],@a[3],@mod[3] + stp @a[2],@a[3],[$r_ptr,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + 
rev @a[3],@a[3] +#endif + + subs xzr,@a[0],@mod[0] + sbcs xzr,@a[1],@mod[1] + orr @a[0],@a[0],@a[1] + sbcs xzr,@a[2],@mod[2] + orr @a[0],@a[0],@a[2] + sbcs xzr,@a[3],@mod[3] + orr @a[0],@a[0],@a[3] + sbc $a_ptr,xzr,xzr + + cmp @a[0],#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,$a_ptr + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + adds @a[0],@a[0],@b[0] + ldp @mod[0],@mod[1],[$n_ptr] + adcs @a[1],@a[1],@b[1] + ldp @mod[2],@mod[3],[$n_ptr,#16] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adc @t[4],xzr,xzr + + subs @t[0],@a[0],@mod[0] + sbcs @t[1],@a[1],@mod[1] + sbcs @t[2],@a[2],@mod[2] + sbcs @t[3],@a[3],@mod[3] + sbcs xzr,@t[4],xzr + + csel @a[0],@a[0],@t[0],lo + csel @a[1],@a[1],@t[1],lo + csel @a[2],@a[2],@t[2],lo + csel @a[3],@a[3],@t[3],lo + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @b[0],@b[0] + rev @a[1],@a[1] + rev @b[1],@b[1] + rev @a[2],@a[2] + rev @b[2],@b[2] + rev @a[3],@a[3] + rev @b[3],@b[3] +#endif + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + ldp @mod[0],@mod[1],[$n_ptr] + sbcs @a[2],@a[2],@b[2] + ldp @mod[2],@mod[3],[$n_ptr,#16] + sbcs @a[3],@a[3],@b[3] + sbc @t[4],xzr,xzr + + and @mod[0],@mod[0],@t[4] + and @mod[1],@mod[1],@t[4] + adds @a[0],@a[0],@mod[0] + and @mod[2],@mod[2],@t[4] + adcs @a[1],@a[1],@mod[1] + and @mod[3],@mod[3],@t[4] + adcs @a[2],@a[2],@mod[2] + adc @a[3],@a[3],@mod[3] + + orr @t[0], @a[0], @a[1] + orr @t[1], @a[2], @a[3] + orr @t[0], @t[0], @t[1] + +#ifdef __AARCH64EB__ + rev @a[0],@a[0] + rev @a[1],@a[1] + rev @a[2],@a[2] + rev @a[3],@a[3] +#endif + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + mov @t[1], #1 + cmp @t[0], #0 + csel x0, @t[1], xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ + +print $code; + +close STDOUT; diff --git a/blst/asm/add_mod_256-x86_64.pl b/blst/asm/add_mod_256-x86_64.pl new file mode 100755 index 0000000..1d656fb --- /dev/null +++ b/blst/asm/add_mod_256-x86_64.pl @@ -0,0 +1,547 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr) = ("%rdi","%rsi","%rdx","%rcx"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits add +my @acc=map("%r$_",(8..11, "ax", "si", "bx", "bp", 12)); + +$code.=<<___; +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,\@function,4,"unwind" +.align 32 +add_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loaded_a_add_mod_256: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_256,.-add_mod_256 + +######################################################################## +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,\@function,3,"unwind" +.align 32 +mul_by_3_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org,$n_ptr + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $a_ptr,$b_org + mov 8*3($a_ptr), @acc[3] + + call __lshift_mod_256 + mov 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,\@abi-omnipotent +.align 32 +__lshift_mod_256: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + mov @acc[0], @acc[4] + adc @acc[2], @acc[2] + mov @acc[1], @acc[5] + adc @acc[3], @acc[3] + sbb @acc[8], @acc[8] + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, @acc[8] + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + cmovc @acc[6], @acc[2] + cmovc @acc[7], @acc[3] + + ret +.size __lshift_mod_256,.-__lshift_mod_256 + +######################################################################## +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,\@function,4,"unwind" +.align 32 +lshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 
+.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_lshift_mod_256: + call __lshift_mod_256 + dec %edx + jnz .Loop_lshift_mod_256 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + +######################################################################## +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,\@function,4,"unwind" +.align 32 +rshift_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[7] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + +.Loop_rshift_mod_256: + mov @acc[7], @acc[0] + and \$1, @acc[7] + mov 8*0($n_ptr), @acc[4] + neg @acc[7] + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + + and @acc[7], @acc[4] + and @acc[7], @acc[5] + and @acc[7], @acc[6] + and 8*3($n_ptr), @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + sbb @acc[4], @acc[4] + + shr \$1, @acc[0] + mov @acc[1], @acc[7] + shr \$1, @acc[1] + mov @acc[2], @acc[6] + shr \$1, @acc[2] + mov @acc[3], @acc[5] + shr \$1, @acc[3] + + shl \$63, @acc[7] + shl \$63, @acc[6] + or @acc[0], @acc[7] + shl \$63, @acc[5] + or @acc[6], @acc[1] + shl \$63, @acc[4] + or @acc[5], @acc[2] + or @acc[4], @acc[3] + + dec %edx + jnz .Loop_rshift_mod_256 + + mov @acc[7], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + +######################################################################## +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,\@function,4,"unwind" +.align 32 +cneg_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[8] # load a[0:3] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov @acc[8], @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], @acc[8] + or @acc[2], @acc[8] + or @acc[3], @acc[8] + mov \$-1, @acc[7] + + mov 8*0($n_ptr), @acc[4] # load n[0:3] + cmovnz @acc[7], @acc[8] # mask = a[0:3] ? -1 : 0 + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + and @acc[8], @acc[4] # n[0:3] &= mask + mov 8*3($n_ptr), @acc[7] + and @acc[8], @acc[5] + and @acc[8], @acc[6] + and @acc[8], @acc[7] + + sub @acc[0], @acc[4] # a[0:3] ? n[0:3]-a[0:3] : 0-0 + sbb @acc[1], @acc[5] + sbb @acc[2], @acc[6] + sbb @acc[3], @acc[7] + + or $b_org, $b_org # check condition flag + + cmovz @acc[0], @acc[4] # flag ? 
n[0:3]-a[0:3] : a[0:3] + cmovz @acc[1], @acc[5] + mov @acc[4], 8*0($r_ptr) + cmovz @acc[2], @acc[6] + mov @acc[5], 8*1($r_ptr) + cmovz @acc[3], @acc[7] + mov @acc[6], 8*2($r_ptr) + mov @acc[7], 8*3($r_ptr) + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + +######################################################################## +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,\@function,4,"unwind" +.align 32 +sub_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + +######################################################################## +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,\@function,2,"unwind" +.align 32 +check_mod_256: +.cfi_startproc + mov 8*0($r_ptr), %rax + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + + mov %rax, @acc[0] # see if it's zero + or @acc[1], %rax + or @acc[2], %rax + or @acc[3], %rax + + sub 8*0($a_ptr), @acc[0] # does subtracting modulus borrow? 
+ sbb 8*1($a_ptr), @acc[1] + sbb 8*2($a_ptr), @acc[2] + sbb 8*3($a_ptr), @acc[3] + sbb $a_ptr, $a_ptr + + mov \$1, %rdx + cmp \$0, %rax + cmovne %rdx, %rax + and $a_ptr, %rax +.cfi_epilogue + ret +.cfi_endproc +.size check_mod_256,.-check_mod_256 + +######################################################################## +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,\@function,4,"unwind" +.align 32 +add_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + mov @acc[0], @acc[4] + adc 8*2($b_org), @acc[2] + mov @acc[1], @acc[5] + adc 8*3($b_org), @acc[3] + sbb $b_org, $b_org + + mov @acc[2], @acc[6] + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], @acc[7] + sbb 8*3($n_ptr), @acc[3] + sbb \$0, $b_org + + cmovc @acc[4], @acc[0] + cmovc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + +######################################################################## +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,\@function,4,"unwind" +.align 32 +sub_n_check_mod_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[4] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[5] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[6] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[7] + sbb $b_org, $b_org + + and $b_org, @acc[4] + and $b_org, @acc[5] + and $b_org, @acc[6] + and $b_org, @acc[7] + + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[6], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[7], @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + or @acc[1], @acc[0] + or @acc[3], @acc[2] + or @acc[2], @acc[0] + mov \$1, %rax + cmovz @acc[0], %rax + + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/add_mod_384-armv8.pl b/blst/asm/add_mod_384-armv8.pl new file mode 100755 index 0000000..c6b2a53 --- /dev/null +++ b/blst/asm/add_mod_384-armv8.pl @@ -0,0 +1,872 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr) = map("x$_", 0..3); + +@mod=map("x$_",(4..9)); +@a=map("x$_",(10..15)); +@b=map("x$_",(16,17,19..22)); +$carry=$n_ptr; + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + +__add_mod_384_ab_are_loaded: + adds @a[0],@a[0],@b[0] + adcs @a[1],@a[1],@b[1] + adcs @a[2],@a[2],@b[2] + adcs @a[3],@a[3],@b[3] + adcs @a[4],@a[4],@b[4] + adcs @a[5],@a[5],@b[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_rshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __rshift_mod_384 + cbnz $b_ptr,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx @b[5],@a[0],#0,#1 + and @b[0],@b[5],@mod[0] + and @b[1],@b[5],@mod[1] + adds @a[0],@a[0],@b[0] + and @b[2],@b[5],@mod[2] + adcs @a[1],@a[1],@b[1] + and @b[3],@b[5],@mod[3] + adcs @a[2],@a[2],@b[2] + and @b[4],@b[5],@mod[4] + adcs @a[3],@a[3],@b[3] + and @b[5],@b[5],@mod[5] + adcs @a[4],@a[4],@b[4] + extr @a[0],@a[1],@a[0],#1 // a[0:5] >>= 1 + adcs @a[5],@a[5],@b[5] + extr @a[1],@a[2],@a[1],#1 + adc @b[5],xzr,xzr + extr @a[2],@a[3],@a[2],#1 + extr @a[3],@a[4],@a[3],#1 + extr @a[4],@a[5],@a[4],#1 + extr @a[5],@b[5],@a[5],#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + +.Loop_lshift_mod_384: + sub $b_ptr,$b_ptr,#1 + bl __lshift_mod_384 + cbnz $b_ptr,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @b[0],@a[0],@mod[0] + sbcs @b[1],@a[1],@mod[1] + sbcs @b[2],@a[2],@mod[2] + sbcs @b[3],@a[3],@mod[3] + sbcs @b[4],@a[4],@mod[4] + sbcs @b[5],@a[5],@mod[5] + sbcs xzr,$carry,xzr + + csel @a[0],@a[0],@b[0],lo + csel @a[1],@a[1],@b[1],lo + csel @a[2],@a[2],@b[2],lo + csel @a[3],@a[3],@b[3],lo + csel @a[4],@a[4],@b[4],lo + csel @a[5],@a[5],@b[5],lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + paciasp + stp x29,x30,[sp,#-48]! 
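
__rshift_mod_384 above implements halving modulo an odd p: sbfx expands the low bit of the value into an all-zeros or all-ones mask, the masked modulus is added in (making the value even without changing it mod p), and the extr chain then shifts the 385-bit sum right by one. div_by_2_mod_384 is a single application of it, while rshift_mod_384 loops it the requested number of times. The following C sketch shows one such step; it is not the patched code, and limb_t, NLIMBS and div_by_2_mod_n are assumptions made only for illustration.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;              /* assumed limb type, 6x64 = 384 bits */
#define NLIMBS 6

/* ret = a/2 mod p for odd p and a < p: add p when a is odd, then shift the
   (NLIMBS*64 + 1)-bit sum right by one bit. */
void div_by_2_mod_n(limb_t ret[NLIMBS], const limb_t a[NLIMBS],
                    const limb_t p[NLIMBS])
{
    limb_t mask = (limb_t)0 - (a[0] & 1);     /* all-ones iff a is odd */
    limb_t tmp[NLIMBS], carry = 0;
    size_t i;

    for (i = 0; i < NLIMBS; i++) {            /* tmp = a + (p & mask) */
        limb_t t = a[i] + carry;
        carry = t < carry;
        tmp[i] = t + (p[i] & mask);
        carry += tmp[i] < t;
    }

    for (i = 0; i < NLIMBS - 1; i++)          /* logical shift right by one */
        ret[i] = (tmp[i] >> 1) | (tmp[i + 1] << 63);
    ret[NLIMBS - 1] = (tmp[NLIMBS - 1] >> 1) | (carry << 63);
}

The left-shift direction works the same way in reverse: __lshift_mod_384 doubles and conditionally subtracts p, and mul_by_3_mod_384 / mul_by_8_mod_384 are assembled from one doubling plus an add, respectively three doublings.
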
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + + bl __add_mod_384_ab_are_loaded + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + + ldp @b[0],@b[1],[$a_ptr,#48] + ldp @b[2],@b[3],[$a_ptr,#64] + ldp @b[4],@b[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @mod[0],@mod[1],[$n_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @mod[2],@mod[3],[$n_ptr,#16] + + subs @b[0],@mod[0],@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @mod[4],@mod[5],[$n_ptr,#32] + orr $carry,@a[0],@a[1] + sbcs @b[1],@mod[1],@a[1] + orr $carry,$carry,@a[2] + sbcs @b[2],@mod[2],@a[2] + orr $carry,$carry,@a[3] + sbcs @b[3],@mod[3],@a[3] + orr $carry,$carry,@a[4] + sbcs @b[4],@mod[4],@a[4] + orr $carry,$carry,@a[5] + sbc @b[5],@mod[5],@a[5] + + cmp $carry,#0 + csetm $carry,ne + ands $b_ptr,$b_ptr,$carry + + csel @a[0],@a[0],@b[0],eq + csel @a[1],@a[1],@b[1],eq + csel @a[2],@a[2],@b[2],eq + csel @a[3],@a[3],@b[3],eq + stp @a[0],@a[1],[$r_ptr] + csel @a[4],@a[4],@b[4],eq + stp @a[2],@a[3],[$r_ptr,#16] + csel @a[5],@a[5],@b[5],eq + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[4],@b[5],[$b_ptr,#32] + + subs @a[0],@a[0],@b[0] + sbcs @a[1],@a[1],@b[1] + sbcs @a[2],@a[2],@b[2] + sbcs @a[3],@a[3],@b[3] + sbcs @a[4],@a[4],@b[4] + sbcs @a[5],@a[5],@b[5] + sbc $carry,xzr,xzr + + and @b[0],@mod[0],$carry + and @b[1],@mod[1],$carry + adds @a[0],@a[0],@b[0] + and @b[2],@mod[2],$carry + adcs @a[1],@a[1],@b[1] + and @b[3],@mod[3],$carry + adcs @a[2],@a[2],@b[2] + and @b[4],@mod[4],$carry + adcs @a[3],@a[3],@b[3] + and @b[5],@mod[5],$carry + adcs @a[4],@a[4],@b[4] + adc @a[5],@a[5],@b[5] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! 
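
cneg_mod_384 above is a conditional negation that stays branch-free and also treats zero correctly: it always computes p - a, ORs the limbs of a together and csetm-expands that into a mask, ANDs the mask with the caller's flag, and only then selects p - a over a, so negating zero yields zero rather than p. A C sketch of the same selection follows; limb_t, NLIMBS and cneg_mod_n are illustrative names, not the library's API, and a < p is assumed.

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;
#define NLIMBS 6

/* ret = (flag && a != 0) ? p - a : a, chosen with a mask instead of a branch */
void cneg_mod_n(limb_t ret[NLIMBS], const limb_t a[NLIMBS],
                int flag, const limb_t p[NLIMBS])
{
    limb_t neg[NLIMBS], borrow = 0, nonzero = 0, mask;
    size_t i;

    for (i = 0; i < NLIMBS; i++) {            /* neg = p - a, track a != 0 */
        limb_t t = p[i] - borrow;
        borrow = t > p[i];
        neg[i] = t - a[i];
        borrow += neg[i] > t;
        nonzero |= a[i];
    }

    mask = (limb_t)0 - (limb_t)((flag != 0) & (nonzero != 0));
    for (i = 0; i < NLIMBS; i++)
        ret[i] = (neg[i] & mask) | (a[i] & ~mask);
}
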
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384 + + stp @a[0],@a[1],[$r_ptr] + add $a_ptr,$a_ptr,#48 + stp @a[2],@a[3],[$r_ptr,#16] + add $b_ptr,$b_ptr,#48 + stp @a[4],@a[5],[$r_ptr,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + paciasp + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + add $b_ptr,$a_ptr,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp @b[0],@b[1],[$a_ptr] + ldp @b[2],@b[3],[$a_ptr,#16] + ldp @b[4],@b[5],[$a_ptr,#32] + stp @a[0],@a[1],[$r_ptr] + ldp @a[0],@a[1],[$a_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#16] + ldp @a[2],@a[3],[$a_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#32] + ldp @a[4],@a[5],[$a_ptr,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp @a[0],@a[1],[$r_ptr,#48] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $carry,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $carry,$carry,xzr + + mvn $carry,$carry + and $carry,$carry,#2 + orr $r_ptr,$r_ptr,$carry + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + ldp @a[4],@a[5],[$r_ptr,#32] + + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + ldp @a[0],@a[1],[$r_ptr,#48] + ldp @a[2],@a[3],[$r_ptr,#64] + ldp @a[4],@a[5],[$r_ptr,#80] + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $b_ptr,$b_ptr,@b[0] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr 
$a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @b[0],xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc @b[0],@b[0],xzr + + mvn @b[0],@b[0] + and @b[0],@b[0],#2 + orr $r_ptr,$r_ptr,@b[0] + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +if (1) { +sub vec_select { +my $sz = shift; +my @v=map("v$_",(0..5,16..21)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,%function +.align 5 +vec_select_$sz: + dup v6.2d, $n_ptr + ld1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$a_ptr],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {@v[3].2d, @v[4].2d, @v[5].2d}, [$b_ptr],#48 +___ +for($i=0; $i<$sz-48; $i+=48) { +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + ld1 {@v[6].2d, @v[7].2d, @v[8].2d}, [$a_ptr],#48 + bit @v[1].16b, @v[4].16b, v6.16b + ld1 {@v[9].2d, @v[10].2d, @v[11].2d}, [$b_ptr],#48 + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr],#48 +___ + @v = @v[6..11,0..5]; +} +$code.=<<___; + bit @v[0].16b, @v[3].16b, v6.16b + bit @v[1].16b, @v[4].16b, v6.16b + bit @v[2].16b, @v[5].16b, v6.16b + st1 {@v[0].2d, @v[1].2d, @v[2].2d}, [$r_ptr] + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end, $step) = map("x$_", (0..2)); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add $end, $end, $inp + sub $end, $end, #1 + mov $step, #64 + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + csel $step, xzr, $step, hi + prfm pldl1keep, [$inp] + add $inp, $inp, $step + cmp $inp, $end + csel $inp, $end, $inp, hi + prfm pldl1keep, [$inp] + ret +.size vec_prefetch,.-vec_prefetch +___ +} + +print $code; + +close STDOUT; diff --git a/blst/asm/add_mod_384-x86_64.pl b/blst/asm/add_mod_384-x86_64.pl new file mode 100755 index 0000000..88dde45 --- /dev/null +++ b/blst/asm/add_mod_384-x86_64.pl @@ -0,0 +1,1430 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
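
The vec_select_* helpers that close the ARMv8 file above (and their SSE2 twins near the end of the x86_64 file below) choose between two operands without branching: the selector is broadcast, compared against zero to produce an all-zeros/all-ones lane mask, and the two inputs are blended through it, 48 bytes per unrolled step (cmeq plus bit here, pcmpeqd plus pand/por on x86_64). A byte-at-a-time C sketch of the same blend is shown next; the name and signature are placeholders rather than the library's API.

#include <stdint.h>
#include <stddef.h>

/* Copy len bytes of (sel ? a : b) into ret without a data-dependent branch. */
void vec_select_sketch(void *ret, const void *a, const void *b,
                       size_t len, int sel)
{
    const uint8_t *pa = a, *pb = b;
    uint8_t *pr = ret;
    uint8_t mask = (uint8_t)(0 - (uint8_t)(sel != 0));   /* 0x00 or 0xff */
    size_t i;

    for (i = 0; i < len; i++)
        pr[i] = (uint8_t)((pa[i] & mask) | (pb[i] & (uint8_t)~mask));
}

vec_prefetch, defined right after vec_select in both files, is the matching data-independent helper: it always issues the same number of prefetch instructions and clamps the address to the end of the region instead of branching on the length.
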
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 384 bits add +my @acc=map("%r$_",(8..15, "ax", "bx", "bp")); + push(@acc, $a_ptr); + +$code.=<<___; +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,\@function,4,"unwind" +.align 32 +add_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__add_mod_384_a_is_loaded: + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,\@function,4,"unwind" +.align 32 +add_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __add_mod_384 # add_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __add_mod_384 # add_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 
24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + +######################################################################## +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,\@function,4,"unwind" +.align 32 +rshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_rshift_mod_384: + call __rshift_mod_384 + dec %edx + jnz .Loop_rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,\@abi-omnipotent +.align 32 +__rshift_mod_384: + mov \$1, @acc[11] + mov 8*0($n_ptr), @acc[6] + and @acc[0], @acc[11] + mov 8*1($n_ptr), @acc[7] + neg @acc[11] + mov 8*2($n_ptr), @acc[8] + and @acc[11], @acc[6] + mov 8*3($n_ptr), @acc[9] + and @acc[11], @acc[7] + mov 8*4($n_ptr), @acc[10] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], @acc[10] + adc @acc[5], @acc[11] + sbb @acc[5], @acc[5] + + shr \$1, @acc[6] + mov @acc[7], @acc[0] + shr \$1, @acc[7] + mov @acc[8], @acc[1] + shr \$1, @acc[8] + mov @acc[9], @acc[2] + shr \$1, @acc[9] + mov @acc[10], @acc[3] + shr \$1, @acc[10] + mov @acc[11], @acc[4] + shr \$1, @acc[11] + shl \$63, @acc[0] + shl \$63, @acc[1] + or @acc[6], @acc[0] + shl \$63, @acc[2] + or @acc[7], @acc[1] + shl \$63, @acc[3] + or @acc[8], @acc[2] + shl \$63, @acc[4] + or @acc[9], @acc[3] + shl \$63, @acc[5] + or @acc[10], @acc[4] + or @acc[11], @acc[5] + + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,\@function,3,"unwind" +.align 32 +div_by_2_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov $b_org, $n_ptr + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + call __rshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp 
+.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + +######################################################################## +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,\@function,4,"unwind" +.align 32 +lshift_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +.Loop_lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $r_ptr, $r_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov (%rsp), $r_ptr + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + dec %edx + jnz .Loop_lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,\@abi-omnipotent +.align 32 +__lshift_mod_384: + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + cmovc @acc[9], @acc[3] + cmovc @acc[10], @acc[4] + cmovc @acc[11], @acc[5] + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +######################################################################## +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 
24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +######################################################################## +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + + mov (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov (%rsp), $a_ptr + lea 8*6($r_ptr), $r_ptr + + mov 8*6($a_ptr), @acc[0] + mov 8*7($a_ptr), @acc[1] + mov 8*8($a_ptr), @acc[2] + mov 8*9($a_ptr), @acc[3] + mov 8*10($a_ptr), @acc[4] + mov 8*11($a_ptr), @acc[5] + + call __lshift_mod_384 + + mov \$8*6, $b_org + add (%rsp), $b_org + call __add_mod_384_a_is_loaded + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov $b_org, $n_ptr + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov (%rsp), $a_ptr + mov @acc[0], 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) + mov 
@acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mov 48+8*0($a_ptr), @acc[0] + mov 48+8*1($a_ptr), @acc[1] + mov 48+8*2($a_ptr), @acc[2] + mov 48+8*3($a_ptr), @acc[3] + mov 48+8*4($a_ptr), @acc[4] + mov 48+8*5($a_ptr), @acc[5] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov @acc[0], 48+8*0($r_ptr) + mov @acc[1], 48+8*1($r_ptr) + mov @acc[2], 48+8*2($r_ptr) + mov @acc[3], 48+8*3($r_ptr) + mov @acc[4], 48+8*4($r_ptr) + mov @acc[5], 48+8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +######################################################################## +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,\@function,4,"unwind" +.align 32 +cneg_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $b_org # condition flag +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), $b_org # load a[0:5] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov $b_org, @acc[0] + mov 8*3($a_ptr), @acc[3] + or @acc[1], $b_org + mov 8*4($a_ptr), @acc[4] + or @acc[2], $b_org + mov 8*5($a_ptr), @acc[5] + or @acc[3], $b_org + mov \$-1, @acc[11] + or @acc[4], $b_org + or @acc[5], $b_org + + mov 8*0($n_ptr), @acc[6] # load n[0:5] + cmovnz @acc[11], $b_org # mask = a[0:5] ? -1 : 0 + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + and $b_org, @acc[6] # n[0:5] &= mask + mov 8*3($n_ptr), @acc[9] + and $b_org, @acc[7] + mov 8*4($n_ptr), @acc[10] + and $b_org, @acc[8] + mov 8*5($n_ptr), @acc[11] + and $b_org, @acc[9] + mov 0(%rsp), $n_ptr # restore condition flag + and $b_org, @acc[10] + and $b_org, @acc[11] + + sub @acc[0], @acc[6] # a[0:5] ? n[0:5]-a[0:5] : 0-0 + sbb @acc[1], @acc[7] + sbb @acc[2], @acc[8] + sbb @acc[3], @acc[9] + sbb @acc[4], @acc[10] + sbb @acc[5], @acc[11] + + or $n_ptr, $n_ptr # check condition flag + + cmovz @acc[0], @acc[6] # flag ? 
n[0:5]-a[0:5] : a[0:5] + cmovz @acc[1], @acc[7] + cmovz @acc[2], @acc[8] + mov @acc[6], 8*0($r_ptr) + cmovz @acc[3], @acc[9] + mov @acc[7], 8*1($r_ptr) + cmovz @acc[4], @acc[10] + mov @acc[8], 8*2($r_ptr) + cmovz @acc[5], @acc[11] + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + +######################################################################## +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,\@function,4,"unwind" +.align 32 +sub_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,\@function,4,"unwind" +.align 32 +sub_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$24, %rsp +.cfi_adjust_cfa_offset 24 +.cfi_end_prologue + + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + lea 48($a_ptr), $a_ptr # a->im + lea 48($b_org), $b_org # b->im + lea 48($r_ptr), $r_ptr # ret->im + call __sub_mod_384 # sub_mod_384(ret->im, a->im, b->im, mod); + + mov 8*0(%rsp), $a_ptr # a->re + mov 8*1(%rsp), $b_org # b->re + lea -48($r_ptr), $r_ptr # ret->re + call __sub_mod_384 # sub_mod_384(ret->re, a->re, b->re, mod); + + mov 24+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 24+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 24+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 24+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 24+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 24+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 
24+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +___ +} +{ ###################################################### ret = a * (1 + i) +my ($r_ptr,$a_ptr,$n_ptr) = ("%rdi","%rsi","%rdx"); +my @acc=map("%r$_",(8..15, "ax", "bx", "cx", "bp")); + +$code.=<<___; +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,\@function,3,"unwind" +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$56, %rsp +.cfi_adjust_cfa_offset 56 +.cfi_end_prologue + + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + mov $r_ptr, 8*6(%rsp) # offload r_ptr + sbb $r_ptr, $r_ptr + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $a_ptr, $a_ptr + + mov @acc[0], 8*0(%rsp) # offload a->re + a->im [without carry] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1(%rsp) + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2(%rsp) + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3(%rsp) + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4(%rsp) + and $a_ptr, @acc[0] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5(%rsp) + and $a_ptr, @acc[1] + mov 8*5($n_ptr), @acc[5] + and $a_ptr, @acc[2] + and $a_ptr, @acc[3] + and $a_ptr, @acc[4] + and $a_ptr, @acc[5] + mov 8*6(%rsp), $a_ptr # restore r_ptr + + add @acc[0], @acc[6] + mov 8*0(%rsp), @acc[0] # restore a->re + a->im + adc @acc[1], @acc[7] + mov 8*1(%rsp), @acc[1] + adc @acc[2], @acc[8] + mov 8*2(%rsp), @acc[2] + adc @acc[3], @acc[9] + mov 8*3(%rsp), @acc[3] + adc @acc[4], @acc[10] + mov 8*4(%rsp), @acc[4] + adc @acc[5], @acc[11] + mov 8*5(%rsp), @acc[5] + + mov @acc[6], 8*0($a_ptr) # ret->re = a->re - a->im + mov @acc[0], @acc[6] + mov @acc[7], 8*1($a_ptr) + mov @acc[8], 8*2($a_ptr) + mov @acc[1], @acc[7] + mov @acc[9], 8*3($a_ptr) + mov @acc[10], 8*4($a_ptr) + mov @acc[2], @acc[8] + mov @acc[11], 8*5($a_ptr) + + sub 8*0($n_ptr), @acc[0] + mov @acc[3], @acc[9] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + mov @acc[4], @acc[10] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($a_ptr) # ret->im = a->re + a->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($a_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($a_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($a_ptr) + mov @acc[4], 8*10($a_ptr) + mov @acc[5], 8*11($a_ptr) + + mov 56+8*0(%rsp),%r15 +.cfi_restore %r15 + mov 56+8*1(%rsp),%r14 +.cfi_restore %r14 + mov 56+8*2(%rsp),%r13 +.cfi_restore %r13 + mov 56+8*3(%rsp),%r12 +.cfi_restore %r12 + mov 56+8*4(%rsp),%rbx +.cfi_restore %rbx + mov 56+8*5(%rsp),%rbp +.cfi_restore %rbp + lea 56+8*6(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size 
mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +___ +} +{ ###################################################### +my ($r_ptr,$n_ptr) = ("%rdi","%rsi"); +my @acc=map("%r$_",(8..11, "cx", "dx", "bx", "bp")); + +$code.=<<___; +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384: +.cfi_startproc +.cfi_end_prologue + mov 8*0($r_ptr), @acc[0] + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + + xor %rax, %rax + mov @acc[0], $r_ptr + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, $r_ptr + and \$2, %rax + or $r_ptr, %rax # pack sign and parity + +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,\@function,2,"unwind" +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*6($r_ptr), @acc[0] # sgn0(a->im) + mov 8*7($r_ptr), @acc[1] + mov 8*8($r_ptr), @acc[2] + mov 8*9($r_ptr), @acc[3] + mov 8*10($r_ptr), @acc[4] + mov 8*11($r_ptr), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), %rax # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + mov 8*0(%rax), @acc[0] + mov 8*1(%rax), @acc[1] + mov 8*2(%rax), @acc[2] + mov 8*3(%rax), @acc[3] + mov 8*4(%rax), @acc[4] + mov 8*5(%rax), @acc[5] + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp), %rbx +.cfi_restore %rbx + mov 16(%rsp), %rbp +.cfi_restore %rbp + lea 24(%rsp), %rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +___ +} +if (0) { +my $inp = $win64 ? 
"%rcx" : "%rdi"; +$code.=<<___; +.globl nbits_384 +.hidden nbits_384 +.type nbits_384,\@abi-omnipotent +.align 32 +nbits_384: + mov 8*5($inp), %r8 + mov 8*4($inp), %r9 + mov 8*3($inp), %r10 + mov 8*2($inp), %r11 + mov \$-1, %rdx + mov \$127, %eax + bsr %r8, %r8 + cmovnz %rdx,%r9 + cmovz %rax,%r8 + bsr %r9, %r9 + cmovnz %rdx,%r10 + cmovz %rax,%r9 + xor \$63,%r8 + bsr %r10, %r10 + cmovnz %rdx, %r11 + cmovz %rax, %r10 + xor \$63,%r9 + add %r8, %r9 + mov 8*1($inp), %r8 + bsr %r11, %r11 + cmovnz %rdx, %r8 + cmovz %rax, %r11 + xor \$63, %r10 + add %r9, %r10 + mov 8*0($inp), %r9 + bsr %r8, %r8 + cmovnz %rdx, %r9 + cmovz %rax, %r8 + xor \$63, %r11 + add %r10, %r11 + bsr %r9, %r9 + cmovz %rax, %r9 + xor \$63, %r8 + add %r11, %r8 + xor \$63, %r9 + add %r8, %r9 + mov \$384, %eax + sub %r9, %rax + ret +.size nbits_384,.-nbits_384 +___ +} + +if (1) { +my ($out, $inp1, $inp2, $select) = $win64 ? ("%rcx", "%rdx", "%r8", "%r9d") + : ("%rdi", "%rsi", "%rdx", "%ecx"); + +sub vec_select { +my $sz = shift; +my $half = $sz/2; +my ($xmm0,$xmm1,$xmm2,$xmm3)=map("%xmm$_",(0..3)); + +$code.=<<___; +.globl vec_select_$sz +.hidden vec_select_$sz +.type vec_select_$sz,\@abi-omnipotent +.align 32 +vec_select_$sz: + movd $select, %xmm5 + pxor %xmm4,%xmm4 + pshufd \$0,%xmm5,%xmm5 # broadcast + movdqu ($inp1),$xmm0 + lea $half($inp1),$inp1 + pcmpeqd %xmm4,%xmm5 + movdqu ($inp2),$xmm1 + lea $half($inp2),$inp2 + pcmpeqd %xmm5,%xmm4 + lea $half($out),$out +___ +for($i=0; $i<$sz-16; $i+=16) { +$code.=<<___; + pand %xmm4,$xmm0 + movdqu $i+16-$half($inp1),$xmm2 + pand %xmm5,$xmm1 + movdqu $i+16-$half($inp2),$xmm3 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) +___ + ($xmm0,$xmm1,$xmm2,$xmm3)=($xmm2,$xmm3,$xmm0,$xmm1); +} +$code.=<<___; + pand %xmm4,$xmm0 + pand %xmm5,$xmm1 + por $xmm1,$xmm0 + movdqu $xmm0,$i-$half($out) + ret +.size vec_select_$sz,.-vec_select_$sz +___ +} +vec_select(48); +vec_select(96); +vec_select(192); +vec_select(144); +vec_select(288); +} + +{ +my ($inp, $end) = $win64 ? ("%rcx", "%rdx") : ("%rdi", "%rsi"); + +$code.=<<___; +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,\@abi-omnipotent +.align 32 +vec_prefetch: + leaq -1($inp,$end), $end + mov \$64, %rax + xor %r8, %r8 + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + cmova %r8, %rax + prefetchnta ($inp) + lea ($inp,%rax), $inp + cmp $end, $inp + cmova $end, $inp + prefetchnta ($inp) + ret +.size vec_prefetch,.-vec_prefetch +___ +} +print $code; +close STDOUT; diff --git a/blst/asm/add_mod_384x384-x86_64.pl b/blst/asm/add_mod_384x384-x86_64.pl new file mode 100755 index 0000000..6ee3cf8 --- /dev/null +++ b/blst/asm/add_mod_384x384-x86_64.pl @@ -0,0 +1,260 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +############################################################ 384x384 add/sub +# Double-width addition/subtraction modulo n<<384, as opposite to +# naively expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +{ +my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +.type __add_mod_384x384,\@abi-omnipotent +.align 32 +__add_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + add 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + adc 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + adc 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + adc 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + adc 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + adc 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + adc 8*6($b_org), @acc[6] + mov @acc[1], 8*1($r_ptr) + adc 8*7($b_org), @acc[7] + mov @acc[2], 8*2($r_ptr) + adc 8*8($b_org), @acc[8] + mov @acc[4], 8*4($r_ptr) + mov @acc[6], @acc[0] + adc 8*9($b_org), @acc[9] + mov @acc[3], 8*3($r_ptr) + mov @acc[7], @acc[1] + adc 8*10($b_org), @acc[10] + mov @acc[5], 8*5($r_ptr) + mov @acc[8], @acc[2] + adc 8*11($b_org), @acc[11] + mov @acc[9], @acc[3] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[10], @acc[4] + sbb 8*2($n_ptr), @acc[8] + sbb 8*3($n_ptr), @acc[9] + sbb 8*4($n_ptr), @acc[10] + mov @acc[11], @acc[5] + sbb 8*5($n_ptr), @acc[11] + sbb \$0, $b_org + + cmovc @acc[0], @acc[6] + cmovc @acc[1], @acc[7] + cmovc @acc[2], @acc[8] + mov @acc[6], 8*6($r_ptr) + cmovc @acc[3], @acc[9] + mov @acc[7], 8*7($r_ptr) + cmovc @acc[4], @acc[10] + mov @acc[8], 8*8($r_ptr) + cmovc @acc[5], @acc[11] + mov @acc[9], 8*9($r_ptr) + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 
8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,\@function,4,"unwind" +.align 32 +add_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __add_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,\@function,4,"unwind" +.align 32 +sub_mod_384x384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sub_mod_384x384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/arm-xlate.pl b/blst/asm/arm-xlate.pl new file mode 100755 index 0000000..5028a62 --- /dev/null +++ b/blst/asm/arm-xlate.pl @@ -0,0 +1,381 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ARM assembler distiller/adapter by \@dot-asm. + +use strict; + +################################################################ +# Recognized "flavour"-s are: +# +# linux[32|64] GNU assembler, effectively pass-through +# ios[32|64] global symbols' decorations, PIC tweaks, etc. +# win[32|64] Visual Studio armasm-specific directives +# coff[32|64] e.g. clang --target=arm-windows ... +# +my $flavour = shift; + $flavour = "linux" if (!$flavour or $flavour eq "void"); + +my $output = shift; +open STDOUT,">$output" || die "can't open $output: $!"; + +my %GLOBALS; +my $dotinlocallabels = ($flavour !~ /ios/) ? 
1 : 0; +my $in_proc; # used with 'windows' flavour + +################################################################ +# directives which need special treatment on different platforms +################################################################ +my $arch = sub { } if ($flavour !~ /linux|coff64/);# omit .arch +my $fpu = sub { } if ($flavour !~ /linux/); # omit .fpu + +my $rodata = sub { + SWITCH: for ($flavour) { + /linux/ && return ".section\t.rodata"; + /ios/ && return ".section\t__TEXT,__const"; + /coff/ && return ".section\t.rdata,\"dr\""; + /win/ && return "\tAREA\t|.rdata|,DATA,READONLY,ALIGN=8"; + last; + } +}; + +my $hidden = sub { + if ($flavour =~ /ios/) { ".private_extern\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $comm = sub { + my @args = split(/,\s*/,shift); + my $name = @args[0]; + my $global = \$GLOBALS{$name}; + my $ret; + + if ($flavour =~ /ios32/) { + $ret = ".comm\t_$name,@args[1]\n"; + $ret .= ".non_lazy_symbol_pointer\n"; + $ret .= "$name:\n"; + $ret .= ".indirect_symbol\t_$name\n"; + $ret .= ".long\t0\n"; + $ret .= ".previous"; + $name = "_$name"; + } elsif ($flavour =~ /win/) { + $ret = "\tCOMMON\t|$name|,@args[1]"; + } elsif ($flavour =~ /coff/) { + $ret = ".comm\t$name,@args[1]"; + } else { + $ret = ".comm\t".join(',',@args); + } + + $$global = $name; + $ret; +}; + +my $globl = sub { + my $name = shift; + my $global = \$GLOBALS{$name}; + my $ret; + + SWITCH: for ($flavour) { + /ios/ && do { $name = "_$name"; last; }; + /win/ && do { $ret = ""; last; }; + } + + $ret = ".globl $name" if (!defined($ret)); + $$global = $name; + $ret; +}; +my $global = $globl; + +my $extern = sub { + &$globl(@_); + if ($flavour =~ /win/) { + return "\tEXTERN\t@_"; + } + return; # return nothing +}; + +my $type = sub { + my $arg = join(',',@_); + my $ret; + + SWITCH: for ($flavour) { + /ios32/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = "#ifdef __thumb2__\n" . + ".thumb_func $1\n" . + "#endif"; + } + last; + }; + /win/ && do { if ($arg =~ /(\w+),\s*%(function|object)/) { + my $type = "[DATA]"; + if ($2 eq "function") { + $in_proc = $1; + $type = "[FUNC]"; + } + $ret = $GLOBALS{$1} ? "\tEXPORT\t|$1|$type" + : ""; + } + last; + }; + /coff/ && do { if ($arg =~ /(\w+),\s*%function/) { + $ret = ".def $1;\n". + ".type 32;\n". + ".endef"; + } + last; + }; + } + return $ret; +} if ($flavour !~ /linux/); + +my $size = sub { + if ($in_proc && $flavour =~ /win/) { + $in_proc = undef; + return "\tENDP"; + } +} if ($flavour !~ /linux/); + +my $inst = sub { + if ($flavour =~ /win/) { "\tDCDU\t".join(',',@_); } + else { ".long\t".join(',',@_); } +} if ($flavour !~ /linux/); + +my $asciz = sub { + my $line = join(",",@_); + if ($line =~ /^"(.*)"$/) + { if ($flavour =~ /win/) { + "\tDCB\t$line,0\n\tALIGN\t4"; + } else { + ".byte " . join(",",unpack("C*",$1),0) . 
"\n.align 2"; + } + } else { ""; } +}; + +my $align = sub { + "\tALIGN\t".2**@_[0]; +} if ($flavour =~ /win/); + $align = sub { + ".p2align\t".@_[0]; +} if ($flavour =~ /coff/); + +my $byte = sub { + "\tDCB\t".join(',',@_); +} if ($flavour =~ /win/); + +my $short = sub { + "\tDCWU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $word = sub { + "\tDCDU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $long = $word if ($flavour =~ /win/); + +my $quad = sub { + "\tDCQU\t".join(',',@_); +} if ($flavour =~ /win/); + +my $skip = sub { + "\tSPACE\t".shift; +} if ($flavour =~ /win/); + +my $code = sub { + "\tCODE@_[0]"; +} if ($flavour =~ /win/); + +my $thumb = sub { # .thumb should appear prior .text in source + "# define ARM THUMB\n" . + "\tTHUMB"; +} if ($flavour =~ /win/); + +my $text = sub { + "\tAREA\t|.text|,CODE,ALIGN=8,".($flavour =~ /64/ ? "ARM64" : "ARM"); +} if ($flavour =~ /win/); + +my $syntax = sub {} if ($flavour =~ /win/); # omit .syntax + +my $rva = sub { + # .rva directive comes in handy only on 32-bit Windows, i.e. it can + # be used only in '#if defined(_WIN32) && !defined(_WIN64)' sections. + # However! Corresponding compilers don't seem to bet on PIC, which + # raises the question why would assembler programmer have to jump + # through the hoops? But just in case, it would go as following: + # + # ldr r1,.LOPENSSL_armcap + # ldr r2,.LOPENSSL_armcap+4 + # adr r0,.LOPENSSL_armcap + # bic r1,r1,#1 ; de-thumb-ify link.exe's ideas + # sub r0,r0,r1 ; r0 is image base now + # ldr r0,[r0,r2] + # ... + #.LOPENSSL_armcap: + # .rva .LOPENSSL_armcap ; self-reference + # .rva OPENSSL_armcap_P ; real target + # + # Non-position-independent [and ISA-neutral] alternative is so much + # simpler: + # + # ldr r0,.LOPENSSL_armcap + # ldr r0,[r0] + # ... + #.LOPENSSL_armcap: + # .long OPENSSL_armcap_P + # + "\tDCDU\t@_[0]\n\tRELOC\t2" +} if ($flavour =~ /win(?!64)/); + +################################################################ +# some broken instructions in Visual Studio armasm[64]... + +my $it = sub {} if ($flavour =~ /win32/); # omit 'it' + +my $ext = sub { + "\text8\t".join(',',@_); +} if ($flavour =~ /win64/); + +my $csel = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsel$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +my $csetm = sub { + my ($args,$comment) = split(m|\s*//|,shift); + my @regs = split(m|,\s*|,$args); + my $cond = pop(@regs); + + "\tcsetm$cond\t".join(',',@regs); +} if ($flavour =~ /win64/); + +# ... then conditional branch instructions are also broken, but +# maintaining all the variants is tedious, so I kludge-fix it +# elsewhere... +################################################################ +my $adrp = sub { + my ($args,$comment) = split(m|\s*//|,shift); + "\tadrp\t$args\@PAGE"; +} if ($flavour =~ /ios64/); + +my $paciasp = sub { + ($flavour =~ /linux/) ? "\t.inst\t0xd503233f" + : &$inst(0xd503233f); +}; + +my $autiasp = sub { + ($flavour =~ /linux/) ? 
"\t.inst\t0xd50323bf" + : &$inst(0xd50323bf); +}; + +sub range { + my ($r,$sfx,$start,$end) = @_; + + join(",",map("$r$_$sfx",($start..$end))); +} + +sub expand_line { + my $line = shift; + my @ret = (); + + pos($line)=0; + + while ($line =~ m/\G[^@\/\{\"]*/g) { + if ($line =~ m/\G(@|\/\/|$)/gc) { + last; + } + elsif ($line =~ m/\G\{/gc) { + my $saved_pos = pos($line); + $line =~ s/\G([rdqv])([0-9]+)([^\-]*)\-\1([0-9]+)\3/range($1,$3,$2,$4)/e; + pos($line) = $saved_pos; + $line =~ m/\G[^\}]*\}/g; + } + elsif ($line =~ m/\G\"/gc) { + $line =~ m/\G[^\"]*\"/g; + } + } + + $line =~ s/\b(\w+)/$GLOBALS{$1} or $1/ge; + + if ($flavour =~ /win/) { + # adjust alignment hints, "[rN,:32]" -> "[rN@32]" + $line =~ s/(\[\s*(?:r[0-9]+|sp))\s*,?\s*:([0-9]+\s*\])/$1\@$2/; + # adjust local labels, ".Lwhatever" -> "|$Lwhatever|" + $line =~ s/\.(L\w{2,})/|\$$1|/g; + # omit "#:lo12:" on win64 + $line =~ s/#:lo12://; + } elsif ($flavour =~ /coff(?!64)/) { + $line =~ s/\.L(\w{2,})/(\$ML$1)/g; + } elsif ($flavour =~ /ios64/) { + $line =~ s/#:lo12:(\w+)/$1\@PAGEOFF/; + } + + return $line; +} + +while(my $line=<>) { + + # fix up assembler-specific commentary delimiter + $line =~ s/@(?=[\s@])/\;/g if ($flavour =~ /win|coff/); + + if ($line =~ m/^\s*(#|@|;|\/\/)/) { print $line; next; } + + $line =~ s|/\*.*\*/||; # get rid of C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning... + $line =~ s|\s+$||; # ... and at the end + + { + $line =~ s|[\b\.]L(\w{2,})|L$1|g; # common denominator for Locallabel + $line =~ s|\bL(\w{2,})|\.L$1|g if ($dotinlocallabels); + } + + { + $line =~ s|(^[\.\w]+)\:\s*||; + my $label = $1; + if ($label) { + $label = ($GLOBALS{$label} or $label); + if ($flavour =~ /win/) { + $label =~ s|^\.L(?=\w)|\$L|; + printf "|%s|%s", $label, ($label eq $in_proc ? " PROC" : ""); + } else { + $label =~ s|^\.L(?=\w)|\$ML| if ($flavour =~ /coff(?!64)/); + printf "%s:", $label; + } + } + } + + if ($line !~ m/^[#@;]/) { + $line =~ s|^\s*(\.?)(\S+)\s*||; + my $c = $1; $c = "\t" if ($c eq ""); + my $mnemonic = $2; + my $opcode; + if ($mnemonic =~ m/([^\.]+)\.([^\.]+)/) { + $opcode = eval("\$$1_$2"); + } else { + $opcode = eval("\$$mnemonic"); + } + + my $arg=expand_line($line); + + if (ref($opcode) eq 'CODE') { + $line = &$opcode($arg); + } elsif ($mnemonic) { + if ($flavour =~ /win64/) { + # "b.cond" -> "bcond", kludge-fix:-( + $mnemonic =~ s/^b\.([a-z]{2}$)/b$1/; + } + $line = $c.$mnemonic; + $line.= "\t$arg" if ($arg ne ""); + } + } + + print $line if ($line); + print "\n"; +} + +print "\tEND\n" if ($flavour =~ /win/); + +close STDOUT; diff --git a/blst/asm/ct_inverse_mod_256-armv8.pl b/blst/asm/ct_inverse_mod_256-armv8.pl new file mode 100755 index 0000000..ced8c6c --- /dev/null +++ b/blst/asm/ct_inverse_mod_256-armv8.pl @@ -0,0 +1,586 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. ~4.600 cycles on Apple M1, ~8.900 - +# on Cortex-A57. 
+#
+# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+# const vec256 modx);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_256(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 31
+    mask = (1 << k) - 1
+
+    for i in range(0, 512 // k - 1):
+        # __ab_approximation_31
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-k-2)) << k)
+            b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+        # __inner_loop_31
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smul_256_n_shift_by_31
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smul_512x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 512 % k + k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 512 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    mod <<= 512 - mod.bit_length() # align to the left
+    if v < 0:
+        v += mod
+    if v < 0:
+        v += mod
+    elif v == 1<<512:
+        v -= mod
+
+    return v & (2**512 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output = shift;
+
+if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+ open STDOUT,">$output";
+}
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3));
+my @acc=map("x$_",(4..11));
+my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(12..17));
+my $cnt = $n_ptr;
+my @t = map("x$_",(19..26));
+my ($a_lo, $b_lo) = @acc[3,7];
+
+$frame = 16+2*512;
+
+$code.=<<___;
+.text
+
+.globl ct_inverse_mod_256
+.type ct_inverse_mod_256, %function
+.align 5
+ct_inverse_mod_256:
+ paciasp
+ stp x29, x30, [sp,#-80]!
+ add x29, sp, #0
+ stp x19, x20, [sp,#16]
+ stp x21, x22, [sp,#32]
+ stp x23, x24, [sp,#48]
+ stp x25, x26, [sp,#64]
+ sub sp, sp, #$frame
+
+ ldp @acc[0], @acc[1], [$in_ptr,#8*0]
+ ldp @acc[2], @acc[3], [$in_ptr,#8*2]
+
+ add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot
+ and $in_ptr, $in_ptr, #-512 // in the frame...
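+
+ // The 2*512-byte scratch area set up above holds two 256-byte images of
+ // |a|b|u|v|; the code ping-pongs between them with the "eor ..., #256"
+ // flip-flops below, which is why a 512-byte-aligned base is needed.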
+ str $out_ptr, [sp]
+
+ ldp @acc[4], @acc[5], [$n_ptr,#8*0]
+ ldp @acc[6], @acc[7], [$n_ptr,#8*2]
+
+ stp @acc[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
+ stp @acc[2], @acc[3], [$in_ptr,#8*2]
+ stp @acc[4], @acc[5], [$in_ptr,#8*4] // copy modulus to |b|
+ stp @acc[6], @acc[7], [$in_ptr,#8*6]
+
+ ////////////////////////////////////////// first iteration
+ bl .Lab_approximation_31_256_loaded
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_256_n_shift_by_31
+ str $f0,[$out_ptr,#8*8] // initialize |u| with |f0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*4 // pointer to dst |b|
+ bl __smul_256_n_shift_by_31
+ str $f0, [$out_ptr,#8*9] // initialize |v| with |f1|
+
+ ////////////////////////////////////////// second iteration
+ eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
+ bl __ab_approximation_31_256
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_256_n_shift_by_31
+ mov $f_, $f0 // corrected |f0|
+ mov $g_, $g0 // corrected |g0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
+ bl __smul_256_n_shift_by_31
+
+ ldr @acc[4], [$in_ptr,#8*8] // |u|
+ ldr @acc[5], [$in_ptr,#8*13] // |v|
+ madd @acc[0], $f_, @acc[4], xzr // |u|*|f0|
+ madd @acc[0], $g_, @acc[5], @acc[0] // |v|*|g0|
+ str @acc[0], [$out_ptr,#8*4]
+ asr @acc[1], @acc[0], #63 // sign extension
+ stp @acc[1], @acc[1], [$out_ptr,#8*5]
+ stp @acc[1], @acc[1], [$out_ptr,#8*7]
+
+ madd @acc[0], $f0, @acc[4], xzr // |u|*|f1|
+ madd @acc[0], $g0, @acc[5], @acc[0] // |v|*|g1|
+ str @acc[0], [$out_ptr,#8*9]
+ asr @acc[1], @acc[0], #63 // sign extension
+ stp @acc[1], @acc[1], [$out_ptr,#8*10]
+ stp @acc[1], @acc[1], [$out_ptr,#8*12]
+___
+for($i=2; $i<15; $i++) {
+$code.=<<___;
+ eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
+ bl __ab_approximation_31_256
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_256_n_shift_by_31
+ mov $f_, $f0 // corrected |f0|
+ mov $g_, $g0 // corrected |g0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*4 // pointer to destination |b|
+ bl __smul_256_n_shift_by_31
+
+ add $out_ptr, $out_ptr, #8*4 // pointer to destination |u|
+ bl __smul_256x63
+ adc @t[3], @t[3], @t[4]
+ str @t[3], [$out_ptr,#8*4]
+
+ mov $f_, $f0 // corrected |f1|
+ mov $g_, $g0 // corrected |g1|
+ add $out_ptr, $out_ptr, #8*5 // pointer to destination |v|
+ bl __smul_256x63
+___
+$code.=<<___ if ($i>7);
+ bl __smul_512x63_tail
+___
+$code.=<<___ if ($i<=7);
+ adc @t[3], @t[3], @t[4]
+ stp @t[3], @t[3], [$out_ptr,#8*4]
+ stp @t[3], @t[3], [$out_ptr,#8*6]
+___
+}
+$code.=<<___;
+ ////////////////////////////////////////// two[!]
last iterations + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + ldr $b_lo, [$in_ptr,#8*4] + bl __inner_loop_62_256 + + mov $f_, $f1 + mov $g_, $g1 + ldr $out_ptr, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh @t[1], @acc[3], $g_ // figure out top-most limb + ldp @acc[4], @acc[5], [$nx_ptr,#8*0] + adc @t[4], @t[4], @t[6] + ldp @acc[6], @acc[7], [$nx_ptr,#8*2] + + add @t[1], @t[1], @t[4] // @t[1] is 1, 0 or -1 + asr @t[0], @t[1], #63 // sign as mask + + and @t[4], @acc[4], @t[0] // add mod<<256 conditionally + and @t[5], @acc[5], @t[0] + adds @acc[0], @acc[0], @t[4] + and @t[6], @acc[6], @t[0] + adcs @acc[1], @acc[1], @t[5] + and @t[7], @acc[7], @t[0] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @t[3], @t[7] + adc @t[1], @t[1], xzr // @t[1] is 1, 0 or -1 + + neg @t[0], @t[1] + orr @t[1], @t[1], @t[0] // excess bit or sign as mask + asr @t[0], @t[0], #63 // excess bit as mask + + and @acc[4], @acc[4], @t[1] // mask |mod| + and @acc[5], @acc[5], @t[1] + and @acc[6], @acc[6], @t[1] + and @acc[7], @acc[7], @t[1] + + eor @acc[4], @acc[4], @t[0] // conditionally negate |mod| + eor @acc[5], @acc[5], @t[0] + adds @acc[4], @acc[4], @t[0], lsr#63 + eor @acc[6], @acc[6], @t[0] + adcs @acc[5], @acc[5], xzr + eor @acc[7], @acc[7], @t[0] + adcs @acc[6], @acc[6], xzr + adc @acc[7], @acc[7], xzr + + adds @acc[0], @acc[0], @acc[4] // final adjustment for |mod|<<256 + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adc @acc[3], @acc[3], @acc[7] + stp @acc[2], @acc[3], [$out_ptr,#8*6] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + autiasp + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*8+8*5*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldr @t[3+$j], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @t[3+$j], @t[3+$j], $f1 + umulh @t[0], @acc[0], $f_ + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $f_ + adcs @t[3+$j], @t[3+$j], xzr + umulh @t[2], @acc[2], $f_ +___ +$code.=<<___ if ($j!=0); + adc $g1, xzr, xzr // used in __smul_512x63_tail +___ +$code.=<<___; + mul @acc[0], @acc[0], $f_ + cmp $f_, #0 + mul @acc[1], @acc[1], $f_ + csel @t[3+$j], @t[3+$j], xzr, ne + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @t[5+$j], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[5+$j], @t[5+$j], @t[2] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @t[5], @t[5], @t[6] + 
stp @acc[2], @t[5], [$out_ptr,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh @t[5], @acc[3], $f_ + ldp @acc[1], @acc[2], [$in_ptr,#8*18] // load rest of |v| + adc @t[7], @t[7], xzr + ldr @acc[3], [$in_ptr,#8*20] + and @t[3], @t[3], $f_ + + umulh @acc[7], @acc[7], $g_ // resume |v|*|g1| chain + + sub @t[5], @t[5], @t[3] // tie up |u|*|f1| chain + asr @t[6], @t[5], #63 + + eor @acc[1], @acc[1], $f1 // conditionally negate rest of |v| + eor @acc[2], @acc[2], $f1 + adds @acc[1], @acc[1], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + umulh @t[0], @t[4], $g_ + adc @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $g_ + add @acc[7], @acc[7], @t[7] + umulh @t[2], @acc[2], $g_ + + mul @acc[0], @t[4], $g_ + mul @acc[1], @acc[1], $g_ + adds @acc[0], @acc[0], @acc[7] + mul @acc[2], @acc[2], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @t[3], @acc[3], $g_ + adcs @acc[2], @acc[2], @t[1] + adcs @t[3], @t[3], @t[2] + adc @t[4], xzr, xzr // used in the final step + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*4] + adcs @t[3], @t[3], @t[6] // carry is used in the final step + stp @acc[2], @t[3], [$out_ptr,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[4..7] if ($j); +my $k = 8*4*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[5], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[6], $f0, @t[5] // conditionally negate |f0| (or |g0|) + + eor @acc[0], @acc[0], @t[5] // conditionally negate |a| (or |b|) + sub @t[6], @t[6], @t[5] + eor @acc[1], @acc[1], @t[5] + adds @acc[0], @acc[0], @t[5], lsr#63 + eor @acc[2], @acc[2], @t[5] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[5] + umulh @t[0], @acc[0], @t[6] + adcs @acc[2], @acc[2], xzr + umulh @t[1], @acc[1], @t[6] + adc @acc[3], @acc[3], xzr + umulh @t[2], @acc[2], @t[6] + and @t[5], @t[5], @t[6] + umulh @t[3+$j], @acc[3], @t[6] + neg @t[5], @t[5] + + mul @acc[0], @acc[0], @t[6] + mul @acc[1], @acc[1], @t[6] + mul @acc[2], @acc[2], @t[6] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], @t[1] + adcs @acc[3], @acc[3], @t[2] + adc @t[3+$j], @t[3+$j], @t[5] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[4] + adcs @acc[1], @acc[1], @acc[5] + adcs @acc[2], @acc[2], @acc[6] + adcs @acc[3], @acc[3], @acc[7] + adc @acc[4], @t[3], @t[4] + + extr @acc[0], @acc[1], @acc[0], #31 + extr @acc[1], @acc[2], @acc[1], #31 + extr @acc[2], @acc[3], @acc[2], #31 + asr @t[4], @acc[4], #63 // result's sign as mask + extr @acc[3], @acc[4], @acc[3], #31 + + eor @acc[0], @acc[0], @t[4] // ensure the result is positive + eor @acc[1], @acc[1], @t[4] + adds @acc[0], @acc[0], @t[4], lsr#63 + eor @acc[2], @acc[2], @t[4] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[4] + adcs @acc[2], @acc[2], xzr + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adc @acc[3], @acc[3], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + + eor $f0, $f0, @t[4] // adjust |f/g| accordingly + eor $g0, $g0, @t[4] + sub $f0, $f0, @t[4] + sub $g0, $g0, @t[4] + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +___ + +{ +my @a = @acc[0..3]; +my @b = @acc[4..7]; +my ($fg0, $fg1, $bias) = ($g0, $g1, @t[4]); + 
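+# A short note on the packed |f|/|g| encoding used by __inner_loop_31_256
+# below (with B = 2^31-1): each of fg0/fg1 holds (f+B) | ((g+B) << 32), so
+#
+#   fg0 - fg1 + (B | B<<32)   # updates f0-f1 and g0-g1 in one subtraction
+#   fg1 + fg1 - (B | B<<32)   # doubles f1 and g1 in one addition
+#
+# and the bias B is stripped from both 32-bit halves once the loop is done.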
+$code.=<<___; +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*6] + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*4] + +.Lab_approximation_31_256_loaded: + orr @t[0], @a[3], @b[3] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[1], ne + orr @t[0], @a[3], @b[3] // and ones before top-most, ... + csel @b[2], @b[2], @b[1], ne + + cmp @t[0], #0 + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + csel @a[2], @a[2], @a[0], ne + orr @t[0], @a[3], @b[3] // and one more, ... + csel @b[2], @b[2], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[3], @a[3], @a[2], ne + csel @b[3], @b[3], @b[2], ne + neg @t[1], @t[0] + + lslv @a[3], @a[3], @t[0] // align high limbs to the left + lslv @b[3], @b[3], @t[0] + lsrv @a[2], @a[2], @t[1] + lsrv @b[2], @b[2], @t[1] + and @a[2], @a[2], @t[1], asr#6 + and @b[2], @b[2], @t[1], asr#6 + orr $a_lo, @a[3], @a[2] + orr $b_lo, @b[3], @b[2] + + bfxil $a_lo, @a[0], #0, #31 + bfxil $b_lo, @b[0], #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov $cnt, #31 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $fg1 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + cbnz $cnt, .Loop_31_256 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, $bias + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62_256: + sbfx @t[3], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + and @t[0], $b_lo, @t[3] + sub @t[1], $b_lo, $a_lo // |b_|-|a_| + subs @t[2], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $a_lo, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov @t[1], $g0 + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + lsr $a_lo, $a_lo, #1 + and @t[0], $f1, @t[3] + and @t[1], $g1, @t[3] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz $cnt, .Loop_62_256
+
+ ret
+.size __inner_loop_62_256,.-__inner_loop_62_256
+___
+}
+
+foreach(split("\n",$code)) {
+ s/\b(smaddl\s+x[0-9]+,\s)x([0-9]+,\s+)x([0-9]+)/$1w$2w$3/;
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/blst/asm/ct_inverse_mod_256-x86_64.pl b/blst/asm/ct_inverse_mod_256-x86_64.pl
new file mode 100755
index 0000000..24ab545
--- /dev/null
+++ b/blst/asm/ct_inverse_mod_256-x86_64.pl
@@ -0,0 +1,837 @@
+#!/usr/bin/env perl
+#
+# Copyright Supranational LLC
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Both constant-time and fast Euclidean inversion as suggested in
+# https://eprint.iacr.org/2020/972. ~5.300 cycles on Coffee Lake.
+#
+# void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod,
+# const vec256 modx);
+#
+$python_ref.=<<'___';
+def ct_inverse_mod_256(inp, mod):
+    a, u = inp, 1
+    b, v = mod, 0
+
+    k = 31
+    mask = (1 << k) - 1
+
+    for i in range(0, 512 // k - 1):
+        # __ab_approximation_31
+        n = max(a.bit_length(), b.bit_length())
+        if n < 64:
+            a_, b_ = a, b
+        else:
+            a_ = (a & mask) | ((a >> (n-k-2)) << k)
+            b_ = (b & mask) | ((b >> (n-k-2)) << k)
+
+        # __inner_loop_31
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, k):
+            if a_ & 1:
+                if a_ < b_:
+                    a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0
+                a_, f0, g0 = a_-b_, f0-f1, g0-g1
+            a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1
+
+        # __smulq_256_n_shift_by_31
+        a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k
+        if a < 0:
+            a, f0, g0 = -a, -f0, -g0
+        if b < 0:
+            b, f1, g1 = -b, -f1, -g1
+
+        # __smulq_512x63
+        u, v = u*f0 + v*g0, u*f1 + v*g1
+
+    if 512 % k + k:
+        f0, g0, f1, g1 = 1, 0, 0, 1
+        for j in range(0, 512 % k + k):
+            if a & 1:
+                if a < b:
+                    a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0
+                a, f0, g0 = a-b, f0-f1, g0-g1
+            a, f1, g1 = a >> 1, f1 << 1, g1 << 1
+
+        v = u*f1 + v*g1
+
+    mod <<= 512 - mod.bit_length() # align to the left
+    if v < 0:
+        v += mod
+    if v < 0:
+        v += mod
+    elif v == 1<<512:
+        v -= mod
+
+    return v & (2**512 - 1) # to be reduced % mod
+___
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+ or die "can't call $xlate: $!";
+
+my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx");
+my @acc = map("%r$_",(8..15));
+my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13");
+my $cnt = "%edx";
+
+$frame = 8*6+2*512;
+
+$code.=<<___;
+.text
+
+.globl ct_inverse_mod_256
+.type ct_inverse_mod_256,\@function,4,"unwind"
+.align 32
+ct_inverse_mod_256:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$$frame, %rsp
+.cfi_adjust_cfa_offset $frame
+.cfi_end_prologue
+
+ lea 8*6+511(%rsp), %rax # find closest 512-byte-aligned spot
+ and \$-512, %rax # in the frame...
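+ # The 2*512-byte scratch holds two 256-byte images of |a|b|u|v|; the code
+ # below flip-flops between them by XOR-ing the source pointer with 256
+ # (or with 256+8*8 once the pointer has been advanced to |u|), hence the
+ # 512-byte alignment of %rax above.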
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + + mov 8*0($n_ptr), @acc[4] # load modulus + mov 8*1($n_ptr), @acc[5] + mov 8*2($n_ptr), @acc[6] + mov 8*3($n_ptr), @acc[7] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + + mov @acc[4], 8*4(%rax) # copy modulus to |b| + mov @acc[5], 8*5(%rax) + mov @acc[6], 8*6(%rax) + mov @acc[7], 8*7(%rax) + mov %rax, $in_ptr + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*0(%rsp) # corrected |f0| + #mov $g0, 8*1(%rsp) # corrected |g0| + mov $f0, 8*8($out_ptr) # initialize |u| with |f0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + mov $f0, 8*9($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + #mov $f0, 8*2(%rsp) # corrected |f1| + #mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*8($in_ptr), @acc[0] # |u| + mov 8*13($in_ptr), @acc[4] # |v| + mov @acc[0], @acc[1] + imulq 8*0(%rsp), @acc[0] # |u|*|f0| + mov @acc[4], @acc[5] + imulq 8*1(%rsp), @acc[4] # |v|*|g0| + add @acc[4], @acc[0] + mov @acc[0], 8*4($out_ptr) # destination |u| + sar \$63, @acc[0] # sign extension + mov @acc[0], 8*5($out_ptr) + mov @acc[0], 8*6($out_ptr) + mov @acc[0], 8*7($out_ptr) + mov @acc[0], 8*8($out_ptr) + lea 8*8($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + imulq $f0, @acc[1] # |u|*|f1| + imulq $g0, @acc[5] # |v|*|g1| + add @acc[5], @acc[1] + mov @acc[1], 8*9($out_ptr) # destination |v| + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + mov @acc[1], 8*12($out_ptr) + mov @acc[1], 8*13($out_ptr) +___ +for($i=2; $i<15; $i++) { +my $smul_512x63 = $i>8 ? 
"__smulq_512x63" + : "__smulq_256x63"; +$code.=<<___; + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + mov $f1, 8*2(%rsp) + mov $g1, 8*3(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_256_n_shift_by_31 + mov $f0, 8*0(%rsp) # corrected |f0| + mov $g0, 8*1(%rsp) # corrected |g0| + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*4($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_256_n_shift_by_31 + mov $f0, 8*2(%rsp) # corrected |f1| + mov $g0, 8*3(%rsp) # corrected |g1| + + mov 8*0(%rsp), $f0 # |f0| + mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*4($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_256x63 + + mov 8*2(%rsp), $f0 # |f1| + mov 8*3(%rsp), $g0 # |g1| + lea 8*5($out_ptr),$out_ptr # pointer to destination |v| + call $smul_512x63 +___ +$code.=<<___ if ($i==8); + sar \$63, %rbp # sign extension + mov %rbp, 8*5($out_ptr) + mov %rbp, 8*6($out_ptr) + mov %rbp, 8*7($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*8, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$47, $cnt # 31 + 512 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*4($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62_256 + #mov $f0, 8*0(%rsp) + #mov $g0, 8*1(%rsp) + #mov $f1, 8*2(%rsp) + #mov $g1, 8*3(%rsp) + + #mov 8*0(%rsp), $f0 # |f0| + #mov 8*1(%rsp), $g0 # |g0| + lea 8*8($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_256x63 + + #mov 8*2(%rsp), $f0 # |f1| + #mov 8*3(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original |out_ptr| + call __smulq_512x63 + adc %rbp, %rdx # the excess limb of the result + + mov 8*5(%rsp), $in_ptr # original |nx_ptr| + mov %rdx, %rax + sar \$63, %rdx # result's sign as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + add @acc[0], @acc[4] # conditionally add |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + adc \$0, %rax + + mov %rax, %rdx + neg %rax + or %rax, %rdx # excess bit or sign as mask + sar \$63, %rax # excess bit as mask + + mov %rdx, @acc[0] # mask |modulus| + mov %rdx, @acc[1] + and 8*0($in_ptr), @acc[0] + mov %rdx, @acc[2] + and 8*1($in_ptr), @acc[1] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), %rdx + + xor %rax, @acc[0] # conditionally negate |modulus| + xor %rcx, %rcx + xor %rax, @acc[1] + sub %rax, %rcx + xor %rax, @acc[2] + xor %rax, %rdx + add %rcx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, %rdx + + add @acc[0], @acc[4] # final adjustment for |modulus|<<256 + adc @acc[1], @acc[5] + adc @acc[2], @acc[6] + adc %rdx, @acc[7] + + mov @acc[4], 8*4($out_ptr) # store absolute value + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp 
+.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. "NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +$code.=<<___; +.type __smulq_512x63,\@abi-omnipotent +.align 32 +__smulq_512x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), %rbp # sign limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, %rbp + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, %rbp + + mulq %rbx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov @acc[$i], 8*$i($out_ptr) + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, %rbp + neg %rbp + mulq %rbx + add %rax, @acc[3] + adc %rdx, %rbp + mov @acc[3], 8*3($out_ptr) + + mov 8*5($in_ptr), @acc[0] # load |v| + mov 8*6($in_ptr), @acc[1] + mov 8*7($in_ptr), @acc[2] + mov 8*8($in_ptr), @acc[3] + mov 8*9($in_ptr), @acc[4] + mov 8*10($in_ptr), @acc[5] + mov 8*11($in_ptr), @acc[6] + mov 8*12($in_ptr), @acc[7] + + mov $g0, $f0 + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $g0 # conditionally negate |g0| + add %rax, $g0 + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + + mulq $g0 + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<7; $i++) { +$code.=<<___; + mulq $g0 + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + imulq $g0 + add %rax, @acc[7] + adc \$0, %rdx # used in the final step + + mov %rbp, %rbx + sar \$63, %rbp # sign extension + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc %rbx, @acc[4] + adc %rbp, @acc[5] + adc %rbp, @acc[6] + adc %rbp, @acc[7] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + mov @acc[6], 
8*6($out_ptr) + mov @acc[7], 8*7($out_ptr) + + ret +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,\@abi-omnipotent +.align 32 +__smulq_256x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*5*$j; +my @acc=@acc; @acc=@acc[4..7] if($j); +my $top="%rbp"; $top=$g0 if($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), $top # sign/excess limb + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |u| (or |v|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, $top + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, $top + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + and %rbx, $top + neg $top + mulq %rbx + add %rax, @acc[3] + adc %rdx, $top +___ +$code.=<<___ if ($j==0); + mov $g0, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] # accumulate |u|*|f0| + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc %rcx, %rbp + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov %rbp, 8*4($out_ptr) + + ret +.size __smulq_256x63,.-__smulq_256x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
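+#
+# In the $python_ref notation at the top of this file, one call to
+# __smulq_256_n_shift_by_31 with a given factor pair roughly amounts to
+#
+#   a = (a*f0 + b*g0) >> 31      # the discarded low bits are zero
+#   if a < 0: a, f0, g0 = -a, -f0, -g0
+#
+# i.e. the adjusted |f0|/|g0| are handed back to the caller as well.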
+{ +$code.=<<___; +.type __smulq_256_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulq_256_n_shift_by_31: + mov $f0, 8*0($out_ptr) # offload |f0| + mov $g0, 8*1($out_ptr) # offload |g0| + mov $f0, %rbp +___ +for($j=0; $j<2; $j++) { +my $k = 8*4*$j; +my @acc=@acc; @acc=@acc[4..7] if ($j); +my $f0="%rbp"; $f0=$g0 if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + + mov $f0, %rbx + sar \$63, $f0 # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $f0, %rbx # conditionally negate |f0| (or |g0|) + add %rax, %rbx + + xor $f0, @acc[0] # conditionally negate |a| (or |b|) + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mulq %rbx + mov %rax, @acc[0] + mov @acc[1], %rax + and %rbx, $f0 + neg $f0 + mov %rdx, @acc[1] +___ +for($i=1; $i<3; $i++) { +$code.=<<___; + mulq %rbx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + mulq %rbx + add %rax, @acc[3] + adc %rdx, $f0 +___ +} +$code.=<<___; + add @acc[4], @acc[0] + adc @acc[5], @acc[1] + adc @acc[6], @acc[2] + adc @acc[7], @acc[3] + adc $g0, %rbp + + mov 8*0($out_ptr), $f0 # restore original |f0| + mov 8*1($out_ptr), $g0 # restore original |g0| + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, %rbp, @acc[3] + + sar \$63, %rbp # sign as mask + xor %rax, %rax + sub %rbp, %rax # sign as bit + + xor %rbp, @acc[0] # conditionally negate the result + xor %rbp, @acc[1] + xor %rbp, @acc[2] + xor %rbp, @acc[3] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + + xor %rbp, $f0 # conditionally negate |f0| + xor %rbp, $g0 # conditionally negate |g0| + add %rax, $f0 + add %rax, $g0 + + ret +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +___ +} + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31_256,\@abi-omnipotent +.align 32 +__ab_approximation_31_256: + mov 8*3($in_ptr), @a[2] # load |a| in reverse order + mov 8*7($in_ptr), @b[2] # load |b| in reverse order + mov 8*2($in_ptr), @a[1] + mov 8*6($in_ptr), @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*5($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*4($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + not %rax + and %rax, @a[2] + and %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31_256 + + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +___ +} +$code.=<<___; +.type __inner_loop_31_256,\@abi-omnipotent +.align 32 # comment and punish Coffee Lake by up to 40% +__inner_loop_31_256: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31_256: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31_256 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,\@abi-omnipotent +.align 32 +__inner_loop_62_256: + mov $cnt, %r15d + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov $f0, $g1 # |g1|=1 + mov $f0, %r14 + +.Loop_62_256: + xor $t0, $t0 + test %r14, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test %r14, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, %r15d + jnz .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ct_inverse_mod_384-armv8.pl b/blst/asm/ct_inverse_mod_384-armv8.pl new file mode 100755 index 0000000..268bf9d --- /dev/null +++ b/blst/asm/ct_inverse_mod_384-armv8.pl @@ -0,0 +1,610 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific FLT addition chain... 
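+#
+# For orientation: with k = 62 the reference below performs 766//62 = 12
+# outer iterations of 62 inner steps plus a 766%62 = 22-step tail, i.e.
+# 12*62 + 22 = 766 = 2*383 bits in total.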
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smul_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smul_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = map("x$_", (0..3)); +my @acc=map("x$_",(3..14)); +my ($f0, $g0, $f1, $g1, $f_, $g_) = map("x$_",(15..17,19..21)); +my $cnt = $n_ptr; +my @t = map("x$_",(22..28,2)); +my ($a_lo, $a_hi, $b_lo, $b_hi) = @acc[0,5,6,11]; + +$frame = 16+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @t[0], @acc[1], [$in_ptr,#8*0] + ldp @acc[2], @acc[3], [$in_ptr,#8*2] + ldp @acc[4], @acc[5], [$in_ptr,#8*4] + + add $in_ptr, sp, #16+511 // find closest 512-byte-aligned spot + and $in_ptr, $in_ptr, #-512 // in the frame... 
+ stp $out_ptr, $nx_ptr, [sp]
+
+ ldp @acc[6], @acc[7], [$n_ptr,#8*0]
+ ldp @acc[8], @acc[9], [$n_ptr,#8*2]
+ ldp @acc[10], @acc[11], [$n_ptr,#8*4]
+
+ stp @t[0], @acc[1], [$in_ptr,#8*0] // copy input to |a|
+ stp @acc[2], @acc[3], [$in_ptr,#8*2]
+ stp @acc[4], @acc[5], [$in_ptr,#8*4]
+ stp @acc[6], @acc[7], [$in_ptr,#8*6] // copy modulus to |b|
+ stp @acc[8], @acc[9], [$in_ptr,#8*8]
+ stp @acc[10], @acc[11], [$in_ptr,#8*10]
+
+ ////////////////////////////////////////// first iteration
+ mov $cnt, #62
+ bl .Lab_approximation_62_loaded
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_383_n_shift_by_62
+ str $f0,[$out_ptr,#8*12] // initialize |u| with |f0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*6 // pointer to dst |b|
+ bl __smul_383_n_shift_by_62
+ str $f0, [$out_ptr,#8*12] // initialize |v| with |f1|
+
+ ////////////////////////////////////////// second iteration
+ eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
+ mov $cnt, #62
+ bl __ab_approximation_62
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_383_n_shift_by_62
+ mov $f_, $f0 // corrected |f0|
+ mov $g_, $g0 // corrected |g0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
+ bl __smul_383_n_shift_by_62
+
+ ldr @acc[4], [$in_ptr,#8*12] // |u|
+ ldr @acc[5], [$in_ptr,#8*18] // |v|
+ mul @acc[0], $f_, @acc[4] // |u|*|f0|
+ smulh @acc[1], $f_, @acc[4]
+ mul @acc[2], $g_, @acc[5] // |v|*|g0|
+ smulh @acc[3], $g_, @acc[5]
+ adds @acc[0], @acc[0], @acc[2]
+ adc @acc[1], @acc[1], @acc[3]
+ stp @acc[0], @acc[1], [$out_ptr,#8*6]
+ asr @acc[2], @acc[1], #63 // sign extension
+ stp @acc[2], @acc[2], [$out_ptr,#8*8]
+ stp @acc[2], @acc[2], [$out_ptr,#8*10]
+
+ mul @acc[0], $f0, @acc[4] // |u|*|f1|
+ smulh @acc[1], $f0, @acc[4]
+ mul @acc[2], $g0, @acc[5] // |v|*|g1|
+ smulh @acc[3], $g0, @acc[5]
+ adds @acc[0], @acc[0], @acc[2]
+ adc @acc[1], @acc[1], @acc[3]
+ stp @acc[0], @acc[1], [$out_ptr,#8*12]
+ asr @acc[2], @acc[1], #63 // sign extension
+ stp @acc[2], @acc[2], [$out_ptr,#8*14]
+ stp @acc[2], @acc[2], [$out_ptr,#8*16]
+___
+for($i=2; $i<11; $i++) {
+$code.=<<___;
+ eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
+ mov $cnt, #62
+ bl __ab_approximation_62
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ bl __smul_383_n_shift_by_62
+ mov $f_, $f0 // corrected |f0|
+ mov $g_, $g0 // corrected |g0|
+
+ mov $f0, $f1 // |f1|
+ mov $g0, $g1 // |g1|
+ add $out_ptr, $out_ptr, #8*6 // pointer to destination |b|
+ bl __smul_383_n_shift_by_62
+
+ add $out_ptr, $out_ptr, #8*6 // pointer to destination |u|
+ bl __smul_383x63
+
+ mov $f_, $f0 // corrected |f1|
+ mov $g_, $g0 // corrected |g1|
+ add $out_ptr, $out_ptr, #8*6 // pointer to destination |v|
+ bl __smul_383x63
+___
+$code.=<<___ if ($i>5);
+ bl __smul_767x63_tail
+___
+$code.=<<___ if ($i==5);
+ asr @t[5], @t[5], #63 // sign extension
+ stp @t[5], @t[5], [$out_ptr,#8*6]
+ stp @t[5], @t[5], [$out_ptr,#8*8]
+ stp @t[5], @t[5], [$out_ptr,#8*10]
+___
+}
+$code.=<<___;
+ ////////////////////////////////////////// iteration before last
+ eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v|
+ mov $cnt, #62
+ //bl __ab_approximation_62 // |a| and |b| are exact,
+ ldp $a_lo, $a_hi, [$in_ptr,#8*0] // just load
+ ldp $b_lo, $b_hi, [$in_ptr,#8*6]
+ bl __inner_loop_62
+
+ eor $out_ptr, $in_ptr, #256 // pointer to dst |a|b|u|v|
+ str $a_lo, [$out_ptr,#8*0]
+ str $b_lo, [$out_ptr,#8*6]
+
+ mov $f_, $f0 // exact |f0|
+ mov $g_, $g0 //
exact |g0| + mov $f0, $f1 + mov $g0, $g1 + add $out_ptr, $out_ptr, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov $f_, $f0 // exact |f1| + mov $g_, $g0 // exact |g1| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor $in_ptr, $in_ptr, #256 // flip-flop src |a|b|u|v| + mov $cnt, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr $a_lo, [$in_ptr,#8*0] // just load + eor $a_hi, $a_hi, $a_hi + ldr $b_lo, [$in_ptr,#8*6] + eor $b_hi, $b_hi, $b_hi + bl __inner_loop_62 + + mov $f_, $f1 + mov $g_, $g1 + ldp $out_ptr, $f0, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr @t[0], @acc[5], #63 // sign as mask + ldp @acc[6], @acc[7], [$f0,#8*0] + ldp @acc[8], @acc[9], [$f0,#8*2] + ldp @acc[10], @acc[11], [$f0,#8*4] + + and @acc[6], @acc[6], @t[0] // add mod<<384 conditionally + and @acc[7], @acc[7], @t[0] + adds @acc[0], @acc[0], @acc[6] + and @acc[8], @acc[8], @t[0] + adcs @acc[1], @acc[1], @acc[7] + and @acc[9], @acc[9], @t[0] + adcs @acc[2], @acc[2], @acc[8] + and @acc[10], @acc[10], @t[0] + adcs @acc[3], @acc[3], @acc[9] + and @acc[11], @acc[11], @t[0] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @acc[11] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
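+//
+// Roughly, in the notation of the reference at the top of this file:
+// __smul_383x63 accumulates u*f + v*g for signed single-limb f and g (it
+// is called once with the f0/g0 pair and once with f1/g1 per iteration),
+// while __smul_383_n_shift_by_62 returns (a*f + b*g) >> 62 and negates
+// the result together with f and g if the sum came out negative.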
+.type __smul_383x63, %function +.align 5 +__smul_383x63: +___ +for($j=0; $j<2; $j++) { +my $f_ = $f_; $f_ = $g_ if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*12+8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |u| (or |v|) + asr $f1, $f_, #63 // |f_|'s sign as mask (or |g_|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $f_, $f_, $f1 // conditionally negate |f_| (or |g_|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], $f1 // conditionally negate |u| (or |v|) + sub $f_, $f_, $f1 + eor @acc[1], @acc[1], $f1 + adds @acc[0], @acc[0], $f1, lsr#63 + eor @acc[2], @acc[2], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $f_ + eor @acc[5], @acc[5], $f1 + umulh @t[1], @acc[1], $f_ + adcs @acc[4], @acc[4], xzr + umulh @t[2], @acc[2], $f_ + adcs @acc[5], @acc[5], xzr + umulh @t[3], @acc[3], $f_ +___ +$code.=<<___ if ($j); + adc $g1, xzr, xzr // used in __smul_767x63_tail +___ +$code.=<<___; + umulh @t[4], @acc[4], $f_ + mul @acc[0], @acc[0], $f_ + mul @acc[1], @acc[1], $f_ + mul @acc[2], @acc[2], $f_ + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $f_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $f_ + adcs @acc[3], @acc[3], @t[2] + mul @t[5+$j],@acc[5], $f_ + adcs @acc[4], @acc[4], @t[3] + adcs @t[5+$j],@t[5+$j],@t[4] +___ +$code.=<<___ if ($j==0); + adc @t[7], xzr, xzr +___ +} +$code.=<<___; + adc @t[7], @t[7], xzr + + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], @acc[10] + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adcs @t[5], @t[5], @t[6] + stp @acc[4], @t[5], [$out_ptr,#8*4] + adc @t[6], @t[7], xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh @t[5], @acc[5], $f_ + ldp @acc[0], @acc[1], [$in_ptr,#8*24] // load rest of |v| + umulh @acc[11],@acc[11], $g_ + ldp @acc[2], @acc[3], [$in_ptr,#8*26] + ldp @acc[4], @acc[5], [$in_ptr,#8*28] + + eor @acc[0], @acc[0], $f1 // conditionally negate rest of |v| + eor @acc[1], @acc[1], $f1 + eor @acc[2], @acc[2], $f1 + adds @acc[0], @acc[0], $g1 + eor @acc[3], @acc[3], $f1 + adcs @acc[1], @acc[1], xzr + eor @acc[4], @acc[4], $f1 + adcs @acc[2], @acc[2], xzr + eor @acc[5], @acc[5], $f1 + adcs @acc[3], @acc[3], xzr + umulh @t[0], @acc[0], $g_ + adcs @acc[4], @acc[4], xzr + umulh @t[1], @acc[1], $g_ + adc @acc[5], @acc[5], xzr + + umulh @t[2], @acc[2], $g_ + add @acc[11], @acc[11], @t[6] + umulh @t[3], @acc[3], $g_ + asr @t[6], @t[5], #63 + umulh @t[4], @acc[4], $g_ + mul @acc[0], @acc[0], $g_ + mul @acc[1], @acc[1], $g_ + mul @acc[2], @acc[2], $g_ + adds @acc[0], @acc[0], @acc[11] + mul @acc[3], @acc[3], $g_ + adcs @acc[1], @acc[1], @t[0] + mul @acc[4], @acc[4], $g_ + adcs @acc[2], @acc[2], @t[1] + mul @acc[5], @acc[5], $g_ + adcs @acc[3], @acc[3], @t[2] + adcs @acc[4], @acc[4], @t[3] + adc @acc[5], @acc[5], @t[4] + + adds @acc[0], @acc[0], @t[5] + adcs @acc[1], @acc[1], @t[6] + adcs @acc[2], @acc[2], @t[6] + adcs @acc[3], @acc[3], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*6] + adcs @acc[4], @acc[4], @t[6] + stp @acc[2], @acc[3], [$out_ptr,#8*8] + adc @acc[5], @acc[5], @t[6] + stp @acc[4], @acc[5], [$out_ptr,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, 
%function +.align 5 +__smul_383_n_shift_by_62: +___ +for($j=0; $j<2; $j++) { +my $f0 = $f0; $f0 = $g0 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |a| (or |b|) + asr @t[6], $f0, #63 // |f0|'s sign as mask (or |g0|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor @t[7], $f0, @t[6] // conditionally negate |f0| (or |g0|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |a| (or |b|) + sub @t[7], @t[7], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], @t[7] + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], @t[7] + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], @t[7] + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], @t[7] + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], @t[7] + smulh @t[5+$j], @acc[5], @t[7] + mul @acc[0], @acc[0], @t[7] + mul @acc[1], @acc[1], @t[7] + mul @acc[2], @acc[2], @t[7] + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], @t[7] + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], @t[7] + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], @t[7] + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], xzr +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #62 + extr @acc[1], @acc[2], @acc[1], #62 + extr @acc[2], @acc[3], @acc[2], #62 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #62 + extr @acc[4], @acc[5], @acc[4], #62 + extr @acc[5], @acc[6], @acc[5], #62 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + eor $f0, $f0, @t[6] + eor $g0, $g0, @t[6] + sub $f0, $f0, @t[6] + sub $g0, $g0, @t[6] + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; + +$code.=<<___; +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp @a[4], @a[5], [$in_ptr,#8*4] + ldp @b[4], @b[5], [$in_ptr,#8*10] + ldp @a[2], @a[3], [$in_ptr,#8*2] + ldp @b[2], @b[3], [$in_ptr,#8*8] + +.Lab_approximation_62_loaded: + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + ldp @a[0], @a[1], [$in_ptr,#8*0] + ldp @b[0], @b[1], [$in_ptr,#8*6] + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... 
+ csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[1], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr @a[5], @a[5], @a[4] + orr @b[5], @b[5], @b[4] + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov $f0, #1 // |f0|=1 + mov $g0, #0 // |g0|=0 + mov $f1, #0 // |f1|=0 + mov $g1, #1 // |g1|=1 + +.Loop_62: + sbfx @t[6], $a_lo, #0, #1 // if |a_| is odd, then we'll be subtracting + sub $cnt, $cnt, #1 + subs @t[2], $b_lo, $a_lo // |b_|-|a_| + and @t[0], $b_lo, @t[6] + sbc @t[3], $b_hi, $a_hi + and @t[1], $b_hi, @t[6] + subs @t[4], $a_lo, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov @t[0], $f0 + sbcs @t[5], $a_hi, @t[1] + mov @t[1], $g0 + csel $b_lo, $b_lo, $a_lo, hs // |b_| = |a_| + csel $b_hi, $b_hi, $a_hi, hs + csel $a_lo, @t[4], @t[2], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $a_hi, @t[5], @t[3], hs + csel $f0, $f0, $f1, hs // exchange |f0| and |f1| + csel $f1, $f1, @t[0], hs + csel $g0, $g0, $g1, hs // exchange |g0| and |g1| + csel $g1, $g1, @t[1], hs + extr $a_lo, $a_hi, $a_lo, #1 + lsr $a_hi, $a_hi, #1 + and @t[0], $f1, @t[6] + and @t[1], $g1, @t[6] + add $f1, $f1, $f1 // |f1|<<=1 + add $g1, $g1, $g1 // |g1|<<=1 + sub $f0, $f0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $g0, $g0, @t[1] // |g0|-=|g1| (or |g0-=0| ...) + cbnz $cnt, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ + +print $code; +close STDOUT; diff --git a/blst/asm/ct_is_square_mod_384-armv8.pl b/blst/asm/ct_is_square_mod_384-armv8.pl new file mode 100755 index 0000000..dcf3ff8 --- /dev/null +++ b/blst/asm/ct_is_square_mod_384-armv8.pl @@ -0,0 +1,398 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >12x better [on +# Cortex cores] than modulus-specific Legendre symbol addition chain... 
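+#
+# The low bit of |L| accumulates the sign of the Jacobi symbol (inp|mod)
+# while a binary-GCD-style reduction runs: swapping two odd operands flips
+# the sign when both are 3 mod 4 (their "second bits"), and every halving
+# of |a| flips it when |b| mod 8 is 3 or 5, per the (2|b) rule.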
+# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +my ($in_ptr, $out_ptr, $L) = map("x$_", (0..2)); +my @acc=map("x$_",(3..14)); +my ($cnt, $f0, $g0, $f1, $g1) = map("x$_",(15..17,19..20)); +my @t = map("x$_",(21..28)); +my ($a_, $b_) = @acc[5,11]; + +$frame = 2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + paciasp + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #$frame + + ldp @acc[0], @acc[1], [x0,#8*0] // load input + ldp @acc[2], @acc[3], [x0,#8*2] + ldp @acc[4], @acc[5], [x0,#8*4] + + add $in_ptr, sp, #255 // find closest 256-byte-aligned spot + and $in_ptr, $in_ptr, #-256 // in the frame... 
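+	// scratch layout: |b| at +0, |a| at +8*6; the two working copies sit
+	// 128 bytes apart, so the "eor ..., #128" below flip-flops between
+	// source and destination without extra pointer arithmetic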
+ + ldp @acc[6], @acc[7], [x1,#8*0] // load modulus + ldp @acc[8], @acc[9], [x1,#8*2] + ldp @acc[10], @acc[11], [x1,#8*4] + + stp @acc[0], @acc[1], [$in_ptr,#8*6] // copy input to |a| + stp @acc[2], @acc[3], [$in_ptr,#8*8] + stp @acc[4], @acc[5], [$in_ptr,#8*10] + stp @acc[6], @acc[7], [$in_ptr,#8*0] // copy modulus to |b| + stp @acc[8], @acc[9], [$in_ptr,#8*2] + stp @acc[10], @acc[11], [$in_ptr,#8*4] + + eor $L, $L, $L // init the Legendre symbol + mov $cnt, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub $cnt, $cnt, #1 + + eor $out_ptr, $in_ptr, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov $f1, $f0 // |f0| + mov $g1, $g0 // |g0| + add $out_ptr, $out_ptr, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp @acc[6], @acc[7], [$out_ptr,#-8*6] + eor $in_ptr, $in_ptr, #128 // flip-flop src |a|b| + and @t[6], @t[6], @acc[6] // if |a| was negative, + add $L, $L, @t[6], lsr#1 // adjust |L| + + cbnz $cnt, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr $a_, [$in_ptr,#8*6] // just load + mov $b_, @acc[6] // ldr $b_, [$in_ptr,#8*0] + mov $cnt, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, $L, #1 + eor x0, x0, #1 + + add sp, sp, #$frame + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + autiasp + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +my $fx = $g1; $fx = $f1 if ($j); +my @acc = @acc; @acc = @acc[6..11] if ($j); +my $k = 8*6*$j; +$code.=<<___; + ldp @acc[0], @acc[1], [$in_ptr,#8*0+$k] // load |b| (or |a|) + asr @t[6], $fx, #63 // |g1|'s sign as mask (or |f1|'s) + ldp @acc[2], @acc[3], [$in_ptr,#8*2+$k] + eor $fx, $fx, @t[6] // conditionally negate |g1| (or |f1|) + ldp @acc[4], @acc[5], [$in_ptr,#8*4+$k] + + eor @acc[0], @acc[0], @t[6] // conditionally negate |b| (or |a|) + sub $fx, $fx, @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + umulh @t[0], @acc[0], $fx + adcs @acc[3], @acc[3], xzr + umulh @t[1], @acc[1], $fx + eor @acc[5], @acc[5], @t[6] + umulh @t[2], @acc[2], $fx + adcs @acc[4], @acc[4], xzr + umulh @t[3], @acc[3], $fx + adc @acc[5], @acc[5], xzr + + umulh @t[4], @acc[4], $fx + and @t[7], $fx, @t[6] + umulh @t[5+$j], @acc[5], $fx + neg @t[7], @t[7] + mul @acc[0], @acc[0], $fx + mul @acc[1], @acc[1], $fx + mul @acc[2], @acc[2], $fx + adds @acc[1], @acc[1], @t[0] + mul @acc[3], @acc[3], $fx + adcs @acc[2], @acc[2], @t[1] + mul @acc[4], @acc[4], $fx + adcs @acc[3], @acc[3], @t[2] + mul @acc[5], @acc[5], $fx + adcs @acc[4], @acc[4], @t[3] + adcs @acc[5], @acc[5] ,@t[4] + adc @t[5+$j], @t[5+$j], @t[7] +___ +} +$code.=<<___; + adds @acc[0], @acc[0], @acc[6] + adcs @acc[1], @acc[1], @acc[7] + adcs @acc[2], @acc[2], @acc[8] + adcs @acc[3], @acc[3], @acc[9] + adcs @acc[4], @acc[4], @acc[10] + adcs @acc[5], @acc[5], @acc[11] + adc @acc[6], @t[5], @t[6] + + extr @acc[0], @acc[1], @acc[0], #30 + extr @acc[1], @acc[2], @acc[1], #30 + extr @acc[2], @acc[3], @acc[2], #30 + asr @t[6], @acc[6], #63 + extr @acc[3], @acc[4], @acc[3], #30 + extr @acc[4], @acc[5], @acc[4], #30 + extr @acc[5], @acc[6], 
@acc[5], #30 + + eor @acc[0], @acc[0], @t[6] + eor @acc[1], @acc[1], @t[6] + adds @acc[0], @acc[0], @t[6], lsr#63 + eor @acc[2], @acc[2], @t[6] + adcs @acc[1], @acc[1], xzr + eor @acc[3], @acc[3], @t[6] + adcs @acc[2], @acc[2], xzr + eor @acc[4], @acc[4], @t[6] + adcs @acc[3], @acc[3], xzr + eor @acc[5], @acc[5], @t[6] + stp @acc[0], @acc[1], [$out_ptr,#8*0] + adcs @acc[4], @acc[4], xzr + stp @acc[2], @acc[3], [$out_ptr,#8*2] + adc @acc[5], @acc[5], xzr + stp @acc[4], @acc[5], [$out_ptr,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +___ + +{ +my @a = @acc[0..5]; +my @b = @acc[6..11]; +my ($fg0, $fg1, $bias, $cnt) = ($g0, $g1, @t[6], @t[7]); + +$code.=<<___; +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp @b[4], @b[5], [$in_ptr,#8*4] // |a| is still in registers + ldp @b[2], @b[3], [$in_ptr,#8*2] + + orr @t[0], @a[5], @b[5] // check top-most limbs, ... + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[3], ne + orr @t[0], @a[5], @b[5] // ... ones before top-most, ... + csel @b[4], @b[4], @b[3], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[2], ne + orr @t[0], @a[5], @b[5] // ... and ones before that ... + csel @b[4], @b[4], @b[2], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[1], ne + orr @t[0], @a[5], @b[5] // and one more, ... + csel @b[4], @b[4], @b[1], ne + + cmp @t[0], #0 + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + csel @a[4], @a[4], @a[0], ne + orr @t[0], @a[5], @b[5] + csel @b[4], @b[4], @b[0], ne + + clz @t[0], @t[0] + cmp @t[0], #64 + csel @t[0], @t[0], xzr, ne + csel @a[5], @a[5], @a[4], ne + csel @b[5], @b[5], @b[4], ne + neg @t[1], @t[0] + + lslv @a[5], @a[5], @t[0] // align high limbs to the left + lslv @b[5], @b[5], @t[0] + lsrv @a[4], @a[4], @t[1] + lsrv @b[4], @b[4], @t[1] + and @a[4], @a[4], @t[1], asr#6 + and @b[4], @b[4], @t[1], asr#6 + orr $a_, @a[5], @a[4] + orr $b_, @b[5], @b[4] + + bfxil $a_, @a[0], #0, #32 + bfxil $b_, @b[0], #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov $cnt, #30 + mov $fg0, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov $fg1, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov $bias,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 // L + (a_ & b_) >> 1 + mov @t[0], $fg1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $fg1, $fg1, $fg0, hs // exchange |fg0| and |fg1| + csel $fg0, $fg0, @t[0], hs + csel $L, $L, @t[4], hs + lsr $a_, $a_, #1 + and @t[0], $fg1, @t[3] + and @t[1], $bias, @t[3] + add $t[2], $b_, #2 + sub $fg0, $fg0, @t[0] // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add $fg1, $fg1, $fg1 // |f1|<<=1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add $fg0, $fg0, @t[1] + sub $fg1, $fg1, $bias + + cbnz $cnt, .Loop_30 + + mov $bias, #0x7FFFFFFF + ubfx $f0, $fg0, #0, #32 + ubfx $g0, $fg0, #32, #32 + ubfx $f1, $fg1, #0, #32 + ubfx $g1, $fg1, #32, #32 + sub $f0, $f0, $bias // remove the bias + sub $g0, $g0, $bias + sub $f1, $f1, $bias + sub $g1, $g1, 
$bias + + ret +.size __inner_loop_30,.-__inner_loop_30 +___ +} + +$code.=<<___; +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx @t[3], $a_, #0, #1 // if |a_| is odd, then we'll be subtracting + and @t[4], $a_, $b_ + sub $cnt, $cnt, #1 + and @t[0], $b_, @t[3] + sub @t[1], $b_, $a_ // |b_|-|a_| + subs @t[2], $a_, @t[0] // |a_|-|b_| (or |a_|-0 if |a_| was even) + add @t[4], $L, @t[4], lsr#1 + csel $b_, $b_, $a_, hs // |b_| = |a_| + csel $a_, @t[2], @t[1], hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel $L, $L, @t[4], hs + add $t[2], $b_, #2 + lsr $a_, $a_, #1 + add $L, $L, $t[2], lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz $cnt, .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ + +print $code; +close STDOUT; diff --git a/blst/asm/ct_is_square_mod_384-x86_64.pl b/blst/asm/ct_is_square_mod_384-x86_64.pl new file mode 100755 index 0000000..40016ed --- /dev/null +++ b/blst/asm/ct_is_square_mod_384-x86_64.pl @@ -0,0 +1,494 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast quadratic residue test as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific Legendre symbol addition chain... +# +# bool ct_is_square_mod_384(const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_is_square_mod_384(inp, mod): + a = inp + b = mod + L = 0 # only least significant bit, adding 1 makes up for sign change + + k = 30 + w = 32 + mask = (1 << w) - 1 + + for i in range(0, 768 // k - 1): + # __ab_approximation_30 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_30 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + L += (a_ & b_) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + L += (b_ + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + # __smulq_384_n_shift_by_30 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if b < 0: + b = -b + if a < 0: + a = -a + L += (b % 4) >> 1 # |b| is always odd, the second bit + # tells the whole story + + if True: + for j in range(0, 768 % k + k): + if a & 1: + if a < b: + a, b = b, a + L += (a & b) >> 1 # |a| and |b| are both odd, second bits + # tell the whole story + a = a-b + a = a >> 1 + L += (b + 2) >> 2 # if |b|%8 is 3 or 5 [out of 1,3,5,7] + + return (L & 1) ^ 1 +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr) = ("%rdi", "%rsi"); +my ($f0, $g0, $f1, $g1) = ("%rax", "%rbx", "%rdx","%rcx"); +my @acc=map("%r$_",(8..15)); +my $L = "%rbp"; + +$frame = 8*3+2*256; + +$code.=<<___; +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,\@function,2,"unwind" +.align 32 +ct_is_square_mod_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx 
+.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*3+255(%rsp), %rax # find closest 256-byte-aligned spot + and \$-256, %rax # in the frame... + + mov 8*0(%rdi), @acc[0] # load input + mov 8*1(%rdi), @acc[1] + mov 8*2(%rdi), @acc[2] + mov 8*3(%rdi), @acc[3] + mov 8*4(%rdi), @acc[4] + mov 8*5(%rdi), @acc[5] + + mov 8*0(%rsi), @acc[6] # load modulus + mov 8*1(%rsi), @acc[7] + mov 8*2(%rsi), %rbx + mov 8*3(%rsi), %rcx + mov 8*4(%rsi), %rdx + mov 8*5(%rsi), %rdi + mov %rax, $in_ptr # pointer to source |a|b| + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov %rbx, 8*8(%rax) + mov %rcx, 8*9(%rax) + mov %rdx, 8*10(%rax) + mov %rdi, 8*11(%rax) + + xor $L, $L # initialize the Legendre symbol + mov \$24, %ecx # 24 is 768/30-1 + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + mov %ecx, 8*2(%rsp) # offload loop counter + + call __ab_approximation_30 + mov $f0, 8*0(%rsp) # offload |f0| and |g0| + mov $g0, 8*1(%rsp) + + mov \$128+8*6, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |b| + call __smulq_384_n_shift_by_30 + + mov 8*0(%rsp), $f1 # pop |f0| and |g0| + mov 8*1(%rsp), $g1 + lea -8*6($out_ptr),$out_ptr # pointer to destination |a| + call __smulq_384_n_shift_by_30 + + mov 8*2(%rsp), %ecx # re-load loop counter + xor \$128, $in_ptr # flip-flop pointer to source |a|b| + + and 8*6($out_ptr), @acc[6] # if |a| was negative, adjust |L| + shr \$1, @acc[6] + add @acc[6], $L + + sub \$1, %ecx + jnz .Loop_is_square + + ################################# last iteration + #call __ab_approximation_30 # |a| and |b| are exact, just load + #mov 8*0($in_ptr), @acc[0] # |a_| + mov 8*6($in_ptr), @acc[1] # |b_| + call __inner_loop_48 # 48 is 768%30+30 + + mov \$1, %rax + and $L, %rax + xor \$1, %rax # return value + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,\@abi-omnipotent +.align 32 +__smulq_384_n_shift_by_30: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, %rbx # |f1| (or |g1|) + sar \$63, %rdx # |f1|'s sign as mask (or |g1|'s) + xor %rax, %rax + sub %rdx, %rax # |f1|'s sign as bit (or |g1|'s) + + xor %rdx, %rbx # conditionally negate |f1| (or |g1|) + add %rax, %rbx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov %rdx, @acc[6+$j] + and %rbx, @acc[6+$j] + mulq %rbx # |a|*|f1| (or |b|*|g1|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq %rbx + add %rax, 
@acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___; + neg @acc[6+$j] + mulq %rbx + add %rax, @acc[5] + adc %rdx, @acc[6+$j] +___ +$code.=<<___ if ($j==0); + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov $g1, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc @acc[7], @acc[6] + + shrd \$30, @acc[1], @acc[0] + shrd \$30, @acc[2], @acc[1] + shrd \$30, @acc[3], @acc[2] + shrd \$30, @acc[4], @acc[3] + shrd \$30, @acc[5], @acc[4] + shrd \$30, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor %rbx, %rbx + sub @acc[6], %rbx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add %rbx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +___ +{ +my ($a_, $b_) = @acc[0..1]; +my ($t0, $t1, $t2, $t3, $t4, $t5) = map("%r$_",(10..15)); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t5); +my $cnt = "%edi"; +{ +my @a = @acc[0..5]; +my @b = (@a[1..3], $t4, $t5, $g0); + +$code.=<<___; +.type __ab_approximation_30,\@abi-omnipotent +.align 32 +__ab_approximation_30: + mov 8*11($in_ptr), @b[5] # load |b| in reverse order + mov 8*10($in_ptr), @b[4] + mov 8*9($in_ptr), @b[3] + + mov @a[5], %rax + or @b[5], %rax # check top-most limbs, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[3], @a[4] + mov 8*8($in_ptr), @b[2] + cmovz @b[3], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... ones before top-most, ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[2], @a[4] + mov 8*7($in_ptr), @b[1] + cmovz @b[2], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... + cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[1], @a[4] + mov 8*6($in_ptr), @b[0] + cmovz @b[1], @b[4] + + mov @a[5], %rax + or @b[5], %rax # ... and ones before that ... 
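+	# keep sliding the two-limb window down while the top limbs of both
+	# |a| and |b| are zero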
+ cmovz @a[4], @a[5] + cmovz @b[4], @b[5] + cmovz @a[0], @a[4] + cmovz @b[0], @b[4] + + mov @a[5], %rax + or @b[5], %rax + bsr %rax, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[5] + cmovz @b[0], @b[5] + cmovz %rax, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[4], @a[5] # align second limb to the left + shldq %cl, @b[4], @b[5] + + mov \$0xFFFFFFFF00000000, %rax + mov @a[0]d, ${a_}d + mov @b[0]d, ${b_}d + and %rax, @a[5] + and %rax, @b[5] + or @a[5], ${a_} + or @b[5], ${b_} + + jmp __inner_loop_30 + + ret +.size __ab_approximation_30,.-__ab_approximation_30 +___ +} +$code.=<<___; +.type __inner_loop_30,\@abi-omnipotent +.align 32 +__inner_loop_30: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + lea -1($fg0), $bias # 0x7FFFFFFF7FFFFFFF + mov \$30, $cnt + +.Loop_30: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax # pre-"negate" |L| + mov $fg0, $t2 + mov $fg1, $t3 + mov $L, $t4 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + cmovz $t4, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + lea (%rax,$L), $L # "negate" |L| if |b|%8 is 3 or 5 + sub $bias, $fg1 + + sub \$1, $cnt + jnz .Loop_30 + + shr \$32, $bias + mov %ebx, %eax # $fg0 -> $f0 + shr \$32, $g0 + mov %ecx, %edx # $fg1 -> $f1 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,\@abi-omnipotent +.align 32 +__inner_loop_48: + mov \$48, $cnt # 48 is 768%30+30 + +.Loop_48: + mov $a_, %rax + and $b_, %rax + shr \$1, %rax # (a_ & b_) >> 1 + + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + lea (%rax,$L), %rax + mov $L, $t2 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb %rax, $L + + sub $b_, $a_ # |a_|-|b_| + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $L + + lea 2($b_), %rax + shr \$1, $a_ # |a_|>>=1 + shr \$2, %rax + add %rax, $L # "negate" |L| if |b|%8 is 3 or 5 + + sub \$1, $cnt + jnz .Loop_48 + + ret +.size __inner_loop_48,.-__inner_loop_48 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ctq_inverse_mod_384-x86_64.pl b/blst/asm/ctq_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000..2be39d8 --- /dev/null +++ b/blst/asm/ctq_inverse_mod_384-x86_64.pl @@ -0,0 +1,886 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >5x better than +# modulus-specific FLT addition chain... 
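+#
+# This is the mulq-based code path; its mulx/ADX counterpart is
+# ctx_inverse_mod_384-x86_64.pl, which takes 31-bit inner-loop steps
+# instead of the 62-bit steps used here.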
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 62 + w = 64 + mask = (1 << w) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_62 + n = max(a.bit_length(), b.bit_length()) + if n < 128: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-w)) << w) + b_ = (b & mask) | ((b >> (n-w)) << w) + + # __inner_loop_62 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulq_383_n_shift_by_62 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulq_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,\@function,4,"unwind" +.align 32 +ct_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
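+	# scratch layout: |a| at +0, |b| at +8*6, |u| at +8*12, |v| at +8*18
+	# (|v| eventually grows to a full 768-bit value); the source and
+	# destination copies sit 256 bytes apart and are flip-flopped by
+	# XOR-ing the pointer with 256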
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr # pointer to source |a|b|1|0| + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr),$in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<11; $i++) { +my $smul_767x63 = $i>5 ? 
"__smulq_767x63" + : "__smulq_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + call __ab_approximation_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulq_383_n_shift_by_62 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulq_383_n_shift_by_62 + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==5); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# iteration before last + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$62, $cnt + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + mov 8*1($in_ptr), @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + mov 8*7($in_ptr), @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + mov @acc[0], 8*0($out_ptr) + mov @acc[2], 8*6($out_ptr) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*12($out_ptr),$out_ptr # pointer to destination |u| + call __smulq_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call __smulq_767x63 + + ################################# last iteration + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$22, $cnt # 766 % 62 + #call __ab_approximation_62 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulq_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulq_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc 
@acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +___ +######################################################################## +# see corresponding commentary in ctx_inverse_mod_384-x86_64... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulq_767x63,\@abi-omnipotent +.align 32 +__smulq_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $f0 # |f0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor $f0, $fx # conditionally negate |f0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |u| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| + mov %rax, 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] + mov @acc[$i], 8*$i($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + mov @acc[5], 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + mov $f0, $fx # overrides in_ptr + sar \$63, $f0 # |g0|'s sign as mask + xor %rax, %rax + sub $f0, %rax # |g0|'s sign as bit + + xor $f0, $fx # conditionally negate |g0| + add %rax, $fx + + xor $f0, @acc[0] # conditionally negate |v| + xor $f0, @acc[1] + xor $f0, @acc[2] + xor $f0, @acc[3] + xor $f0, @acc[4] + xor $f0, @acc[5] + xor $f0, @acc[6] + xor $f0, @acc[7] + xor $f0, @acc[8] + xor $f0, @acc[9] + xor $f0, @acc[10] + xor $f0, @acc[11] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulq $fx # |v|*|g0| + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<11; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} 
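+# the tail below folds the top limb of |v|*|g0| into %rax, then adds the
+# |u|*|f0| product that was offloaded to the output buffer above,
+# sign-extending it from 8 to 12 limbs before storing the 768-bit sum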
+$code.=<<___; + mov 8*1(%rsp), %rdx # out_ptr + imulq $fx, %rax + mov 8*2(%rsp), $in_ptr # restore original in_ptr + add @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulq_767x63,.-__smulq_767x63 +___ +} +$code.=<<___; +.type __smulq_383x63,\@abi-omnipotent +.align 32 +__smulq_383x63: +___ +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |u| (or |v|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add @acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |u|*|f0| (or |v|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx, %rax + add %rax, @acc[$i] + + lea 8*6($in_ptr), $in_ptr # pointer to |v| + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx, %rax + add %rax, @acc[$i] + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulq_383x63,.-__smulq_383x63 +___ +{ +$code.=<<___; +.type __smulq_383_n_shift_by_62,\@abi-omnipotent +.align 32 +__smulq_383_n_shift_by_62: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +$code.=<<___; + mov 8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov %rdx, $fx + sar \$63, %rdx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub %rdx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor %rdx, $fx # conditionally negate |f0| (or |g0|) + add %rax, $fx + + xor %rdx, @acc[0] # conditionally negate |a| (or |b|) + xor %rdx, @acc[1] + xor %rdx, @acc[2] + xor %rdx, @acc[3] + xor %rdx, @acc[4] + xor %rdx, @acc[5] + add 
@acc[0], %rax + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulq $fx # |a|*|f0| (or |b|*|g0|) + mov %rax, @acc[0] + mov @acc[1], %rax + mov %rdx, @acc[1] +___ +for($i=1; $i<5; $i++) { +$code.=<<___; + mulq $fx + add %rax, @acc[$i] + mov @acc[$i+1], %rax + adc \$0, %rdx + mov %rdx, @acc[$i+1] +___ +} +$code.=<<___ if ($j==0); + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea 8*6($in_ptr), $in_ptr # pointer to |b| + mov %rdx, @acc[6] + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + imulq $fx + add %rax, @acc[$i] + adc \$0, %rdx + + lea -8*6($in_ptr), $in_ptr # restore original in_ptr + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$62, @acc[1], @acc[0] + shrd \$62, @acc[2], @acc[1] + shrd \$62, @acc[3], @acc[2] + shrd \$62, @acc[4], @acc[3] + shrd \$62, @acc[5], @acc[4] + shrd \$62, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4, $t5) = ("%rax","%rbx","%rbp","%r14","%r15","%rsi"); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_62,\@abi-omnipotent +.align 32 +__ab_approximation_62: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*2($in_ptr), @a[0] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*1($in_ptr), @a[0] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + mov 8*0($in_ptr), @a[0] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + jmp __inner_loop_62 + + ret +.size __ab_approximation_62,.-__ab_approximation_62 +___ +} +$code.=<<___; +.type __inner_loop_62,\@abi-omnipotent +.align 8 +.long 0 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + mov $in_ptr, 8(%rsp) + +.Loop_62: + xor $t0, $t0 + xor $t1, $t1 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t2 + mov $b_hi, $t3 + cmovnz $b_lo, $t0 + cmovnz $b_hi, $t1 + sub $a_lo, $t2 # |b_|-|a_| + sbb $a_hi, $t3 + mov $a_lo, $t4 + mov $a_hi, $t5 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + sbb $t1, $a_hi + cmovc $t2, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t3, $a_hi + cmovc $t4, $b_lo # |b_| = |a_| + cmovc $t5, $b_hi + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shrd \$1, $a_hi, $a_lo + shr \$1, $a_hi + test \$1, $t4 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + mov 8(%rsp), $in_ptr + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/ctx_inverse_mod_384-x86_64.pl b/blst/asm/ctx_inverse_mod_384-x86_64.pl new file mode 100755 index 0000000..d207e2f --- /dev/null +++ b/blst/asm/ctx_inverse_mod_384-x86_64.pl @@ -0,0 +1,995 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Both constant-time and fast Euclidean inversion as suggested in +# https://eprint.iacr.org/2020/972. Performance is >4x better than +# modulus-specific FLT addition chain... 
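+#
+# This is the mulx-based flavour (note k = 31 in the reference below,
+# versus k = 62 in the mulq-based ctq_inverse_mod_384-x86_64.pl).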
+# +# void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod); +# +$python_ref.=<<'___'; +def ct_inverse_mod_383(inp, mod): + a, u = inp, 1 + b, v = mod, 0 + + k = 31 + mask = (1 << k) - 1 + + for i in range(0, 766 // k): + # __ab_approximation_31 + n = max(a.bit_length(), b.bit_length()) + if n < 64: + a_, b_ = a, b + else: + a_ = (a & mask) | ((a >> (n-k-2)) << k) + b_ = (b & mask) | ((b >> (n-k-2)) << k) + + # __inner_loop_31 + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, k): + if a_ & 1: + if a_ < b_: + a_, b_, f0, g0, f1, g1 = b_, a_, f1, g1, f0, g0 + a_, f0, g0 = a_-b_, f0-f1, g0-g1 + a_, f1, g1 = a_ >> 1, f1 << 1, g1 << 1 + + # __smulx_383_n_shift_by_31 + a, b = (a*f0 + b*g0) >> k, (a*f1 + b*g1) >> k + if a < 0: + a, f0, g0 = -a, -f0, -g0 + if b < 0: + b, f1, g1 = -b, -f1, -g1 + + # __smulx_767x63 + u, v = u*f0 + v*g0, u*f1 + v*g1 + + if 766 % k: + f0, g0, f1, g1 = 1, 0, 0, 1 + for j in range(0, 766 % k): + if a & 1: + if a < b: + a, b, f0, g0, f1, g1 = b, a, f1, g1, f0, g0 + a, f0, g0 = a-b, f0-f1, g0-g1 + a, f1, g1 = a >> 1, f1 << 1, g1 << 1 + + v = u*f1 + v*g1 + + if v < 0: + v += mod << (768 - mod.bit_length()) # left aligned + + return v & (2**768 - 1) # to be reduced % mod +___ + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +my ($out_ptr, $in_ptr, $n_ptr, $nx_ptr) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc=(map("%r$_",(8..15)), "%rbx", "%rbp", $in_ptr, $out_ptr); +my ($f0, $g0, $f1, $g1) = ("%rdx","%rcx","%r12","%r13"); +my $cnt = "%edi"; + +$frame = 8*11+2*512; + +$code.=<<___; +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,\@function,4,"unwind" +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 8*11+511(%rsp), %rax # find closest 512-byte-aligned spot + and \$-512, %rax # in the frame... 
+ mov $out_ptr, 8*4(%rsp) + mov $nx_ptr, 8*5(%rsp) + + mov 8*0($in_ptr), @acc[0] # load input + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov 8*0($n_ptr), @acc[6] # load modulus + mov 8*1($n_ptr), @acc[7] + mov 8*2($n_ptr), @acc[8] + mov 8*3($n_ptr), @acc[9] + mov 8*4($n_ptr), @acc[10] + mov 8*5($n_ptr), @acc[11] + + mov @acc[0], 8*0(%rax) # copy input to |a| + mov @acc[1], 8*1(%rax) + mov @acc[2], 8*2(%rax) + mov @acc[3], 8*3(%rax) + mov @acc[4], 8*4(%rax) + mov @acc[5], 8*5(%rax) + + mov @acc[6], 8*6(%rax) # copy modulus to |b| + mov @acc[7], 8*7(%rax) + mov @acc[8], 8*8(%rax) + mov @acc[9], 8*9(%rax) + mov @acc[10], 8*10(%rax) + mov %rax, $in_ptr + mov @acc[11], 8*11(%rax) + + ################################# first iteration + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*7(%rsp) # corrected |f0| + #mov $g0, 8*8(%rsp) # corrected |g0| + mov $f0, 8*12($out_ptr) # initialize |u| with |f0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + mov $f0, 8*12($out_ptr) # initialize |v| with |f1| + + ################################# second iteration + xor \$256, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call __smulx_383_n_shift_by_31 + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call __smulx_383_n_shift_by_31 + #mov $f0, 8*9(%rsp) # corrected |f1| + #mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*12($in_ptr), %rax # |u| + mov 8*18($in_ptr), @acc[3] # |v| + mov $f0, %rbx + mov %rax, @acc[2] + imulq 8*7(%rsp) # |u|*|f0| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq 8*8(%rsp) # |v|*|g0| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*6($out_ptr) # destination |u| + mov @acc[1], 8*7($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*8($out_ptr) + mov @acc[1], 8*9($out_ptr) + mov @acc[1], 8*10($out_ptr) + mov @acc[1], 8*11($out_ptr) + lea 8*12($in_ptr), $in_ptr # make in_ptr "rewindable" with xor + + mov @acc[2], %rax + imulq %rbx # |u|*|f1| + mov %rax, @acc[0] + mov @acc[3], %rax + mov %rdx, @acc[1] + imulq %rcx # |v|*|g1| + add %rax, @acc[0] + adc %rdx, @acc[1] + mov @acc[0], 8*12($out_ptr) # destination |v| + mov @acc[1], 8*13($out_ptr) + sar \$63, @acc[1] # sign extension + mov @acc[1], 8*14($out_ptr) + mov @acc[1], 8*15($out_ptr) + mov @acc[1], 8*16($out_ptr) + mov @acc[1], 8*17($out_ptr) +___ +for($i=2; $i<23; $i++) { +my $smul_n_shift = $i<19 ? "__smulx_383_n_shift_by_31" + : "__smulx_191_n_shift_by_31"; +my $smul_767x63 = $i>11 ? 
"__smulx_767x63" + : "__smulx_383x63"; +$code.=<<___; + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$31, $cnt + call __ab_approximation_31 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + mov $f1, 8*9(%rsp) + mov $g1, 8*10(%rsp) + + mov \$256, $out_ptr + xor $in_ptr, $out_ptr # pointer to destination |a|b|u|v| + call $smul_n_shift + mov $f0, 8*7(%rsp) # corrected |f0| + mov $g0, 8*8(%rsp) # corrected |g0| + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr), $out_ptr # pointer to destination |b| + call $smul_n_shift + mov $f0, 8*9(%rsp) # corrected |f1| + mov $g0, 8*10(%rsp) # corrected |g1| + + mov 8*7(%rsp), $f0 # |f0| + mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + call __smulx_383x63 + + mov 8*9(%rsp), $f0 # |f1| + mov 8*10(%rsp), $g0 # |g1| + lea 8*6($out_ptr),$out_ptr # pointer to destination |v| + call $smul_767x63 +___ +$code.=<<___ if ($i==11); + sar \$63, @acc[5] # sign extension + mov @acc[5], 8*6($out_ptr) + mov @acc[5], 8*7($out_ptr) + mov @acc[5], 8*8($out_ptr) + mov @acc[5], 8*9($out_ptr) + mov @acc[5], 8*10($out_ptr) + mov @acc[5], 8*11($out_ptr) +___ +} +$code.=<<___; + ################################# two[!] last iterations in one go + xor \$256+8*12, $in_ptr # flip-flop pointer to source |a|b|u|v| + mov \$53, $cnt # 31 + 766 % 31 + #call __ab_approximation_31 # |a| and |b| are exact, just load + mov 8*0($in_ptr), @acc[0] # |a_lo| + #xor @acc[1], @acc[1] # |a_hi| + mov 8*6($in_ptr), @acc[2] # |b_lo| + #xor @acc[3], @acc[3] # |b_hi| + call __inner_loop_62 + #mov $f0, 8*7(%rsp) + #mov $g0, 8*8(%rsp) + #mov $f1, 8*9(%rsp) + #mov $g1, 8*10(%rsp) + + #mov 8*7(%rsp), $f0 # |f0| + #mov 8*8(%rsp), $g0 # |g0| + lea 8*12($in_ptr), $in_ptr # pointer to source |u|v| + #lea 8*6($out_ptr), $out_ptr # pointer to destination |u| + #call __smulx_383x63 + + #mov 8*9(%rsp), $f0 # |f1| + #mov 8*10(%rsp), $g0 # |g1| + mov $f1, $f0 + mov $g1, $g0 + mov 8*4(%rsp), $out_ptr # original out_ptr + call __smulx_767x63 + + mov 8*5(%rsp), $in_ptr # original n_ptr + mov %rax, %rdx # top limb of the result + sar \$63, %rax # result's sign as mask + + mov %rax, @acc[0] # mask |modulus| + mov %rax, @acc[1] + mov %rax, @acc[2] + and 8*0($in_ptr), @acc[0] + and 8*1($in_ptr), @acc[1] + mov %rax, @acc[3] + and 8*2($in_ptr), @acc[2] + and 8*3($in_ptr), @acc[3] + mov %rax, @acc[4] + and 8*4($in_ptr), @acc[4] + and 8*5($in_ptr), %rax + + add @acc[0], @acc[6] # conditionally add |modulus|<<384 + adc @acc[1], @acc[7] + adc @acc[2], @acc[8] + adc @acc[3], @acc[9] + adc @acc[4], %rcx + adc %rax, %rdx + + mov @acc[6], 8*6($out_ptr) # store absolute value + mov @acc[7], 8*7($out_ptr) + mov @acc[8], 8*8($out_ptr) + mov @acc[9], 8*9($out_ptr) + mov %rcx, 8*10($out_ptr) + mov %rdx, 8*11($out_ptr) + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +___ +######################################################################## +# Signed |u|*|f?|+|v|*|g?| subroutines. 
"NNN" in "NNNx63" suffix refers +# to the maximum bit-length of the *result*, and "63" - to the maximum +# bit-length of the |f?| and |g?| single-limb multiplicands. However! +# The latter should not be taken literally, as they are always chosen so +# that "bad things" don't happen. For example, there comes a point when +# |v| grows beyond 383 bits, while |u| remains 383 bits wide. Yet, we +# always call __smul_383x63 to perform |u|*|f0|+|v|*|g0| step. This is +# because past that point |f0| is always 1 and |g0| is always 0. And, +# since |u| never grows beyond 383 bits, __smul_767x63 doesn't have to +# perform full-width |u|*|f1| multiplication, half-width one with sign +# extension is sufficient... +{ +my ($out_ptr, $in_ptr, $f0, $g0) = ("%rdi", "%rsi", "%rdx", "%rcx"); +my @acc = map("%r$_",(8..15),"bx","bp","cx","di"); +my $fx = @acc[9]; + +$code.=<<___; +.type __smulx_767x63,\@abi-omnipotent +.align 32 +__smulx_767x63: + mov 8*0($in_ptr), @acc[0] # load |u| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + + mov $f0, %rax + sar \$63, %rax # |f0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |f0|'s sign as bit + + mov $out_ptr, 8*1(%rsp) + mov $in_ptr, 8*2(%rsp) + lea 8*6($in_ptr), $in_ptr # pointer to |v| + + xor %rax, $f0 # conditionally negate |f0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |u| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |u|*|f0| + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + mov %rdx, 8*6($out_ptr) + sar \$63, %rdx # sign extension + mov %rdx, 8*7($out_ptr) +___ +{ +my $fx=$in_ptr; +$code.=<<___; + mov $g0, $f0 # load |g0| + mov $g0, %rax + + mov 8*0($in_ptr), @acc[0] # load |v| + mov 8*1($in_ptr), @acc[1] + mov 8*2($in_ptr), @acc[2] + mov 8*3($in_ptr), @acc[3] + mov 8*4($in_ptr), @acc[4] + mov 8*5($in_ptr), @acc[5] + mov 8*6($in_ptr), @acc[6] + mov 8*7($in_ptr), @acc[7] + mov 8*8($in_ptr), @acc[8] + mov 8*9($in_ptr), @acc[9] + mov 8*10($in_ptr), @acc[10] + mov 8*11($in_ptr), @acc[11] + + sar \$63, %rax # |g0|'s sign as mask + xor $fx, $fx # overrides in_ptr + sub %rax, $fx # |g0|'s sign as bit + + xor %rax, $f0 # conditionally negate |g0| + add $fx, $f0 + + xor %rax, @acc[0] # conditionally negate |v| + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor %rax, @acc[5] + xor %rax, @acc[6] + xor %rax, @acc[7] + xor %rax, @acc[8] + xor %rax, @acc[9] + xor %rax, @acc[10] + xor %rax, @acc[11] + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] + adc \$0, @acc[7] + adc \$0, @acc[8] + adc \$0, @acc[9] + adc \$0, @acc[10] + adc \$0, @acc[11] + + mulx @acc[0], @acc[0], %rax # |v|*|g0| + mulx @acc[1], @acc[1], $fx + add %rax, @acc[1] +___ +for(my ($a,$b) = ("%rax", $fx), $i=2; $i<11; $i++) { +$code.=<<___; + mulx 
@acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___; + mulx @acc[11], @acc[11], $fx + mov 8*1(%rsp), %rdx # out_ptr + mov 8*2(%rsp), $in_ptr # restore original in_ptr + adc @acc[11], %rax + + add 8*0(%rdx), @acc[0] # accumulate |u|*|f0| + adc 8*1(%rdx), @acc[1] + adc 8*2(%rdx), @acc[2] + adc 8*3(%rdx), @acc[3] + adc 8*4(%rdx), @acc[4] + adc 8*5(%rdx), @acc[5] + adc 8*6(%rdx), @acc[6] + mov 8*7(%rdx), @acc[11] # sign extension + adc @acc[11], @acc[7] + adc @acc[11], @acc[8] + adc @acc[11], @acc[9] + adc @acc[11], @acc[10] + adc @acc[11], %rax + + mov %rdx, $out_ptr # restore original out_ptr + + mov @acc[0], 8*0(%rdx) + mov @acc[1], 8*1(%rdx) + mov @acc[2], 8*2(%rdx) + mov @acc[3], 8*3(%rdx) + mov @acc[4], 8*4(%rdx) + mov @acc[5], 8*5(%rdx) + mov @acc[6], 8*6(%rdx) + mov @acc[7], 8*7(%rdx) + mov @acc[8], 8*8(%rdx) + mov @acc[9], 8*9(%rdx) + mov @acc[10], 8*10(%rdx) + mov %rax, 8*11(%rdx) + + ret +.size __smulx_767x63,.-__smulx_767x63 +___ +} +$code.=<<___; +.type __smulx_383x63,\@abi-omnipotent +.align 32 +__smulx_383x63: +___ +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |u| (or |v|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov $f0, $fx + sar \$63, $fx # |f0|'s sign as mask (or |g0|'s) + xor %rax, %rax + sub $fx, %rax # |f0|'s sign as bit (or |g0|'s) + + xor $fx, $f0 # conditionally negate |f0| + add %rax, $f0 + + xor $fx, @acc[0] # conditionally negate |u| (or |v|) + xor $fx, @acc[1] + xor $fx, @acc[2] + xor $fx, @acc[3] + xor $fx, @acc[4] + xor $fx, @acc[5] + add %rax, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mulx @acc[0], @acc[0], $fx # |u|*|f0| (or |v|*|g0|) + mulx @acc[1], @acc[1], %rax + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, "%rax"), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + mulx @acc[$i], @acc[$i], %rax + mov $g0, $f0 + adc $fx, @acc[$i] + + mov @acc[0], 8*0($out_ptr) # offload |u|*|f0| + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) +___ +} +$code.=<<___; + mulx @acc[$i], @acc[$i], %rax + adc $fx, @acc[$i] + + add 8*0($out_ptr), @acc[0] # accumulate |u|*|f0| + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), @acc[5] + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov @acc[5], 8*5($out_ptr) + + ret +.size __smulx_383x63,.-__smulx_383x63 +___ +######################################################################## +# Signed abs(|a|*|f?|+|b|*|g?|)>>k subroutines. "NNN" in the middle of +# the names refers to maximum bit-lengths of |a| and |b|. As already +# mentioned, |f?| and |g?| can be viewed as 63 bits wide, but are always +# chosen so that "bad things" don't happen. For example, so that the +# sum of the products doesn't overflow, and that the final result is +# never wider than inputs... 
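To make that concrete, here is a single-limb C sketch of the multiply-and-shift step, in the same spirit as the C reference snippets used elsewhere in this patch (it relies on the gcc/clang __int128 extension). The helper name and struct are illustrative only, not blst API; the __smulx_*_n_shift_by_31 routines below operate on multi-limb |a| and |b| and perform the negations branchlessly with sign masks rather than an if.

#include <stdint.h>

typedef struct { uint64_t r; int64_t f, g; } smul_shift_t;

/* One-limb model of the multiply-and-shift step: a and b are non-negative,
 * f and g are signed and bounded by 2^31 (they come out of 31 inner-loop
 * iterations), and the low 31 bits of a*f + b*g are zero by construction. */
static smul_shift_t smul_n_shift_by_31_ref(uint64_t a, uint64_t b,
                                           int64_t f, int64_t g)
{
    __int128 t = (__int128)a * f + (__int128)b * g; /* fits: |f|,|g| < 2^31 */
    smul_shift_t out;

    if (t < 0) {            /* fold the sign of the result back into f and g */
        t = -t;
        f = -f;
        g = -g;
    }
    out.r = (uint64_t)(t >> 31); /* low 31 bits are zero; per the comment
                                    above, the result never outgrows the
                                    inputs, so one limb suffices here */
    out.f = f;                   /* possibly negated, ready for the caller's
                                    subsequent |u|/|v| update */
    out.g = g;
    return out;
}

Returning the (possibly negated) f and g is what lets the caller keep |a| and |b| non-negative while still tracking the correct signs in the transition factors.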
+{ +$code.=<<___; +.type __smulx_383_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_383_n_shift_by_31: + mov $f0, @acc[8] + xor @acc[6], @acc[6] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + mov $k+8*3($in_ptr), @acc[3] + mov $k+8*4($in_ptr), @acc[4] + mov $k+8*5($in_ptr), @acc[5] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor %rax, @acc[2] + xor %rax, @acc[3] + xor %rax, @acc[4] + xor @acc[5], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[5] + add $fx, @acc[1] +___ +for(my ($a,$b) = ($fx, @acc[5]), $i=2; $i<5; $i++) { +$code.=<<___; + mulx @acc[$i], @acc[$i], $a + adc $b, @acc[$i] +___ + ($a, $b) = ($b, $a); +} +$code.=<<___ if ($j==0); + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc %rdx, @acc[6] + + mov $g0, %rdx + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) +___ +} +$code.=<<___; + adc \$0, $fx + imulq %rdx + add $fx, %rax + adc \$0, %rdx + + add 8*0($out_ptr), @acc[0] + adc 8*1($out_ptr), @acc[1] + adc 8*2($out_ptr), @acc[2] + adc 8*3($out_ptr), @acc[3] + adc 8*4($out_ptr), @acc[4] + adc 8*5($out_ptr), %rax + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[1], @acc[0] + shrd \$31, @acc[2], @acc[1] + shrd \$31, @acc[3], @acc[2] + shrd \$31, @acc[4], @acc[3] + shrd \$31, %rax, @acc[4] + shrd \$31, @acc[6], %rax + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[0] # conditionally negate the result + xor @acc[6], @acc[1] + xor @acc[6], @acc[2] + xor @acc[6], @acc[3] + xor @acc[6], @acc[4] + xor @acc[6], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, @acc[2] + adc \$0, @acc[3] + adc \$0, @acc[4] + adc \$0, %rax + + mov @acc[0], 8*0($out_ptr) + mov @acc[1], 8*1($out_ptr) + mov @acc[2], 8*2($out_ptr) + mov @acc[3], 8*3($out_ptr) + mov @acc[4], 8*4($out_ptr) + mov %rax, 8*5($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +___ +} { +$code.=<<___; +.type __smulx_191_n_shift_by_31,\@abi-omnipotent +.align 32 +__smulx_191_n_shift_by_31: + mov $f0, @acc[8] +___ +my $f0 = @acc[8]; +for($j=0; $j<2; $j++) { +my $k = 8*6*$j; +my @acc=@acc; + @acc=@acc[3..5] if ($j); +$code.=<<___; + mov $k+8*0($in_ptr), @acc[0] # load |a| (or |b|) + mov $k+8*1($in_ptr), @acc[1] + mov $k+8*2($in_ptr), @acc[2] + + mov %rdx, %rax + sar \$63, %rax # |f0|'s sign as mask (or |g0|'s) + xor $fx, $fx + sub %rax, $fx # |f0|'s sign as bit (or |g0|'s) + + xor %rax, %rdx # conditionally negate |f0| (or |g0|) + add $fx, %rdx + + xor %rax, @acc[0] # conditionally negate |a| (or |b|) + xor %rax, @acc[1] + xor @acc[2], %rax + add $fx, @acc[0] + adc \$0, @acc[1] + adc \$0, %rax + + mulx @acc[0], @acc[0], $fx # |a|*|f0| (or |b|*|g0|) + mulx @acc[1], @acc[1], @acc[2] + add $fx, @acc[1] + adc \$0, @acc[2] + imulq %rdx + add %rax, @acc[2] + adc 
\$0, %rdx +___ +$code.=<<___ if ($j==0); + mov %rdx, @acc[6] + mov $g0, %rdx +___ +} +$code.=<<___; + add @acc[0], @acc[3] + adc @acc[1], @acc[4] + adc @acc[2], @acc[5] + adc %rdx, @acc[6] + mov $f0, %rdx + + shrd \$31, @acc[4], @acc[3] + shrd \$31, @acc[5], @acc[4] + shrd \$31, @acc[6], @acc[5] + + sar \$63, @acc[6] # sign as mask + xor $fx, $fx + sub @acc[6], $fx # sign as bit + + xor @acc[6], @acc[3] # conditionally negate the result + xor @acc[6], @acc[4] + xor @acc[6], @acc[5] + add $fx, @acc[3] + adc \$0, @acc[4] + adc \$0, @acc[5] + + mov @acc[3], 8*0($out_ptr) + mov @acc[4], 8*1($out_ptr) + mov @acc[5], 8*2($out_ptr) + + xor @acc[6], %rdx # conditionally negate |f0| + xor @acc[6], $g0 # conditionally negate |g0| + add $fx, %rdx + add $fx, $g0 + + ret +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +___ +} } + +{ +my ($a_lo, $a_hi, $b_lo, $b_hi) = map("%r$_",(8..11)); +my ($t0, $t1, $t2, $t3, $t4) = ("%rax","%rbx","%rbp","%r14","%r15"); +my ($fg0, $fg1, $bias) = ($g0, $g1, $t4); +my ($a_, $b_) = ($a_lo, $b_lo); +{ +my @a = ($a_lo, $t1, $a_hi); +my @b = ($b_lo, $t2, $b_hi); + +$code.=<<___; +.type __ab_approximation_31,\@abi-omnipotent +.align 32 +__ab_approximation_31: + mov 8*5($in_ptr), @a[2] # load |a| in reverse order + mov 8*11($in_ptr), @b[2] # load |b| in reverse order + mov 8*4($in_ptr), @a[1] + mov 8*10($in_ptr), @b[1] + mov 8*3($in_ptr), @a[0] + mov 8*9($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # check top-most limbs, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*2($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*8($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... ones before top-most, ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*1($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*7($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... + cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + mov 8*0($in_ptr), @a[0] + cmovz @b[0], @b[1] + mov 8*6($in_ptr), @b[0] + + mov @a[2], $t0 + or @b[2], $t0 # ... and ones before that ... 
+ cmovz @a[1], @a[2] + cmovz @b[1], @b[2] + cmovz @a[0], @a[1] + cmovz @b[0], @b[1] + + mov @a[2], $t0 + or @b[2], $t0 + bsr $t0, %rcx + lea 1(%rcx), %rcx + cmovz @a[0], @a[2] + cmovz @b[0], @b[2] + cmovz $t0, %rcx + neg %rcx + #and \$63, %rcx # debugging artefact + + shldq %cl, @a[1], @a[2] # align second limb to the left + shldq %cl, @b[1], @b[2] + + mov \$0x7FFFFFFF, %eax + and %rax, @a[0] + and %rax, @b[0] + andn @a[2], %rax, @a[2] + andn @b[2], %rax, @b[2] + or @a[2], @a[0] + or @b[2], @b[0] + + jmp __inner_loop_31 + + ret +.size __ab_approximation_31,.-__ab_approximation_31 +___ +} +$code.=<<___; +.type __inner_loop_31,\@abi-omnipotent +.align 32 +__inner_loop_31: ################# by Thomas Pornin + mov \$0x7FFFFFFF80000000, $fg0 # |f0|=1, |g0|=0 + mov \$0x800000007FFFFFFF, $fg1 # |f1|=0, |g1|=1 + mov \$0x7FFFFFFF7FFFFFFF, $bias + +.Loop_31: + cmp $b_, $a_ # if |a_|<|b_|, swap the variables + mov $a_, $t0 + mov $b_, $t1 + mov $fg0, $t2 + mov $fg1, $t3 + cmovb $b_, $a_ + cmovb $t0, $b_ + cmovb $fg1, $fg0 + cmovb $t2, $fg1 + + sub $b_, $a_ # |a_|-|b_| + sub $fg1, $fg0 # |f0|-|f1|, |g0|-|g1| + add $bias, $fg0 + + test \$1, $t0 # if |a_| was even, roll back + cmovz $t0, $a_ + cmovz $t1, $b_ + cmovz $t2, $fg0 + cmovz $t3, $fg1 + + shr \$1, $a_ # |a_|>>=1 + add $fg1, $fg1 # |f1|<<=1, |g1|<<=1 + sub $bias, $fg1 + sub \$1, $cnt + jnz .Loop_31 + + shr \$32, $bias + mov %ecx, %edx # $fg0, $f0 + mov ${fg1}d, ${f1}d + shr \$32, $g0 + shr \$32, $g1 + sub $bias, $f0 # remove the bias + sub $bias, $g0 + sub $bias, $f1 + sub $bias, $g1 + + ret +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,\@abi-omnipotent +.align 32 +__inner_loop_62: + mov \$1, $f0 # |f0|=1 + xor $g0, $g0 # |g0|=0 + xor $f1, $f1 # |f1|=0 + mov \$1, $g1 # |g1|=1 + +.Loop_62: + xor $t0, $t0 + test \$1, $a_lo # if |a_| is odd, then we'll be subtracting |b_| + mov $b_lo, $t1 + cmovnz $b_lo, $t0 + sub $a_lo, $t1 # |b_|-|a_| + mov $a_lo, $t2 + sub $t0, $a_lo # |a_|-|b_| (or |a_|-0 if |a_| was even) + cmovc $t1, $a_lo # borrow means |a_|<|b_|, replace with |b_|-|a_| + cmovc $t2, $b_lo # |b_| = |a_| + mov $f0, $t0 # exchange |f0| and |f1| + cmovc $f1, $f0 + cmovc $t0, $f1 + mov $g0, $t1 # exchange |g0| and |g1| + cmovc $g1, $g0 + cmovc $t1, $g1 + xor $t0, $t0 + xor $t1, $t1 + shr \$1, $a_lo + test \$1, $t2 # if |a_| was odd, then we'll be subtracting... + cmovnz $f1, $t0 + cmovnz $g1, $t1 + add $f1, $f1 # |f1|<<=1 + add $g1, $g1 # |g1|<<=1 + sub $t0, $f0 # |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub $t1, $g0 # |g0|-=|g1| (or |g0-=0| ...) + sub \$1, $cnt + jnz .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/div3w-armv8.pl b/blst/asm/div3w-armv8.pl new file mode 100755 index 0000000..bfa3245 --- /dev/null +++ b/blst/asm/div3w-armv8.pl @@ -0,0 +1,122 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$code.=<<___; +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +{ +my ($div_rem, $divisor, $quot) = map("x$_",(0..2)); +my @div = map("x$_",(3..4)); +my @acc = map("x$_",(5..7)); +my @t = map("x$_",(8..11)); + +$code.=<<___; +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp @div[0],@div[1],[$divisor] + + mul @acc[0],@div[0],$quot // divisor[0:1} * quotient + umulh @acc[1],@div[0],$quot + mul @t[3], @div[1],$quot + umulh @acc[2],@div[1],$quot + + ldp @t[0],@t[1],[$div_rem] // load 3 limbs of the dividend + ldr @t[2],[$div_rem,#16] + + adds @acc[1],@acc[1],@t[3] + adc @acc[2],@acc[2],xzr + + subs @t[0],@t[0],@acc[0] // dividend - divisor * quotient + sbcs @t[1],@t[1],@acc[1] + sbcs @t[2],@t[2],@acc[2] + sbc @acc[0],xzr,xzr // borrow -> mask + + add $quot,$quot,@acc[0] // if borrowed, adjust the quotient ... + and @div[0],@div[0],@acc[0] + and @div[1],@div[1],@acc[0] + adds @t[0],@t[0],@div[0] // ... and add divisor + adc @t[1],@t[1],@div[1] + + stp @t[0],@t[1],[$div_rem] // save 2 limbs of the remainder + str $quot,[$div_rem,#16] // and one limb of the quotient + + mov x0,$quot // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr @div[0],[$divisor] + ldr @t[0],[$div_rem] // load 1 limb of the dividend + + mul @acc[0],@div[0],$quot // divisor * quotient + + sub @t[0],@t[0],@acc[0] // dividend - divisor * quotient + + stp @t[0],$quot,[$div_rem] // save remainder and quotient + + mov x0,$quot // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/div3w-x86_64.pl b/blst/asm/div3w-x86_64.pl new file mode 100755 index 0000000..b8192db --- /dev/null +++ b/blst/asm/div3w-x86_64.pl @@ -0,0 +1,184 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$c_ref=<<'___'; +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t *div_top, limb_t d_lo, limb_t d_hi) +{ + llimb_t R = ((llimb_t)div_top[1] << LIMB_BITS) | div_top[0]; + llimb_t D = ((llimb_t)d_hi << LIMB_BITS) | d_lo; + limb_t Q = 0, mask; + size_t i; + + for (i = 0; i < LIMB_BITS; i++) { + Q <<= 1; + mask = (R >= D); + Q |= mask; + R -= (D & ((llimb_t)0 - mask)); + D >>= 1; + } + + mask = 0 - (Q >> (LIMB_BITS - 1)); /* does it overflow? */ + + Q <<= 1; + Q |= (R >= D); + + return (Q | mask); +} +___ + +$code.=<<___; +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,\@function,3 +.align 32 +div_3_limbs: + mov (%rdi),%r8 # load R.lo + mov 8(%rdi),%r9 # load R.hi + xor %rax,%rax # Q = 0 + mov \$64,%ecx # loop counter + +.Loop: + mov %r8,%r10 # put aside R + sub %rsi,%r8 # R -= D + mov %r9,%r11 + sbb %rdx,%r9 + lea 1(%rax,%rax),%rax # Q <<= 1 + speculative bit + mov %rdx,%rdi + cmovc %r10,%r8 # restore R if R - D borrowed + cmovc %r11,%r9 + sbb \$0,%rax # subtract speculative bit + shl \$63,%rdi + shr \$1,%rsi + shr \$1,%rdx + or %rdi,%rsi # D >>= 1 + sub \$1,%ecx + jnz .Loop + + lea 1(%rax,%rax),%rcx # Q <<= 1 + speculative bit + sar \$63,%rax # top bit -> mask + + sub %rsi,%r8 # R -= D + sbb %rdx,%r9 + sbb \$0,%rcx # subtract speculative bit + + or %rcx,%rax # all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +___ +######################################################################## +# Calculate remainder and adjust the quotient, which can be off-by-one. +# Then save quotient in limb next to top limb of the remainder. There is +# place, because the remainder/next-iteration-dividend gets shorter by +# one limb. 
+{ +my ($div_rem, $divisor, $quotient) = ("%rdi", "%rsi", "%rcx"); +my @acc = ("%r8", "%r9", "%rdx"); +my @tmp = ("%r10", "%r11", "%rax"); + +$code.=<<___; +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,\@function,3 +.align 32 +quot_rem_128: + mov %rdx, %rax + mov %rdx, $quotient + + mulq 0($divisor) # divisor[0:1] * quotient + mov %rax, @acc[0] + mov $quotient, %rax + mov %rdx, @acc[1] + + mulq 8($divisor) + add %rax, @acc[1] + adc \$0, %rdx # %rdx is @acc[2] + + mov 0($div_rem), @tmp[0] # load 3 limbs of the dividend + mov 8($div_rem), @tmp[1] + mov 16($div_rem), @tmp[2] + + sub @acc[0], @tmp[0] # dividend - divisor * quotient + sbb @acc[1], @tmp[1] + sbb @acc[2], @tmp[2] + sbb @acc[0], @acc[0] # borrow -> mask + + add @acc[0], $quotient # if borrowed, adjust the quotient ... + mov @acc[0], @acc[1] + and 0($divisor), @acc[0] + and 8($divisor), @acc[1] + add @acc[0], @tmp[0] # ... and add divisor + adc @acc[1], @tmp[1] + + mov @tmp[0], 0($div_rem) # save 2 limbs of the remainder ... + mov @tmp[1], 8($div_rem) + mov $quotient, 16($div_rem) # ... and 1 limb of the quotient + + mov $quotient, %rax # return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +######################################################################## +# Unlike 128-bit case above, quotient is exact. As result just one limb +# of the dividend is sufficient to calculate the remainder... + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,\@function,3 +.align 32 +quot_rem_64: + mov %rdx, %rax # return quotient + imulq 0($divisor), %rdx # divisor[0] * quotient + + mov 0($div_rem), @tmp[0] # load 1 limb of the dividend + + sub %rdx, @tmp[0] # dividend - divisor * quotient + + mov @tmp[0], 0($div_rem) # save 1 limb of the remainder ... + mov %rax, 8($div_rem) # ... and 1 limb of the quotient + + ret +.size quot_rem_64,.-quot_rem_64 +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/mul_mont_256-armv8.pl b/blst/asm/mul_mont_256-armv8.pl new file mode 100755 index 0000000..ba6c2b8 --- /dev/null +++ b/blst/asm/mul_mont_256-armv8.pl @@ -0,0 +1,409 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod=map("x$_",(5..8)); +$bi="x9"; +@a=map("x$_",(10..13)); +@tmp=map("x$_",(14..17)); +@acc=map("x$_",(19..24)); +$m0=$n_ptr; + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + mul @acc[0],@a[0],$bi + ldp @mod[0],@mod[1],[$n_ptr] + mul @acc[1],@a[1],$bi + ldp @mod[2],@mod[3],[$n_ptr,#16] + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],xzr, @tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adc @acc[4],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $m0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adc @acc[4],@acc[4],xzr + + adds @acc[1],@acc[1],@tmp[0] + //mul @tmp[0],@mod[0],$m0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$m0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$m0 + adc @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$m0 +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 //adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @acc[4],@acc[4],xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +___ +{ +my @acc = (@a,@acc[0..3]); +my @a = @mod; + +$code.=<<___; +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + paciasp + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mov $n0,$n_ptr + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is @acc[x] + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul @acc[1],@a[1],@a[0] // a[1]*a[0] + umulh @tmp[1],@a[1],@a[0] + mul @acc[2],@a[2],@a[0] // a[2]*a[0] + umulh @tmp[2],@a[2],@a[0] + mul @acc[3],@a[3],@a[0] // a[3]*a[0] + umulh @acc[4],@a[3],@a[0] + + adds @acc[2],@acc[2],@tmp[1] // accumulate high parts of multiplication + mul @tmp[0],@a[2],@a[1] // a[2]*a[1] + umulh @tmp[1],@a[2],@a[1] + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@a[3],@a[1] // a[3]*a[1] + umulh @tmp[3],@a[3],@a[1] + adc @acc[4],@acc[4],xzr // can't overflow + + mul @acc[5],@a[3],@a[2] // a[3]*a[2] + umulh @acc[6],@a[3],@a[2] + + adds @tmp[1],@tmp[1],@tmp[2] // accumulate high parts of multiplication + mul @acc[0],@a[0],@a[0] // a[0]*a[0] + adc @tmp[2],@tmp[3],xzr // can't overflow + + adds @acc[3],@acc[3],@tmp[0] // accumulate low parts of multiplication + umulh @a[0],@a[0],@a[0] + adcs @acc[4],@acc[4],@tmp[1] + mul @tmp[1],@a[1],@a[1] // a[1]*a[1] + adcs @acc[5],@acc[5],@tmp[2] + umulh @a[1],@a[1],@a[1] + adc @acc[6],@acc[6],xzr // can't overflow + + adds @acc[1],@acc[1],@acc[1] // acc[1-6]*=2 + mul @tmp[2],@a[2],@a[2] // a[2]*a[2] + adcs @acc[2],@acc[2],@acc[2] + umulh @a[2],@a[2],@a[2] + adcs @acc[3],@acc[3],@acc[3] + mul @tmp[3],@a[3],@a[3] // a[3]*a[3] + adcs @acc[4],@acc[4],@acc[4] + umulh @a[3],@a[3],@a[3] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adc @acc[7],xzr,xzr + + adds @acc[1],@acc[1],@a[0] // +a[i]*a[i] + adcs @acc[2],@acc[2],@tmp[1] + adcs @acc[3],@acc[3],@a[1] + adcs @acc[4],@acc[4],@tmp[2] + adcs @acc[5],@acc[5],@a[2] + adcs @acc[6],@acc[6],@tmp[3] + adc @acc[7],@acc[7],@a[3] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds @acc[0],@acc[0],@acc[4] // accumulate upper half + adcs @acc[1],@acc[1],@acc[5] + adcs @acc[2],@acc[2],@acc[6] + adcs @acc[3],@acc[3],@acc[7] + adc @acc[4],xzr,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs xzr, @acc[4],xzr + + csel @acc[0],@acc[0],@tmp[0],lo + csel @acc[1],@acc[1],@tmp[1],lo + csel @acc[2],@acc[2],@tmp[2],lo + csel @acc[3],@acc[3],@tmp[3],lo + + stp @acc[0],@acc[1],[$r_ptr] + stp @acc[2],@acc[3],[$r_ptr,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + autiasp + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +} +{ +my @a = (@a, $bi); + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + paciasp + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + paciasp + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov $n0,$n_ptr + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp @tmp[0],@tmp[1],[$a_ptr,#32] + ldp @tmp[2],@tmp[3],[$a_ptr,#48] + + adds @a[0],@a[0],@tmp[0] + adcs @a[1],@a[1],@tmp[1] + adcs @a[2],@a[2],@tmp[2] + adcs @a[3],@a[3],@tmp[3] + adc @a[4],xzr,xzr + + subs @tmp[0],@a[0],@mod[0] + sbcs @tmp[1],@a[1],@mod[1] + sbcs @tmp[2],@a[2],@mod[2] + sbcs @tmp[3],@a[3],@mod[3] + sbcs xzr, @a[4],xzr + + csel @a[0],@a[0],@tmp[0],lo + csel @a[1],@a[1],@tmp[1],lo + csel @a[2],@a[2],@tmp[2],lo + csel @a[3],@a[3],@tmp[3],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + + ldr x29,[sp],#16 + autiasp + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul $m0,$n0,@a[0] + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] +___ +for ($i=1;$i<4;$i++) { +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + mul $m0,$n0,@a[0] + adc @a[3],@a[4],@tmp[3] +___ +} +$code.=<<___; + //mul @tmp[0],@mod[0],$m0 + mul @tmp[1],@mod[1],$m0 + mul @tmp[2],@mod[2],$m0 + mul @tmp[3],@mod[3],$m0 + subs xzr,@a[0],#1 //adds @a[0],@a[0],@tmp[0] + umulh @tmp[0],@mod[0],$m0 + adcs @a[1],@a[1],@tmp[1] + umulh @tmp[1],@mod[1],$m0 + adcs @a[2],@a[2],@tmp[2] + umulh @tmp[2],@mod[2],$m0 + adcs @a[3],@a[3],@tmp[3] + umulh @tmp[3],@mod[3],$m0 + adc @a[4],xzr,xzr + + adds @a[0],@a[1],@tmp[0] + adcs @a[1],@a[2],@tmp[1] + adcs @a[2],@a[3],@tmp[2] + adc @a[3],@a[4],@tmp[3] + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 +___ +} + +print $code; + +close STDOUT; diff --git a/blst/asm/mul_mont_384-armv8.pl b/blst/asm/mul_mont_384-armv8.pl new file mode 100755 index 0000000..44e12a0 --- /dev/null +++ b/blst/asm/mul_mont_384-armv8.pl @@ -0,0 +1,2015 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +($r_ptr,$a_ptr,$b_ptr,$n_ptr,$n0) = map("x$_", 0..4); + +@mod = map("x$_",(5..10)); +@a = map("x$_",(11..16)); +$bi = "x17"; +@acc = map("x$_",(19..25)); +@tmp = map("x$_",(26..28,0,1,3)); + +$code.=<<___; +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + adcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + adcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + adcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + stp @a[0],@a[1],[$r_ptr,#48] + csel @a[4],@a[4],@acc[4],lo + stp @a[2],@a[3],[$r_ptr,#64] + csel @a[5],@a[5],@acc[5],lo + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + paciasp + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + autiasp + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + stp @a[0], @a[1], [$r_ptr] + sbcs @a[4],@a[4],@acc[4] + ldp @a[0], @a[1], [$a_ptr,#48] + sbcs @a[5],@a[5],@acc[5] + + ldp @acc[0],@acc[1],[$b_ptr,#48] + stp @a[2], @a[3], [$r_ptr,#16] + ldp @a[2], @a[3], [$a_ptr,#64] + ldp @acc[2],@acc[3],[$b_ptr,#64] + + sbcs @a[0],@a[0],@acc[0] + stp @a[4], @a[5], [$r_ptr,#32] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#80] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#80] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + adds @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + adcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + adcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc $bi,xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,$bi,xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + stp @a[0],@a[1],[$r_ptr] + csel @a[5],@a[5],@acc[5],lo + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp @a[0], @a[1], [$a_ptr] + ldp @acc[0],@acc[1],[$b_ptr] + ldp @a[2], @a[3], [$a_ptr,#16] + subs @a[0],@a[0],@acc[0] + ldp @acc[2],@acc[3],[$b_ptr,#16] + sbcs @a[1],@a[1],@acc[1] + ldp @a[4], @a[5], [$a_ptr,#32] + sbcs @a[2],@a[2],@acc[2] + ldp @acc[4],@acc[5],[$b_ptr,#32] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc $bi,xzr,xzr + + and @acc[0],@mod[0],$bi + and @acc[1],@mod[1],$bi + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],$bi + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],$bi + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],$bi + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],$bi + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret 
+.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov @tmp[0],$r_ptr // save r_ptr + mov @tmp[1],$a_ptr // save b_ptr + mov @tmp[2],$b_ptr // save b_ptr + + sub $r_ptr,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add $a_ptr,$a_ptr,#48 // mul_384(t1, a->im, b->im) + add $b_ptr,$b_ptr,#48 + add $r_ptr,sp,#96 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + sub $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#240 + bl __add_mod_384 + + add $a_ptr,@tmp[2],#0 + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#192 // t2 + bl __add_mod_384 + + add $a_ptr,$r_ptr,#0 + add $b_ptr,$r_ptr,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,$r_ptr + add $b_ptr,sp,#0 + bl __sub_mod_384x384 + + add $b_ptr,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add $a_ptr,sp,#0 + add $b_ptr,sp,#96 + add $r_ptr,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add $a_ptr,sp,#0 // ret->re = redc(t0) + add $r_ptr,@tmp[0],#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add $a_ptr,sp,#192 // ret->im = redc(t2) + add $r_ptr,$r_ptr,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + add $r_ptr,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add $r_ptr,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds @a[0],@a[0],@a[0] // add with itself + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @acc[0],@a[0],@acc[0],lo + csel @acc[1],@a[1],@acc[1],lo + csel @acc[2],@a[2],@acc[2],lo + ldp @a[0],@a[1],[sp] + csel @acc[3],@a[3],@acc[3],lo + ldr $bi, [sp,#48] + csel @acc[4],@a[4],@acc[4],lo + ldp @a[2],@a[3],[sp,#16] + csel @acc[5],@a[5],@acc[5],lo + ldp @a[4],@a[5],[sp,#32] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + add $b_ptr,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 + mov $bi,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + adc $n0,$bi,xzr + ldr $bi,[$b_ptr,8*$i] + + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],$n0,xzr + ldr $n0,[x29,#96] + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adcs @acc[6],@acc[6],xzr + adc $bi,xzr,xzr + + adds @acc[1],@acc[1],@tmp[0] + // mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adcs @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 + adc $bi,$bi,xzr +___ +} +$code.=<<___; + subs xzr,@acc[0],#1 // adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adcs @acc[6],@acc[6],xzr + ldp 
$n0,$b_ptr,[x29,#96] // pull r_ptr + adc $bi,$bi,xzr + + adds @acc[0],@acc[1],@tmp[0] + adcs @acc[1],@acc[2],@tmp[1] + adcs @acc[2],@acc[3],@tmp[2] + adcs @acc[3],@acc[4],@tmp[3] + adcs @acc[4],@acc[5],@tmp[4] + adcs @acc[5],@acc[6],@tmp[5] + adc @acc[6],$bi,xzr + + subs @tmp[0],@acc[0],@mod[0] + sbcs @tmp[1],@acc[1],@mod[1] + sbcs @tmp[2],@acc[2],@mod[2] + sbcs @tmp[3],@acc[3],@mod[3] + sbcs @tmp[4],@acc[4],@mod[4] + sbcs @tmp[5],@acc[5],@mod[5] + sbcs xzr, @acc[6],xzr + + csel @a[0],@acc[0],@tmp[0],lo + csel @a[1],@acc[1],@tmp[1],lo + csel @a[2],@acc[2],@tmp[2],lo + csel @a[3],@acc[3],@tmp[3],lo + csel @a[4],@acc[4],@tmp[4],lo + csel @a[5],@acc[5],@tmp[5],lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov $n0,$n_ptr // adjust for missing b_ptr + + mov $n_ptr,$r_ptr // save r_ptr + mov $r_ptr,sp + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + mov $a_ptr,sp + mov $r_ptr,$n_ptr // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n0,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov $bi,x5 // save b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + mov $r_ptr,sp +.Loop_sqr_383: + bl __sqr_384 + sub $b_ptr,$b_ptr,#1 // counter + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + mov $a_ptr,sp + bl __mul_by_1_mont_384 + + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // just accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + cbnz $b_ptr,.Loop_sqr_383 + + mov $b_ptr,$bi + ldr $bi,[$bi] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ +my @acc=(@acc,@tmp[0..2]); + +$code.=<<___; +.type __sqr_384,%function +.align 5 +__sqr_384: + mul @acc[0],@a[1],@a[0] + mul @acc[1],@a[2],@a[0] + mul @acc[2],@a[3],@a[0] + mul @acc[3],@a[4],@a[0] + mul @acc[4],@a[5],@a[0] + + umulh @mod[1],@a[1],@a[0] + umulh @mod[2],@a[2],@a[0] + umulh @mod[3],@a[3],@a[0] + umulh @mod[4],@a[4],@a[0] + adds @acc[1],@acc[1],@mod[1] + umulh @mod[5],@a[5],@a[0] + adcs @acc[2],@acc[2],@mod[2] + mul @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + mul @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + mul @mod[4],@a[4],@a[1] + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],@a[1] + + adds @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],@a[1] + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],@a[1] + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],@a[1] + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],@a[1] + adc @acc[6],xzr,xzr + + mul @mod[0],@a[0],@a[0] + adds @acc[3],@acc[3],@mod[2] + umulh @a[0], @a[0],@a[0] + adcs @acc[4],@acc[4],@mod[3] + mul @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + mul @mod[4],@a[4],@a[2] + adc @acc[6],@acc[6],@mod[5] + mul @mod[5],@a[5],@a[2] + + adds @acc[4],@acc[4],@mod[3] + umulh @mod[3],@a[3],@a[2] + adcs @acc[5],@acc[5],@mod[4] + umulh @mod[4],@a[4],@a[2] + adcs @acc[6],@acc[6],@mod[5] + umulh @mod[5],@a[5],@a[2] + adc @acc[7],xzr,xzr + + mul @mod[1],@a[1],@a[1] + adds @acc[5],@acc[5],@mod[3] + umulh @a[1], @a[1],@a[1] + adcs @acc[6],@acc[6],@mod[4] + mul @mod[4],@a[4],@a[3] + adc @acc[7],@acc[7],@mod[5] + mul @mod[5],@a[5],@a[3] + + adds @acc[6],@acc[6],@mod[4] + umulh @mod[4],@a[4],@a[3] + adcs @acc[7],@acc[7],@mod[5] + umulh @mod[5],@a[5],@a[3] + adc @acc[8],xzr,xzr + mul @mod[2],@a[2],@a[2] + adds @acc[7],@acc[7],@mod[4] + umulh @a[2], @a[2],@a[2] + adc @acc[8],@acc[8],@mod[5] + mul @mod[3],@a[3],@a[3] + + mul @mod[5],@a[5],@a[4] + umulh @a[3], @a[3],@a[3] + adds @acc[8],@acc[8],@mod[5] + umulh @mod[5],@a[5],@a[4] + mul @mod[4],@a[4],@a[4] + adc @acc[9],@mod[5],xzr + + adds @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + adcs @acc[2],@acc[2],@acc[2] + adcs @acc[3],@acc[3],@acc[3] + adcs @acc[4],@acc[4],@acc[4] + adcs @acc[5],@acc[5],@acc[5] + adcs @acc[6],@acc[6],@acc[6] + adcs @acc[7],@acc[7],@acc[7] + umulh @a[4], 
@a[4],@a[4] + adcs @acc[8],@acc[8],@acc[8] + mul @mod[5],@a[5],@a[5] + adcs @acc[9],@acc[9],@acc[9] + umulh @a[5], @a[5],@a[5] + adc $a_ptr,xzr,xzr + + adds @acc[0],@acc[0],@a[0] + adcs @acc[1],@acc[1],@mod[1] + adcs @acc[2],@acc[2],@a[1] + adcs @acc[3],@acc[3],@mod[2] + adcs @acc[4],@acc[4],@a[2] + adcs @acc[5],@acc[5],@mod[3] + adcs @acc[6],@acc[6],@a[3] + stp @mod[0],@acc[0],[$r_ptr] + adcs @acc[7],@acc[7],@mod[4] + stp @acc[1],@acc[2],[$r_ptr,#16] + adcs @acc[8],@acc[8],@a[4] + stp @acc[3],@acc[4],[$r_ptr,#32] + adcs @acc[9],@acc[9],@mod[5] + stp @acc[5],@acc[6],[$r_ptr,#48] + adc @a[5],@a[5],$a_ptr + stp @acc[7],@acc[8],[$r_ptr,#64] + stp @acc[9],@a[5],[$r_ptr,#80] + + ret +.size __sqr_384,.-__sqr_384 +___ +} +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel @a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + mul @tmp[0],$n0,@a[0] + ldp @a[4],@a[5],[$a_ptr,#32] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + mul @tmp[0],$n0,@a[0] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + // mul @acc[0],@mod[0],@tmp[0] + mul @acc[1],@mod[1],@tmp[0] + mul @acc[2],@mod[2],@tmp[0] + mul @acc[3],@mod[3],@tmp[0] + mul @acc[4],@mod[4],@tmp[0] + mul @acc[5],@mod[5],@tmp[0] + subs xzr,@a[0],#1 // adds @acc[0],@acc[0],@a[0] + umulh @a[0],@mod[0],@tmp[0] + adcs @acc[1],@acc[1],@a[1] + umulh @a[1],@mod[1],@tmp[0] + adcs @acc[2],@acc[2],@a[2] + umulh @a[2],@mod[2],@tmp[0] + adcs @acc[3],@acc[3],@a[3] + umulh @a[3],@mod[3],@tmp[0] + adcs @acc[4],@acc[4],@a[4] + umulh @a[4],@mod[4],@tmp[0] + adcs @acc[5],@acc[5],@a[5] + umulh @a[5],@mod[5],@tmp[0] + adc @acc[6],xzr,xzr +___ +} +$code.=<<___; + adds @a[0],@a[0],@acc[1] + adcs @a[1],@a[1],@acc[2] + adcs @a[2],@a[2],@acc[3] + adcs @a[3],@a[3],@acc[4] + adcs @a[4],@a[4],@acc[5] + adc @a[5],@a[5],@acc[6] + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @a[0],@a[0],@acc[0] // accumulate upper half + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adcs @a[5],@a[5],@acc[5] + adc @acc[6],xzr,xzr + + subs @acc[0],@a[0],@mod[0] + sbcs @acc[1],@a[1],@mod[1] + sbcs @acc[2],@a[2],@mod[2] + sbcs @acc[3],@a[3],@mod[3] + sbcs @acc[4],@a[4],@mod[4] + sbcs @acc[5],@a[5],@mod[5] + sbcs xzr,@acc[6],xzr + + csel @a[0],@a[0],@acc[0],lo + csel @a[1],@a[1],@acc[1],lo + csel @a[2],@a[2],@acc[2],lo + csel @a[3],@a[3],@acc[3],lo + csel @a[4],@a[4],@acc[4],lo + csel 
@a[5],@a[5],@acc[5],lo + + stp @a[0],@a[1],[$r_ptr] + stp @a[2],@a[3],[$r_ptr,#16] + stp @a[4],@a[5],[$r_ptr,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp @a[0],@a[1],[$a_ptr] + ldr $bi, [$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + + umulh @mod[0],@a[0],$bi + umulh @mod[1],@a[1],$bi + umulh @mod[2],@a[2],$bi + umulh @mod[3],@a[3],$bi + umulh @mod[4],@a[4],$bi + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,8*1] + + str @acc[0],[$r_ptr] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],xzr, @mod[5] + mul @mod[5],@a[5],$bi +___ +for ($i=1;$i<5;$i++) { +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + ldr $bi,[$b_ptr,#8*($i+1)] + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + mul @mod[0],@a[0],$bi + adcs @acc[1],@acc[2],@mod[1] + mul @mod[1],@a[1],$bi + adcs @acc[2],@acc[3],@mod[2] + mul @mod[2],@a[2],$bi + adcs @acc[3],@acc[4],@mod[3] + mul @mod[3],@a[3],$bi + adcs @acc[4],@acc[5],@mod[4] + mul @mod[4],@a[4],$bi + adc @acc[5],@acc[6],@mod[5] + mul @mod[5],@a[5],$bi +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@mod[0] + umulh @mod[0],@a[0],$bi + adcs @acc[1],@acc[1],@mod[1] + umulh @mod[1],@a[1],$bi + adcs @acc[2],@acc[2],@mod[2] + umulh @mod[2],@a[2],$bi + adcs @acc[3],@acc[3],@mod[3] + umulh @mod[3],@a[3],$bi + adcs @acc[4],@acc[4],@mod[4] + umulh @mod[4],@a[4],$bi + adcs @acc[5],@acc[5],@mod[5] + umulh @mod[5],@a[5],$bi + adc @acc[6],xzr,xzr + + str @acc[0],[$r_ptr,8*$i] + adds @acc[0],@acc[1],@mod[0] + adcs @acc[1],@acc[2],@mod[1] + adcs @acc[2],@acc[3],@mod[2] + adcs @acc[3],@acc[4],@mod[3] + adcs @acc[4],@acc[5],@mod[4] + adc @acc[5],@acc[6],@mod[5] + + stp @acc[0],@acc[1],[$r_ptr,#48] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp @a[0],@a[1],[$a_ptr] + mov @tmp[0],$r_ptr // save r_ptr + ldp @acc[0],@acc[1],[$a_ptr,#48] + mov @tmp[1],$a_ptr // save a_ptr + ldp @a[2],@a[3],[$a_ptr,#16] + mov @tmp[2],$b_ptr // save b_ptr + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @a[4],@a[5],[$a_ptr,#32] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + ldp @a[0],@a[1],[$b_ptr] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[0],@acc[1],[$b_ptr,#48] + adcs @mod[3],$a[3],@acc[3] + ldp @a[2],@a[3],[$b_ptr,#16] + adcs @mod[4],$a[4],@acc[4] + ldp @acc[2],@acc[3],[$b_ptr,#64] + adc @mod[5],$a[5],@acc[5] + ldp @a[4],@a[5],[$b_ptr,#32] + + stp @mod[0],@mod[1],[sp] + adds @mod[0],$a[0],@acc[0] // t1 = b->re + b->im + ldp @acc[4],@acc[5],[$b_ptr,#80] + adcs @mod[1],$a[1],@acc[1] + stp @mod[2],@mod[3],[sp,#16] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + stp @mod[4],@mod[5],[sp,#32] + adcs @mod[4],$a[4],@acc[4] + stp @mod[0],@mod[1],[sp,#48] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[sp,#64] + stp @mod[4],@mod[5],[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add $a_ptr,sp,#0 // mul_384(ret->im, t0, t1) + add $b_ptr,sp,#48 + add $r_ptr,@tmp[0],#96 + bl __mul_384 + + add $a_ptr,@tmp[1],#48 // mul_384(tx, a->im, b->im) + add $b_ptr,@tmp[2],#48 + add $r_ptr,sp,#0 + bl __mul_384 + + ldp @mod[0],@mod[1],[$n_ptr] + ldp @mod[2],@mod[3],[$n_ptr,#16] + ldp @mod[4],@mod[5],[$n_ptr,#32] + + add $a_ptr,@tmp[0],#96 // ret->im -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#96 + bl __sub_mod_384x384 + + add $b_ptr,@tmp[0],#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add $a_ptr,@tmp[0],#0 // ret->re -= tx + add $b_ptr,sp,#0 + add $r_ptr,@tmp[0],#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp @a[0],@a[1],[$a_ptr] + ldp @acc[0],@acc[1],[$a_ptr,#48] + ldp @a[2],@a[3],[$a_ptr,#16] + adds @mod[0],$a[0],@acc[0] // t0 = a->re + a->im + ldp @acc[2],@acc[3],[$a_ptr,#64] + adcs @mod[1],$a[1],@acc[1] + ldp @a[4],@a[5],[$a_ptr,#32] + adcs @mod[2],$a[2],@acc[2] + ldp @acc[4],@acc[5],[$a_ptr,#80] + adcs @mod[3],$a[3],@acc[3] + stp @mod[0],@mod[1],[$r_ptr] + adcs @mod[4],$a[4],@acc[4] + ldp @mod[0],@mod[1],[$b_ptr] + adc @mod[5],$a[5],@acc[5] + stp @mod[2],@mod[3],[$r_ptr,#16] + + subs @a[0],$a[0],@acc[0] // t1 = a->re - a->im + ldp @mod[2],@mod[3],[$b_ptr,#16] + sbcs @a[1],$a[1],@acc[1] + stp @mod[4],@mod[5],[$r_ptr,#32] + sbcs @a[2],$a[2],@acc[2] + ldp @mod[4],@mod[5],[$b_ptr,#32] + sbcs @a[3],$a[3],@acc[3] + sbcs @a[4],$a[4],@acc[4] + sbcs @a[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + adds @a[0],@a[0],@acc[0] + and @acc[2],@mod[2],@acc[6] + adcs @a[1],@a[1],@acc[1] + and @acc[3],@mod[3],@acc[6] + adcs @a[2],@a[2],@acc[2] + and @acc[4],@mod[4],@acc[6] + adcs @a[3],@a[3],@acc[3] + and @acc[5],@mod[5],@acc[6] + adcs @a[4],@a[4],@acc[4] + stp @a[0],@a[1],[$r_ptr,#48] + adc @a[5],@a[5],@acc[5] + stp @a[2],@a[3],[$r_ptr,#64] + stp @a[4],@a[5],[$r_ptr,#80] + + mov $n0,$a_ptr // save a_ptr + add $a_ptr,$r_ptr,#0 // mul_384(ret->re, t0, t1) + add $b_ptr,$r_ptr,#48 + bl __mul_384 + + add $a_ptr,$n0,#0 // mul_384(ret->im, a->re, a->im) + add $b_ptr,$n0,#48 + add $r_ptr,$r_ptr,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp @a[0],@a[1],[$r_ptr] + ldp @a[2],@a[3],[$r_ptr,#16] + adds @a[0],@a[0],@a[0] // add with itself + ldp @a[4],@a[5],[$r_ptr,#32] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adcs @acc[0],@acc[0],@acc[0] + adcs @acc[1],@acc[1],@acc[1] + stp @a[0],@a[1],[$r_ptr] + adcs @acc[2],@acc[2],@acc[2] + stp @a[2],@a[3],[$r_ptr,#16] + adcs @acc[3],@acc[3],@acc[3] + stp @a[4],@a[5],[$r_ptr,#32] + adcs @acc[4],@acc[4],@acc[4] + stp @acc[0],@acc[1],[$r_ptr,#48] + adc @acc[5],@acc[5],@acc[5] + stp @acc[2],@acc[3],[$r_ptr,#64] + stp @acc[4],@acc[5],[$r_ptr,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp $n_ptr,$r_ptr,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov $n0,$n_ptr // adjust for missing b_ptr + + ldp @a[0],@a[1],[$a_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + + ldp $bi,@acc[1],[$a_ptr,#48] + ldp @acc[2],@acc[3],[$a_ptr,#64] + ldp @acc[4],@acc[5],[$a_ptr,#80] + + adds @mod[0],$a[0],$bi // t0 = a->re + a->im + adcs @mod[1],$a[1],@acc[1] + adcs @mod[2],$a[2],@acc[2] + adcs @mod[3],$a[3],@acc[3] + adcs @mod[4],$a[4],@acc[4] + adc @mod[5],$a[5],@acc[5] + + subs @acc[0],$a[0],$bi // t1 = a->re - a->im + sbcs @acc[1],$a[1],@acc[1] + sbcs @acc[2],$a[2],@acc[2] + sbcs @acc[3],$a[3],@acc[3] + sbcs @acc[4],$a[4],@acc[4] + sbcs @acc[5],$a[5],@acc[5] + sbc @acc[6],xzr,xzr // borrow flag as mask + + stp @mod[0],@mod[1],[sp] + stp @mod[2],@mod[3],[sp,#16] + stp @mod[4],@mod[5],[sp,#32] + stp @acc[0],@acc[1],[sp,#48] + stp @acc[2],@acc[3],[sp,#64] + stp @acc[4],@acc[5],[sp,#80] + str @acc[6],[sp,#96] + + ldp @mod[0],@mod[1],[$b_ptr] + ldp @mod[2],@mod[3],[$b_ptr,#16] + ldp @mod[4],@mod[5],[$b_ptr,#32] + + add $b_ptr,$a_ptr,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds @acc[0],@a[0],@a[0] // add with itself + adcs @acc[1],@a[1],@a[1] + adcs @acc[2],@a[2],@a[2] + adcs @acc[3],@a[3],@a[3] + adcs @acc[4],@a[4],@a[4] + adc @acc[5],@a[5],@a[5] + + stp @acc[0],@acc[1],[$b_ptr,#48] + stp @acc[2],@acc[3],[$b_ptr,#64] + stp @acc[4],@acc[5],[$b_ptr,#80] + + ldp @a[0],@a[1],[sp] + ldr $bi,[sp,#48] + ldp @a[2],@a[3],[sp,#16] + ldp @a[4],@a[5],[sp,#32] + + add $b_ptr,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr @acc[6],[sp,#96] // account for sign from a->re - a->im + ldp @acc[0],@acc[1],[sp] + ldp @acc[2],@acc[3],[sp,#16] + ldp @acc[4],@acc[5],[sp,#32] + + and @acc[0],@acc[0],@acc[6] + and @acc[1],@acc[1],@acc[6] + and @acc[2],@acc[2],@acc[6] + and @acc[3],@acc[3],@acc[6] + and @acc[4],@acc[4],@acc[6] + and @acc[5],@acc[5],@acc[6] + + subs @a[0],@a[0],@acc[0] + sbcs @a[1],@a[1],@acc[1] + sbcs @a[2],@a[2],@acc[2] + sbcs @a[3],@a[3],@acc[3] + sbcs @a[4],@a[4],@acc[4] + sbcs @a[5],@a[5],@acc[5] + sbc @acc[6],xzr,xzr + + and @acc[0],@mod[0],@acc[6] + and @acc[1],@mod[1],@acc[6] + and @acc[2],@mod[2],@acc[6] + and @acc[3],@mod[3],@acc[6] + and @acc[4],@mod[4],@acc[6] + and @acc[5],@mod[5],@acc[6] + + adds @a[0],@a[0],@acc[0] + adcs @a[1],@a[1],@acc[1] + adcs @a[2],@a[2],@acc[2] + adcs @a[3],@a[3],@acc[3] + adcs @a[4],@a[4],@acc[4] + adc @a[5],@a[5],@acc[5] + + stp @a[0],@a[1],[$b_ptr] + stp @a[2],@a[3],[$b_ptr,#16] + stp @a[4],@a[5],[$b_ptr,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul @acc[0],@a[0],$bi + mul @acc[1],@a[1],$bi + mul @acc[2],@a[2],$bi + mul @acc[3],@a[3],$bi + mul @acc[4],@a[4],$bi + mul @acc[5],@a[5],$bi + mul $n0,$n0,@acc[0] + + umulh @tmp[0],@a[0],$bi + umulh @tmp[1],@a[1],$bi + umulh @tmp[2],@a[2],$bi + umulh @tmp[3],@a[3],$bi + umulh @tmp[4],@a[4],$bi + umulh @tmp[5],@a[5],$bi + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 
+ adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],xzr, @tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +for ($i=1;$i<6;$i++) { +$code.=<<___; + ldr $bi,[$b_ptr,8*$i] + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + + ldr $n0,[x29,#96] + adds @acc[0],@acc[1],@tmp[0] + mul @tmp[0],@a[0],$bi + adcs @acc[1],@acc[2],@tmp[1] + mul @tmp[1],@a[1],$bi + adcs @acc[2],@acc[3],@tmp[2] + mul @tmp[2],@a[2],$bi + adcs @acc[3],@acc[4],@tmp[3] + mul @tmp[3],@a[3],$bi + adcs @acc[4],@acc[5],@tmp[4] + mul @tmp[4],@a[4],$bi + adcs @acc[5],@acc[6],@tmp[5] + mul @tmp[5],@a[5],$bi + adc @acc[6],xzr,xzr + + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@a[0],$bi + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@a[1],$bi + adcs @acc[2],@acc[2],@tmp[2] + mul $n0,$n0,@acc[0] + umulh @tmp[2],@a[2],$bi + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@a[3],$bi + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@a[4],$bi + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@a[5],$bi + adc @acc[6],@acc[6],xzr + + adds @acc[1],@acc[1],@tmp[0] + mul @tmp[0],@mod[0],$n0 + adcs @acc[2],@acc[2],@tmp[1] + mul @tmp[1],@mod[1],$n0 + adcs @acc[3],@acc[3],@tmp[2] + mul @tmp[2],@mod[2],$n0 + adcs @acc[4],@acc[4],@tmp[3] + mul @tmp[3],@mod[3],$n0 + adcs @acc[5],@acc[5],@tmp[4] + mul @tmp[4],@mod[4],$n0 + adc @acc[6],@acc[6],@tmp[5] + mul @tmp[5],@mod[5],$n0 +___ +} +$code.=<<___; + adds @acc[0],@acc[0],@tmp[0] + umulh @tmp[0],@mod[0],$n0 + adcs @acc[1],@acc[1],@tmp[1] + umulh @tmp[1],@mod[1],$n0 + adcs @acc[2],@acc[2],@tmp[2] + umulh @tmp[2],@mod[2],$n0 + adcs @acc[3],@acc[3],@tmp[3] + umulh @tmp[3],@mod[3],$n0 + adcs @acc[4],@acc[4],@tmp[4] + umulh @tmp[4],@mod[4],$n0 + adcs @acc[5],@acc[5],@tmp[5] + umulh @tmp[5],@mod[5],$n0 + adc @acc[6],@acc[6],xzr + ldp $n0,$b_ptr,[x29,#96] // pull r_ptr + + adds @a[0],@acc[1],@tmp[0] + adcs @a[1],@acc[2],@tmp[1] + adcs @a[2],@acc[3],@tmp[2] + adcs @a[3],@acc[4],@tmp[3] + adcs @a[4],@acc[5],@tmp[4] + adcs @a[5],@acc[6],@tmp[5] + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + paciasp + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + adds @a[0],@a[0],@a[0] + adcs @a[1],@a[1],@a[1] + adcs @a[2],@a[2],@a[2] + adcs @a[3],@a[3],@a[3] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + paciasp + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov $n0,$b_ptr + ldp @mod[0],@mod[1],[$a_ptr] + ldp @mod[2],@mod[3],[$a_ptr,#16] + ldp @mod[4],@mod[5],[$a_ptr,#32] + mov $a_ptr,$r_ptr + + bl __mul_by_1_mont_384 + add $a_ptr,$a_ptr,#48 + + and $b_ptr,@a[0],#1 + orr $n_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $n_ptr,$n_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $n_ptr,$n_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $n_ptr,$n_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $n_ptr,$n_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $b_ptr,$b_ptr,$bi + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and $r_ptr,@a[0],#1 + orr $a_ptr,@a[0],@a[1] + adds @a[0],@a[0],@a[0] + orr $a_ptr,$a_ptr,@a[2] + adcs @a[1],@a[1],@a[1] + orr $a_ptr,$a_ptr,@a[3] + adcs @a[2],@a[2],@a[2] + orr $a_ptr,$a_ptr,@a[4] + adcs @a[3],@a[3],@a[3] + orr $a_ptr,$a_ptr,@a[5] + adcs @a[4],@a[4],@a[4] + adcs @a[5],@a[5],@a[5] + adc $bi,xzr,xzr + + subs @a[0],@a[0],@mod[0] + sbcs @a[1],@a[1],@mod[1] + sbcs @a[2],@a[2],@mod[2] + sbcs @a[3],@a[3],@mod[3] + sbcs @a[4],@a[4],@mod[4] + sbcs @a[5],@a[5],@mod[5] + sbc $bi,$bi,xzr + + mvn $bi,$bi + and $bi,$bi,#2 + orr $r_ptr,$r_ptr,$bi + + cmp $n_ptr,#0 + csel $n_ptr,$r_ptr,$b_ptr,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp $a_ptr,#0 + csel $a_ptr,$r_ptr,$b_ptr,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and $n_ptr,$n_ptr,#1 + and $a_ptr,$a_ptr,#2 + orr $r_ptr,$a_ptr,$n_ptr // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + autiasp + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ + +if (0) { +my @b = ($bi, @mod[0..4]); +my @comba = @acc[4..6]; + +$code.=<<___; +.type __mul_384_comba,%function +.align 5 +__mul_384_comba: + ldp @a[0],@a[1],[$a_ptr] + ldp @b[0],@b[1],[$b_ptr] + ldp @a[2],@a[3],[$a_ptr,#16] + ldp @a[4],@a[5],[$a_ptr,#32] + ldp @b[2],@b[3],[$b_ptr,#16] + ldp @b[4],@b[5],[$b_ptr,#32] + + mul @comba[0],@a[0],@b[0] + umulh @comba[1],@a[0],@b[0] + mul @acc[0],@a[1],@b[0] + umulh @acc[1],@a[1],@b[0] + str @comba[0],[$r_ptr] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[0],@b[1] + umulh @acc[3],@a[0],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],xzr, @acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[2],@b[0] + umulh @acc[1],@a[2],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#8] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[1],@b[1] + umulh @acc[3],@a[1],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[0],@b[2] + umulh @acc[1],@a[0],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[3],@b[0] + umulh @acc[3],@a[3],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#16] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[2],@b[1] + umulh @acc[1],@a[2],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[1],@b[2] + umulh @acc[3],@a[1],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[0],@b[3] + umulh @acc[1],@a[0],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[4],@b[0] + umulh @acc[3],@a[4],@b[0] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#24] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[3],@b[1] + umulh @acc[1],@a[3],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[2],@b[2] + umulh @acc[3],@a[2],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[3] + umulh @acc[1],@a[1],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[4] + umulh @acc[3],@a[0],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[0] + umulh @acc[1],@a[5],@b[0] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#32] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[1] + umulh @acc[3],@a[4],@b[1] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[2] + umulh @acc[1],@a[3],@b[2] + adds @comba[0],@comba[0],@acc[2] + adcs 
@comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[3] + umulh @acc[3],@a[2],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[4] + umulh @acc[1],@a[1],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[0],@b[5] + umulh @acc[3],@a[0],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[1] + umulh @acc[1],@a[5],@b[1] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#40] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[2] + umulh @acc[3],@a[4],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[3],@b[3] + umulh @acc[1],@a[3],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[2],@b[4] + umulh @acc[3],@a[2],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[1],@b[5] + umulh @acc[1],@a[1],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[2] + umulh @acc[3],@a[5],@b[2] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#48] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[3] + umulh @acc[1],@a[4],@b[3] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[4] + umulh @acc[3],@a[3],@b[4] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[2],@b[5] + umulh @acc[1],@a[2],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + mul @acc[2],@a[5],@b[3] + umulh @acc[3],@a[5],@b[3] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#56] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[0],@a[4],@b[4] + umulh @acc[1],@a[4],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],xzr,xzr + mul @acc[2],@a[3],@b[5] + umulh @acc[3],@a[3],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],@comba[2],xzr + mul @acc[0],@a[5],@b[4] + umulh @acc[1],@a[5],@b[4] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#64] +___ + push(@comba,shift(@comba)); +$code.=<<___; + mul @acc[2],@a[4],@b[5] + umulh @acc[3],@a[4],@b[5] + adds @comba[0],@comba[0],@acc[0] + adcs @comba[1],@comba[1],@acc[1] + adc @comba[2],xzr,xzr + mul @acc[0],@a[5],@b[5] + umulh @acc[1],@a[5],@b[5] + adds @comba[0],@comba[0],@acc[2] + adcs @comba[1],@comba[1],@acc[3] + adc @comba[2],@comba[2],xzr + str @comba[0],[$r_ptr,#72] +___ + push(@comba,shift(@comba)); +$code.=<<___; + adds @comba[0],@comba[0],@acc[0] + adc @comba[1],@comba[1],@acc[1] + stp @comba[0],@comba[1],[$r_ptr,#80] + + ret +.size __mul_384_comba,.-__mul_384_comba +___ +} +print $code; + +close STDOUT; diff --git a/blst/asm/mulq_mont_256-x86_64.pl b/blst/asm/mulq_mont_256-x86_64.pl new file mode 100755 index 0000000..12e58bb --- /dev/null +++ 
b/blst/asm/mulq_mont_256-x86_64.pl @@ -0,0 +1,513 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# As for "sparse" in subroutine names, see commentary in the +# asm/mulx_mont_256-x86_64.pl module. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +{ ############################################################## 256 bits +my @acc=map("%r$_",(9..15)); + +{ ############################################################## mulq +my ($hi, $a0) = ("%rbp", $r_ptr); + +$code.=<<___; +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,\@function,5,"unwind" +.align 32 +mul_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[4] + mov 8*1($a_ptr), @acc[5] + mov 8*2($a_ptr), @acc[3] + mov 8*3($a_ptr), $hi + mov $b_org, $b_ptr # evacuate from %rdx + + mov %rax, @acc[6] + mulq @acc[4] # a[0]*b[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,\@function,4,"unwind" +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov 8*0($a_ptr), %rax + mov $n_ptr, $n0 + mov 8*1($a_ptr), @acc[5] + mov $b_org, $n_ptr + mov 8*2($a_ptr), @acc[3] + lea ($a_ptr), $b_ptr + mov 8*3($a_ptr), $hi + + mov %rax, @acc[6] + mulq %rax # a[0]*a[0] + mov %rax, @acc[0] + mov @acc[6], %rax + mov %rdx, @acc[1] + call __mulq_mont_sparse_256 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +___ +{ +my @acc=@acc; +$code.=<<___; +.type __mulq_mont_sparse_256,\@abi-omnipotent +.align 32 +__mulq_mont_sparse_256: + mulq @acc[5] # a[1]*b[0] + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[3] # a[2]*b[0] + add %rax, @acc[2] + mov 
@acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq $hi # a[3]*b[0] + add %rax, @acc[3] + mov 8($b_ptr), %rax + adc \$0, %rdx + xor @acc[5], @acc[5] + mov %rdx, @acc[4] + +___ +for (my $i=1; $i<4; $i++) { +my $b_next = $i<3 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], $a0 + imulq $n0, @acc[0] + + ################################# Multiply by b[$i] + mov %rax, @acc[6] + mulq 8*0($a_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*1($a_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($a_ptr) + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($a_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc %rdx, @acc[5] # can't overflow + xor @acc[6], @acc[6] + + ################################# reduction + mulq 8*0($n_ptr) + add %rax, $a0 # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $a0 + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $a0, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + adc \$0, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq $n0, %rax + mov 8(%rsp), $a_ptr # restore $r_ptr + + ################################# last reduction + mov %rax, @acc[6] + mulq 8*0($n_ptr) + add %rax, @acc[0] # guaranteed to be zero + mov @acc[6], %rax + adc %rdx, @acc[0] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[6], %rax + adc \$0, %rdx + add @acc[0], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + mov @acc[2], $b_ptr + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add %rdx, @acc[4] + adc \$0, @acc[5] + + ################################# + # Branch-less conditional subtraction of modulus + + mov @acc[3], @acc[0] + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + sbb 8*2($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*3($n_ptr), @acc[4] + sbb \$0, @acc[5] + + cmovc %rax, @acc[1] + cmovc $b_ptr, @acc[2] + cmovc @acc[0], @acc[3] + mov @acc[1], 8*0($a_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*1($a_ptr) + mov @acc[3], 8*2($a_ptr) + mov @acc[4], 8*3($a_ptr) + + ret +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +___ +} } +{ my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,\@function,4,"unwind" +.align 32 +from_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + ################################# + # Branch-less conditional acc[0:3] - modulus + + #mov @acc[4], %rax # __mulq_by_1_mont_256 does it + mov @acc[5], @acc[1] + mov @acc[6], @acc[2] + mov @acc[0], @acc[3] + + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + sbb 8*3($n_ptr), @acc[0] + + cmovnc 
@acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,\@function,4,"unwind" +.align 32 +redc_mont_256: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_256 + + add 8*4($a_ptr), @acc[4] # accumulate upper half + adc 8*5($a_ptr), @acc[5] + mov @acc[4], %rax + adc 8*6($a_ptr), @acc[6] + mov @acc[5], @acc[1] + adc 8*7($a_ptr), @acc[0] + sbb $a_ptr, $a_ptr + + ################################# + # Branch-less conditional acc[0:4] - modulus + + mov @acc[6], @acc[2] + sub 8*0($n_ptr), @acc[4] + sbb 8*1($n_ptr), @acc[5] + sbb 8*2($n_ptr), @acc[6] + mov @acc[0], @acc[3] + sbb 8*3($n_ptr), @acc[0] + sbb \$0, $a_ptr + + cmovnc @acc[4], %rax + cmovnc @acc[5], @acc[1] + cmovnc @acc[6], @acc[2] + mov %rax, 8*0($r_ptr) + cmovnc @acc[0], @acc[3] + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +___ +{ +my @acc=@acc; + +$code.=<<___; +.type __mulq_by_1_mont_256,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_256: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + + mov %rax, @acc[4] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<4; $i++) { +my $hi = @acc[4]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[4] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[4] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[4], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) +___ +$code.=<<___ if ($i<3); + mov @acc[1], @acc[5] + imulq $n0, @acc[1] +___ +$code.=<<___; + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 +___ +} } } + +print $code; +close STDOUT; diff --git a/blst/asm/mulq_mont_384-x86_64.pl b/blst/asm/mulq_mont_384-x86_64.pl new file mode 100755 index 0000000..3812319 --- /dev/null +++ b/blst/asm/mulq_mont_384-x86_64.pl @@ -0,0 +1,2675 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +# common argument layout +($r_ptr,$a_ptr,$b_org,$n_ptr,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); +$b_ptr = "%rbx"; + +# common accumulator layout +@acc=map("%r$_",(8..15)); + +######################################################################## +{ my @acc=(@acc,"%rax","%rbx","%rbp",$a_ptr); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.text + +######################################################################## +# Double-width subtraction modulo n<<384, as opposite to naively +# expected modulo n*n. It works because n<<384 is the actual +# input boundary condition for Montgomery reduction, not n*n. +# Just in case, this is duplicated, but only one module is +# supposed to be linked... +.type __sub_mod_384x384,\@abi-omnipotent +.align 32 +__sub_mod_384x384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + mov 8*6($a_ptr), @acc[6] + + sub 8*0($b_org), @acc[0] + mov 8*7($a_ptr), @acc[7] + sbb 8*1($b_org), @acc[1] + mov 8*8($a_ptr), @acc[8] + sbb 8*2($b_org), @acc[2] + mov 8*9($a_ptr), @acc[9] + sbb 8*3($b_org), @acc[3] + mov 8*10($a_ptr), @acc[10] + sbb 8*4($b_org), @acc[4] + mov 8*11($a_ptr), @acc[11] + sbb 8*5($b_org), @acc[5] + mov @acc[0], 8*0($r_ptr) + sbb 8*6($b_org), @acc[6] + mov 8*0($n_ptr), @acc[0] + mov @acc[1], 8*1($r_ptr) + sbb 8*7($b_org), @acc[7] + mov 8*1($n_ptr), @acc[1] + mov @acc[2], 8*2($r_ptr) + sbb 8*8($b_org), @acc[8] + mov 8*2($n_ptr), @acc[2] + mov @acc[3], 8*3($r_ptr) + sbb 8*9($b_org), @acc[9] + mov 8*3($n_ptr), @acc[3] + mov @acc[4], 8*4($r_ptr) + sbb 8*10($b_org), @acc[10] + mov 8*4($n_ptr), @acc[4] + mov @acc[5], 8*5($r_ptr) + sbb 8*11($b_org), @acc[11] + mov 8*5($n_ptr), @acc[5] + sbb $b_org, $b_org + + and $b_org, @acc[0] + and $b_org, @acc[1] + and $b_org, @acc[2] + and $b_org, @acc[3] + and $b_org, @acc[4] + and $b_org, @acc[5] + + add @acc[0], @acc[6] + adc @acc[1], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[2], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[3], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[4], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[5], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,\@abi-omnipotent +.align 32 +__add_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*0($b_org), @acc[0] + adc 8*1($b_org), @acc[1] + adc 8*2($b_org), @acc[2] + mov @acc[0], @acc[6] + adc 8*3($b_org), @acc[3] + mov @acc[1], @acc[7] + adc 8*4($b_org), @acc[4] + mov @acc[2], @acc[8] + adc 8*5($b_org), @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, 
$b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc @acc[9], @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,\@abi-omnipotent +.align 32 +__sub_mod_384: + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + +__sub_mod_384_a_is_loaded: + sub 8*0($b_org), @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb 8*1($b_org), @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb 8*2($b_org), @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb 8*3($b_org), @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb 8*4($b_org), @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb 8*5($b_org), @acc[5] + mov 8*5($n_ptr), @acc[11] + sbb $b_org, $b_org + + and $b_org, @acc[6] + and $b_org, @acc[7] + and $b_org, @acc[8] + and $b_org, @acc[9] + and $b_org, @acc[10] + and $b_org, @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[8], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[9], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[10], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[11], @acc[5] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __sub_mod_384,.-__sub_mod_384 +___ +} + +######################################################################## +# "Complex" multiplication and squaring. Use vanilla multiplication when +# possible to fold reductions. I.e. instead of mul_mont, mul_mont +# followed by add/sub_mod, it calls mul, mul, double-width add/sub_mod +# followed by *common* reduction... 
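+#
+# Concretely, with i^2 = -1 the product is assembled Karatsuba-style as
+#
+#	(a->re + a->im*i)*(b->re + b->im*i) =
+#	      (a->re*b->re - a->im*b->im)
+#	    + ((a->re + a->im)*(b->re + b->im) - a->re*b->re - a->im*b->im)*i
+#
+# i.e. three 384x384-bit multiplications and only two Montgomery
+# reductions. A rough C-level sketch of the schedule that mul_mont_384x
+# below follows (illustrative only, not part of the library; the *_ref
+# helper names are hypothetical stand-ins for __mulq_384, __add_mod_384,
+# __sub_mod_384x384 and the __mulq_by_1_mont_384/__redc_tail_mont_384 pair):
+#
+#	mul_384_ref(t0, a->re, b->re);          /* 768-bit t0 = a.re*b.re */
+#	mul_384_ref(t1, a->im, b->im);          /* 768-bit t1 = a.im*b.im */
+#	add_mod_384_ref(s0, a->re, a->im, N);   /* s0 = a.re + a.im       */
+#	add_mod_384_ref(s1, b->re, b->im, N);   /* s1 = b.re + b.im       */
+#	mul_384_ref(t2, s0, s1);                /* 768-bit t2 = s0*s1     */
+#	sub_mod_384x384_ref(t2, t2, t0, N);     /* t2 -= t0               */
+#	sub_mod_384x384_ref(t2, t2, t1, N);     /* t2 -= t1, cross terms  */
+#	sub_mod_384x384_ref(t0, t0, t1, N);     /* t0 -= t1, real part    */
+#	redc_mont_384_ref(ret->re, t0, N, n0);  /* one reduction per half */
+#	redc_mont_384_ref(ret->im, t2, N, n0);
+#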
+{ my $frame = 5*8 + # place for argument off-load + + 3*768/8; # place for 3 768-bit temporary vectors +$code.=<<___; +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,\@function,5,"unwind" +.align 32 +mul_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $b_org, $b_ptr + mov $r_ptr, 8*4(%rsp) # offload arguments + mov $a_ptr, 8*3(%rsp) + mov $b_org, 8*2(%rsp) + mov $n_ptr, 8*1(%rsp) + mov $n0, 8*0(%rsp) + + ################################# mul_384(t0, a->re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulq_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 48($a_ptr), $a_ptr # a->im + lea 40+96(%rsp), $r_ptr # t1 + call __mulq_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea -48($a_ptr), $b_org + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*2(%rsp), $a_ptr + lea 48($a_ptr), $b_org + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulq_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2=t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2=t2-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + mov $n_ptr, $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +$code.=<<___; +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,\@function,4,"unwind" +.align 32 +sqr_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $a_ptr, 8*2(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, 
a->im); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + call __mulq_mont_384 +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $b_org, $b_org + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $b_org + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($r_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($r_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($r_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,\@function,4,"unwind" +.align 32 +mul_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), 
@acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulq_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulq_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulq_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mul_382x,.-mul_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,\@function,3,"unwind" +.align 32 +sqr_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], @acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulq_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulq_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 
8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_382x,.-sqr_382x +___ +} +{ ########################################################## 384-bit mul +my @acc=map("%r$_",("cx",8..12)); +my $bi = "%rbp"; + +$code.=<<___; +.globl mul_384 +.hidden mul_384 +.type mul_384,\@function,3,"unwind" +.align 32 +mul_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 +.cfi_end_prologue + + mov $b_org, $b_ptr + call __mulq_384 + + mov 0(%rsp),%r12 +.cfi_restore %r12 + mov 8(%rsp),%rbx +.cfi_restore %rbx + mov 16(%rsp),%rbp +.cfi_restore %rbp + lea 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 +.cfi_epilogue + ret +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,\@abi-omnipotent +.align 32 +__mulq_384: + mov 8*0($b_ptr), %rax + + mov %rax, $bi + mulq 8*0($a_ptr) + mov %rax, 8*0($r_ptr) + mov $bi, %rax + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[4] + mov 8*1($b_ptr), %rax + adc \$0, %rdx + mov %rdx, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov @acc[0], 8*$i($r_ptr) + mov %rdx, @acc[0] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[1], @acc[0] + adc \$0, %rdx + mov %rdx, @acc[1] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[2], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[3], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[3] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[4], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[5], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulq_384,.-__mulq_384 +___ +} +if (0) { ############################################################## +my @b=map("%r$_",(10..15)); +my @a=reverse(@b); + @b[5]=$b_ptr; +my $bi = "%rbp"; +my @comba=map("%r$_",("cx",8,9)); +# a[0]*b[0] +# a[1]*b[0] +# a[0]*b[1] +# a[2]*b[0] +# a[1]*b[1] +# a[0]*b[2] +# a[3]*b[0] +# a[2]*b[1] +# a[1]*b[2] +# a[0]*b[3] +# a[4]*b[0] +# a[3]*b[1] +# a[2]*b[2] +# a[1]*b[3] +# a[0]*b[4] +# a[5]*b[0] +# a[4]*b[1] +# a[3]*b[2] +# a[2]*b[3] +# a[1]*b[4] +# a[0]*b[5] +# a[5]*b[1] +# a[4]*b[2] +# a[3]*b[3] +# a[2]*b[4] +# a[1]*b[5] +# a[5]*b[2] +# a[4]*b[3] +# a[3]*b[4] +# a[2]*b[5] +# a[5]*b[3] +# a[4]*b[4] +# a[3]*b[5] +# a[5]*b[4] +# a[4]*b[5] +# a[5]*b[5] +# +# 13% less instructions give +15% on Core2, +10% on Goldmont, +# -0% on Sandy Bridge, but -16% on Haswell:-( +# [for reference +5% on Skylake, +11% on Ryzen] + +$code.=<<___; +.type __mulq_comba_384,\@abi-omnipotent +.align 32 +__mulq_comba_384: + mov 8*0($b_ptr), %rax + mov 8*0($a_ptr), @a[0] + mov 8*1($a_ptr), @a[1] + mov 8*1($b_ptr), @b[1] + + mov %rax, @b[0] + mulq @a[0] # a[0]*b[0] + mov %rax, 8*0($r_ptr) + mov @b[0], %rax + mov %rdx, @comba[0] + + ################################# + mov 8*2($a_ptr), @a[2] + xor @comba[2], @comba[2] + mulq @a[1] # a[1]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc \$0, %rdx + mov 8*2($b_ptr), @b[2] + mov %rdx, @comba[1] + + mulq @a[0] # a[0]*b[1] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*1($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[2] # a[2]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[0] # a[0]*b[2] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*2($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*3($a_ptr) # a[3]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[2] # a[2]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[2] + add %rax, @comba[0] + mov 8*3($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[3] + mulq @a[0] # a[0]*b[3] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 
8*3($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq 8*4($a_ptr) # a[4]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[1] # a[1]*b[3] + add %rax, @comba[0] + mov 8*4($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[4] + mulq @a[0] # a[0]*b[4] + add %rax, @comba[0] + mov @b[0], %rax + adc %rdx, @comba[1] + mov 8*5($a_ptr), @a[5] + adc \$0, @comba[2] + mov @comba[0], 8*4($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[0] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*4($a_ptr) # a[4]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[4] + add %rax, @comba[0] + mov 8*5($b_ptr), %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mov %rax, @b[5] + mulq @a[0] # a[0]*b[5] + add %rax, @comba[0] + mov @b[1], %rax + adc %rdx, @comba[1] + mov 8*4($a_ptr), @a[4] + adc \$0, @comba[2] + mov @comba[0], 8*5($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[1] + add %rax, @comba[0] + mov @b[2], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*3($a_ptr) # a[3]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*1($a_ptr) # a[1]*b[5] + add %rax, @comba[0] + mov $b[2], %rax + adc %rdx, @comba[1] + mov 8*3($a_ptr), @a[3] + adc \$0, @comba[2] + mov @comba[0], 8*6($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[2] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq 8*2($a_ptr) # a[2]*b[5] + add %rax, @comba[0] + mov @b[3], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*7($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[3] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[3] # a[3]*b[5] + add %rax, @comba[0] + mov @b[4], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov @comba[0], 8*8($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + xor @comba[2], @comba[2] + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + + mulq @a[4] # a[4]*b[5] + add %rax, @comba[0] + mov @b[5], %rax + adc %rdx, @comba[1] + adc \$0, @comba[2] + mov 
@comba[0], 8*9($r_ptr) +___ + push(@comba,shift(@comba)); +$code.=<<___; + mulq @a[5] # a[5]*b[4] + add %rax, @comba[0] + adc %rdx, @comba[1] + + mov @comba[0], 8*10($r_ptr) + mov @comba[1], 8*11($r_ptr) + + ret +.size __mulq_comba_384,.-__mulq_comba_384 +___ +} +{ ########################################################## 384-bit sqr +my @acc=(@acc,"%rcx","%rbx","%rbp",$a_ptr); +my $hi; + +$code.=<<___; +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,\@function,2,"unwind" +.align 32 +sqr_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrq_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,\@abi-omnipotent +.align 32 +__sqrq_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + + ######################################### + mov %rax, @acc[6] + mulq @acc[7] # a[1]*a[0] + mov %rax, @acc[1] + mov @acc[6], %rax + mov 8*4($a_ptr), @acc[10] + mov %rdx, @acc[2] + + mulq @acc[8] # a[2]*a[0] + add %rax, @acc[2] + mov @acc[6], %rax + adc \$0, %rdx + mov 8*5($a_ptr), @acc[11] + mov %rdx, @acc[3] + + mulq @acc[9] # a[3]*a[0] + add %rax, @acc[3] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq @acc[10] # a[4]*a[0] + add %rax, @acc[4] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq @acc[11] # a[5]*a[0] + add %rax, @acc[5] + mov @acc[6], %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq %rax # a[0]*a[0] + xor @acc[0], @acc[0] + mov %rax, 8*0($r_ptr) + mov @acc[7], %rax + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[0] + add %rdx, @acc[1] # accumulate a[0]*a[0] + adc \$0, @acc[0] # carries to a[1]*a[1] + mov @acc[1], 8*1($r_ptr) +___ +$hi=@acc[1]; +$code.=<<___; + ######################################### + mulq @acc[8] # a[2]*a[1] + add %rax, @acc[3] + mov @acc[7], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[9] # a[3]*a[1] + add %rax, @acc[4] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[1] + add %rax, @acc[5] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[1] + add %rax, @acc[6] + mov @acc[7], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq %rax # a[1]*a[1] + xor @acc[1], @acc[1] + add %rax, @acc[0] # can't carry + mov @acc[8], %rax + add @acc[2], @acc[2] # double acc[2:3] + adc @acc[3], @acc[3] + adc \$0, @acc[1] + add @acc[0], @acc[2] # accumulate a[1]*a[1] + adc %rdx, @acc[3] + adc \$0, @acc[1] # carries to a[2]*a[2] + mov @acc[2], 8*2($r_ptr) +___ +$hi=@acc[0]; +$code.=<<___; + ######################################### + mulq @acc[9] # a[3]*a[2] + add %rax, @acc[5] + mov @acc[8], %rax + adc \$0, %rdx + mov @acc[3], 8*3($r_ptr) + mov %rdx, $hi + + mulq @acc[10] # a[4]*a[2] + add %rax, @acc[6] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[6] + adc \$0, %rdx + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[2] + add %rax, @acc[7] + mov @acc[8], %rax + adc \$0, %rdx + add $hi, @acc[7] + adc \$0, %rdx + mov %rdx, 
@acc[8] + + mulq %rax # a[2]*a[2] + xor @acc[3], @acc[3] + add %rax, @acc[1] # can't carry + mov @acc[9], %rax + add @acc[4], @acc[4] # double acc[4:5] + adc @acc[5], @acc[5] + adc \$0, @acc[3] + add @acc[1], @acc[4] # accumulate a[2]*a[2] + adc %rdx, @acc[5] + adc \$0, @acc[3] # carries to a[3]*a[3] + mov @acc[4], 8*4($r_ptr) + + ######################################### + mulq @acc[10] # a[4]*a[3] + add %rax, @acc[7] + mov @acc[9], %rax + adc \$0, %rdx + mov @acc[5], 8*5($r_ptr) + mov %rdx, $hi + + mulq @acc[11] # a[5]*a[3] + add %rax, @acc[8] + mov @acc[9], %rax + adc \$0, %rdx + add $hi, @acc[8] + adc \$0, %rdx + mov %rdx, @acc[9] + + mulq %rax # a[3]*a[3] + xor @acc[4], @acc[4] + add %rax, @acc[3] # can't carry + mov @acc[10], %rax + add @acc[6], @acc[6] # double acc[6:7] + adc @acc[7], @acc[7] + adc \$0, @acc[4] + add @acc[3], @acc[6] # accumulate a[3]*a[3] + adc %rdx, @acc[7] + mov @acc[6], 8*6($r_ptr) + adc \$0, @acc[4] # carries to a[4]*a[4] + mov @acc[7], 8*7($r_ptr) + + ######################################### + mulq @acc[11] # a[5]*a[4] + add %rax, @acc[9] + mov @acc[10], %rax + adc \$0, %rdx + mov %rdx, @acc[10] + + mulq %rax # a[4]*a[4] + xor @acc[5], @acc[5] + add %rax, @acc[4] # can't carry + mov @acc[11], %rax + add @acc[8], @acc[8] # double acc[8:9] + adc @acc[9], @acc[9] + adc \$0, @acc[5] + add @acc[4], @acc[8] # accumulate a[4]*a[4] + adc %rdx, @acc[9] + mov @acc[8], 8*8($r_ptr) + adc \$0, @acc[5] # carries to a[5]*a[5] + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulq %rax # a[5]*a[5] + add @acc[5], %rax # can't carry + add @acc[10], @acc[10] # double acc[10] + adc \$0, %rdx + add @acc[10], %rax # accumulate a[5]*a[5] + adc \$0, %rdx + mov %rax, 8*10($r_ptr) + mov %rdx, 8*11($r_ptr) + + ret +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,\@function,4,"unwind" +.align 32 +sqr_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*15, %rsp +.cfi_adjust_cfa_offset 8*15 +.cfi_end_prologue + + mov $n_ptr, 8*12(%rsp) # n0 + mov $b_org, 8*13(%rsp) # n_ptr + mov $r_ptr, 8*14(%rsp) + + mov %rsp, $r_ptr + call __sqrq_384 + + lea 0(%rsp), $a_ptr + mov 8*12(%rsp), %rcx # n0 for mul_by_1 + mov 8*13(%rsp), $b_ptr # n_ptr for mul_by_1 + mov 8*14(%rsp), $r_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + lea 8*15(%rsp), %r8 # size optimization + mov 8*15(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*21 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 +___ +} +{ ########################################################## 384-bit redc_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" + +$code.=<<___; +######################################################################## +# void redc_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,\@function,4,"unwind" +.align 32 +redc_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp 
+.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + +######################################################################## +# void from_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,\@function,4,"unwind" +.align 32 +from_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulq_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[6], %rax # __mulq_by_1_mont_384 does it + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size from_mont_384,.-from_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulq_by_1_mont_384: + mov 8*0($a_ptr), %rax + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov %rax, @acc[6] + imulq $n0, %rax + mov %rax, @acc[0] +___ +for (my $i=0; $i<6; $i++) { +my $hi = @acc[6]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[6] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[6] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx +___ +$code.=<<___ if ($i<5); + mov @acc[1], @acc[7] + imulq $n0, @acc[1] +___ +$code.=<<___; + add $hi, @acc[3] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov @acc[1], %rax + adc \$0, %rdx + add $hi, @acc[5] + adc \$0, %rdx + mov %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + 
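The routine above and the tail routine that follows are the two halves of the reduction that redc_mont_384() exposes: __mulq_by_1_mont_384 runs the six per-limb Montgomery reduction steps and __redc_tail_mont_384 folds in the upper half of the input and performs the single conditional subtraction. Below is a minimal C sketch of the combined computation, assuming 64-bit limbs, the compiler extension unsigned __int128 for double-width products, n0 = -m[0]^-1 mod 2^64, and an input below m*2^384 (so one conditional subtraction is enough); the function name and locals are illustrative and are not part of the library.

    #include <stdint.h>

    typedef unsigned __int128 u128;          /* GCC/Clang 128-bit extension */

    /* ret = a * 2^-384 mod m, assuming a < m * 2^384 and
     * n0 = -m[0]^-1 mod 2^64 (hypothetical reference sketch). */
    static void redc_384_sketch(uint64_t ret[6], const uint64_t a[12],
                                const uint64_t m[6], uint64_t n0)
    {
        uint64_t t[13] = {0};
        for (int i = 0; i < 12; i++) t[i] = a[i];

        for (int i = 0; i < 6; i++) {        /* one reduction step per low limb */
            uint64_t k = t[i] * n0;          /* chosen so that limb i cancels */
            u128 c = 0;
            for (int j = 0; j < 6; j++) {
                u128 s = (u128)k * m[j] + t[i + j] + c;
                t[i + j] = (uint64_t)s;
                c = s >> 64;
            }
            for (int j = i + 6; c != 0 && j < 13; j++) {
                u128 s = (u128)t[j] + c;     /* ripple the carry upwards */
                t[j] = (uint64_t)s;
                c = s >> 64;
            }
        }

        /* t[6..12] now holds the result, possibly >= m: subtract m once if so */
        uint64_t r[6], borrow = 0;
        for (int i = 0; i < 6; i++) {
            u128 d = (u128)t[6 + i] - m[i] - borrow;
            r[i] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        int ge = (t[12] != 0) | (borrow == 0);
        for (int i = 0; i < 6; i++)
            ret[i] = ge ? r[i] : t[6 + i];
    }

Unlike the assembly, this sketch branches on the carries and is not constant-time; it is only meant to pin down the arithmetic the two helpers implement together.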
+.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, 
$r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulq_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +___ +} } + +{ ########################################################## mulq_mont +my ($bi, $hi) = ("%rdi", "%rbp"); + +$code.=<<___; +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,\@function,5,"unwind" +.align 32 +mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*3, %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov 8*0($b_org), %rax + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + mov $b_org, $b_ptr # evacuate from %rdx + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + + call __mulq_mont_384 + + mov 24(%rsp),%r15 +.cfi_restore %r15 + mov 32(%rsp),%r14 +.cfi_restore %r14 + mov 40(%rsp),%r13 +.cfi_restore %r13 + mov 48(%rsp),%r12 +.cfi_restore %r12 + mov 56(%rsp),%rbx +.cfi_restore %rbx + mov 64(%rsp),%rbp +.cfi_restore %rbp + lea 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 +.cfi_epilogue + ret +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulq_mont_384,\@abi-omnipotent +.align 32 +__mulq_mont_384: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + xor @acc[7], @acc[7] + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, $hi # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, $hi + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[1] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[2] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*3($n_ptr) + add $hi, @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add $hi, @acc[4] + adc \$0, %rdx + mov %rdx, $hi + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add $hi, @acc[5] + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mov @acc[0], $hi + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($a_ptr) + add @acc[7], @acc[5] + adc \$0, %rdx + xor @acc[7], @acc[7] + add %rax, @acc[5] + mov @acc[0], %rax + adc %rdx, @acc[6] + adc \$0, @acc[7] +___ +} +$code.=<<___; + ################################# + # Branch-less conditional acc[0:6] - modulus + + #mov @acc[0], %rax + mov 8*2(%rsp), $r_ptr # restore $r_ptr + sub 8*0($n_ptr), @acc[0] + mov @acc[1], %rdx + sbb 8*1($n_ptr), @acc[1] + mov @acc[2], $b_ptr + sbb 8*2($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*3($n_ptr), @acc[3] + mov @acc[4], $hi + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[7] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rdx, @acc[1] + cmovc $b_ptr, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc $hi, @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc @acc[7], @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __mulq_mont_384,.-__mulq_mont_384 +___ +} } +$code.=<<___; +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_384: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1, %edx + lea 0($r_ptr), $a_ptr + dec %edx + jnz .Loop_sqr_384 + + movq %xmm2, %rax # 
b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #mov 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 + + lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8*17, %rsp +.cfi_adjust_cfa_offset 8*17 +.cfi_end_prologue + + mov $n0, 8*0(%rsp) + mov $r_ptr, 8*1(%rsp) # to __mulq_mont_384 + mov $n_ptr, 8*2(%rsp) + lea 8*4(%rsp), $r_ptr + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq (%r9), %xmm2 # prefetch b[0] + +.Loop_sqr_383: + movd %edx, %xmm1 # loop counter + + call __sqrq_384 + + lea 0($r_ptr), $a_ptr + mov 8*0(%rsp), %rcx # n0 for mul_by_1 + mov 8*2(%rsp), $b_ptr # n_ptr for mul_by_1 + call __mulq_by_1_mont_384 + + movd %xmm1, %edx # loop counter + add 8*6($a_ptr), @acc[6] # just accumulate upper half + adc 8*7($a_ptr), @acc[7] + adc 8*8($a_ptr), @acc[0] + adc 8*9($a_ptr), @acc[1] + adc 8*10($a_ptr), @acc[2] + adc 8*11($a_ptr), @acc[3] + lea 0($r_ptr), $a_ptr + + mov @acc[6], 8*0($r_ptr) # omitting full reduction gives ~5% + mov @acc[7], 8*1($r_ptr) # in addition-chains + mov @acc[0], 8*2($r_ptr) + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + dec %edx + jnz .Loop_sqr_383 + + movq %xmm2, %rax # b[0] + mov $b_ptr, $n_ptr + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + #movq 8*0($b_ptr), %rax + #mov 8*0($a_ptr), @acc[6] + #mov 8*1($a_ptr), @acc[7] + #mov 8*2($a_ptr), @acc[4] + #mov 8*3($a_ptr), @acc[5] + mov @acc[0], @acc[4] + mov @acc[1], @acc[5] + + call __mulq_mont_384 # formally one can omit full reduction + # even after multiplication... 
+ lea 8*17(%rsp), %r8 # size optimization + mov 8*17(%rsp), %r15 +.cfi_restore %r15 + mov 8*1(%r8), %r14 +.cfi_restore %r14 + mov 8*2(%r8), %r13 +.cfi_restore %r13 + mov 8*3(%r8), %r12 +.cfi_restore %r12 + mov 8*4(%r8), %rbx +.cfi_restore %rbx + mov 8*5(%r8), %rbp +.cfi_restore %rbp + lea 8*6(%r8), %rsp +.cfi_adjust_cfa_offset -8*23 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + my $bi = "%rbp"; + +$code.=<<___; +.type __mulq_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulq_mont_383_nonred: + mov %rax, $bi + mulq @acc[6] # a[0]*b[0] + mov %rax, @acc[0] + mov $bi, %rax + mov %rdx, @acc[1] + + mulq @acc[7] # a[1]*b[0] + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[2] + + mulq @acc[4] # a[2]*b[0] + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[3] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq @acc[5] # a[3]*b[0] + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[4] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[5] + + mulq 8*5($a_ptr) + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +for (my $i=0; $i<6;) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + ################################# reduction $i + mulq 8*0($n_ptr) + add %rax, @acc[7] # guaranteed to be zero + mov @acc[0], %rax + adc %rdx, @acc[7] + + mulq 8*1($n_ptr) + add %rax, @acc[1] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*2($n_ptr) + add %rax, @acc[2] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*3($n_ptr) + add @acc[7], @acc[3] + adc \$0, %rdx + add %rax, @acc[3] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*4($n_ptr) + add %rax, @acc[4] + mov @acc[0], %rax + adc \$0, %rdx + add @acc[7], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[7] + + mulq 8*5($n_ptr) + add %rax, @acc[5] + mov $b_next, %rax + adc \$0, %rdx + add @acc[7], @acc[5] + adc %rdx, @acc[6] +___ + push(@acc,shift(@acc)); +$code.=<<___ if ($i++<5); + ################################# Multiply by b[$i] + mov %rax, $bi + mulq 8*0($a_ptr) + add %rax, @acc[0] + mov $bi, %rax + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*1($a_ptr) + add %rax, @acc[1] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[1] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*2($a_ptr) + add %rax, @acc[2] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[2] + adc \$0, %rdx + mov %rdx, @acc[6] + + mov @acc[0], @acc[7] + imulq 8(%rsp), @acc[0] + + mulq 8*3($a_ptr) + add %rax, @acc[3] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[3] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*4($a_ptr) + add %rax, @acc[4] + mov $bi, %rax + adc \$0, %rdx + add @acc[6], @acc[4] + adc \$0, %rdx + mov %rdx, @acc[6] + + mulq 8*5($a_ptr) + add @acc[6], @acc[5] + adc \$0, %rdx + add %rax, @acc[5] + mov @acc[0], %rax + adc \$0, %rdx + mov %rdx, @acc[6] +___ +} +$code.=<<___; + ret +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +___ +} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions gives 8-11% better performance in add-chains +$code.=<<___; +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,\@function,4,"unwind" +.align 32 +sqr_mont_382x: +.cfi_startproc + push 
%rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $a_ptr, 8*2(%rsp) + mov $r_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*2(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rax # a->im + mov 8*0($a_ptr), @acc[6] # a->re + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[4] + mov 8*3($a_ptr), @acc[5] + + mov 8*3(%rsp), $r_ptr + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($r_ptr) # ret->im + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rax # t1[0] + mov 32+8*0(%rsp), @acc[6] # t0[0..3] + mov 32+8*1(%rsp), @acc[7] + mov 32+8*2(%rsp), @acc[4] + mov 32+8*3(%rsp), @acc[5] + + call __mulq_mont_383_nonred +___ +{ +my @acc = map("%r$_",14,15,8..11, # output from __mulq_mont_384 + 12,13,"ax","bx","bp","si"); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + mov 32+8*0(%rsp), @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[6] + mov 32+8*2(%rsp), @acc[8] + and @acc[11], @acc[7] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[8] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), 
@acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($r_ptr) # ret->re + mov @acc[1], 8*1($r_ptr) + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/mulx_mont_256-x86_64.pl b/blst/asm/mulx_mont_256-x86_64.pl new file mode 100755 index 0000000..0d6bf2e --- /dev/null +++ b/blst/asm/mulx_mont_256-x86_64.pl @@ -0,0 +1,486 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# "Sparse" in subroutine names refers to most significant limb of the +# modulus. Though "sparse" is a bit of misnomer, because limitation is +# just not-all-ones. Or in other words not larger than 2^256-2^192-1. +# In general Montgomery multiplication algorithm can handle one of the +# inputs being non-reduced and capped by 1<re, b->re); + #lea 0($b_btr), $b_ptr # b->re + #lea 0($a_ptr), $a_ptr # a->re + lea 40(%rsp), $r_ptr # t0 + call __mulx_384 + + ################################# mul_384(t1, a->im, b->im); + lea 48($b_ptr), $b_ptr # b->im + lea 128+48($a_ptr), $a_ptr # a->im + lea 96($r_ptr), $r_ptr # t1 + call __mulx_384 + + ################################# mul_384(t2, a->re+a->im, b->re+b->im); + mov 8*1(%rsp), $n_ptr + lea ($b_ptr), $a_ptr # b->re + lea -48($b_ptr), $b_org # b->im + lea 40+192+48(%rsp), $r_ptr + call __add_mod_384 + + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea -48($r_ptr), $r_ptr + call __add_mod_384 + + lea ($r_ptr),$b_ptr + lea 48($r_ptr),$a_ptr + call __mulx_384 + + ################################# t2=t2-t0-t1 + lea ($r_ptr), $a_ptr # t2 + lea 40(%rsp), $b_org # t0 + mov 8*1(%rsp), $n_ptr + call __sub_mod_384x384 # t2-t0 + + lea ($r_ptr), $a_ptr # t2 + lea -96($r_ptr), $b_org # t1 + call __sub_mod_384x384 # t2-t0-t1 + + ################################# t0=t0-t1 + lea 40(%rsp), $a_ptr + lea 40+96(%rsp), $b_org + lea 40(%rsp), $r_ptr + call __sub_mod_384x384 # t0-t1 + + lea ($n_ptr), $b_ptr # n_ptr for redc_mont_384 + + ################################# redc_mont_384(ret->re, t0, mod, n0); + lea 40(%rsp), $a_ptr # t0 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + mov 8*4(%rsp), $r_ptr # ret->re + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + ################################# redc_mont_384(ret->im, t2, mod, n0); + lea 40+192(%rsp), $a_ptr # t2 + mov 8*0(%rsp), %rcx # n0 for redc_mont_384 + lea 48($r_ptr), $r_ptr # ret->im + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +___ 
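Both sqr_mont_382x and mulx_mont_384x above rely on the usual shortcuts for Fp2 = Fp[i]/(i^2+1): a full multiplication costs three base-field multiplications (a.re*b.re, a.im*b.im and (a.re+a.im)*(b.re+b.im)) instead of four, and a squaring costs two ((a.re+a.im)*(a.re-a.im) and a.re*a.im doubled). The toy program below shows only that structure over a small stand-in prime; the type and helper names are illustrative, it is not in Montgomery form, and it sidesteps the borrow-mask sign fix-up that sqr_mont_382x keeps at 32+8*12(%rsp) by reducing modulo P on every operation.

    #include <stdint.h>
    #include <stdio.h>

    #define P 1000003ULL             /* small stand-in prime, not the 384-bit modulus */

    typedef struct { uint64_t re, im; } fp2;   /* re + im*i with i^2 = -1 */

    static uint64_t addm(uint64_t a, uint64_t b) { return (a + b) % P; }
    static uint64_t subm(uint64_t a, uint64_t b) { return (a + P - b) % P; }
    static uint64_t mulm(uint64_t a, uint64_t b) { return (a * b) % P; }

    /* three multiplications, the pattern of mulx_mont_384x:
     * re = a.re*b.re - a.im*b.im
     * im = (a.re+a.im)*(b.re+b.im) - a.re*b.re - a.im*b.im */
    static fp2 fp2_mul(fp2 a, fp2 b)
    {
        uint64_t t0 = mulm(a.re, b.re);
        uint64_t t1 = mulm(a.im, b.im);
        uint64_t t2 = mulm(addm(a.re, a.im), addm(b.re, b.im));
        return (fp2){ subm(t0, t1), subm(subm(t2, t0), t1) };
    }

    /* two multiplications, the pattern of sqr_mont_382x:
     * re = (a.re+a.im)*(a.re-a.im),  im = 2*a.re*a.im */
    static fp2 fp2_sqr(fp2 a)
    {
        return (fp2){ mulm(addm(a.re, a.im), subm(a.re, a.im)),
                      mulm(addm(a.re, a.re), a.im) };
    }

    int main(void)
    {
        fp2 a = { 12345, 678901 };
        fp2 s = fp2_sqr(a), m = fp2_mul(a, a);
        printf("sqr/mul agree: %s\n",
               (s.re == m.re && s.im == m.im) ? "yes" : "no");
        return 0;
    }

Trading the fourth multiplication for a few additions is worthwhile at this size because each 384-bit product costs 36 word multiplications, while the additions and subtractions are linear in the limb count.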
+} +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # alignment +$code.=<<___; +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,\@function,4,"unwind" +.align 32 +sqrx_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + # gap for __mulx_mont_384 + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# add_mod_384(t0, a->re, a->im); + lea 48($a_ptr), $b_org # a->im + lea 32(%rsp), $r_ptr # t0 + call __add_mod_384 + + ################################# sub_mod_384(t1, a->re, a->im); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_org # a->im + lea 32+48(%rsp), $r_ptr # t1 + call __sub_mod_384 + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + mov @acc[0], @acc[6] + adc @acc[3], @acc[3] + mov @acc[1], @acc[7] + adc @acc[4], @acc[4] + mov @acc[2], @acc[8] + adc @acc[5], @acc[5] + mov @acc[3], @acc[9] + sbb $a_ptr, $a_ptr + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[10] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], @acc[11] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $a_ptr + + cmovc @acc[6], @acc[0] + cmovc @acc[7], @acc[1] + cmovc @acc[8], @acc[2] + mov @acc[0], 8*6($b_ptr) # ret->im + cmovc @acc[9], @acc[3] + mov @acc[1], 8*7($b_ptr) + cmovc @acc[10], @acc[4] + mov @acc[2], 8*8($b_ptr) + cmovc @acc[11], @acc[5] + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32(%rsp), $a_ptr # t0 + lea 32+48(%rsp), $b_ptr # t1 + + mov 32+48(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,\@function,4,"unwind" +.align 32 +mulx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push 
%rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + lea 96($r_ptr), $r_ptr # ret->im + mov $a_ptr, 8*0(%rsp) + mov $b_org, 8*1(%rsp) + mov $r_ptr, 8*2(%rsp) # offload ret->im + mov $n_ptr, 8*3(%rsp) + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[0] + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + add 8*6($a_ptr), @acc[0] + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + + mov @acc[0], 32+8*0(%rsp) + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + ################################# t1 = b->re + b->im + mov 8*0($b_org), @acc[0] + mov 8*1($b_org), @acc[1] + mov 8*2($b_org), @acc[2] + mov 8*3($b_org), @acc[3] + mov 8*4($b_org), @acc[4] + mov 8*5($b_org), @acc[5] + + add 8*6($b_org), @acc[0] + adc 8*7($b_org), @acc[1] + adc 8*8($b_org), @acc[2] + adc 8*9($b_org), @acc[3] + adc 8*10($b_org), @acc[4] + adc 8*11($b_org), @acc[5] + + mov @acc[0], 32+8*6(%rsp) + mov @acc[1], 32+8*7(%rsp) + mov @acc[2], 32+8*8(%rsp) + mov @acc[3], 32+8*9(%rsp) + mov @acc[4], 32+8*10(%rsp) + mov @acc[5], 32+8*11(%rsp) + + ################################# mul_384(ret->im, t0, t1); + lea 32+8*0(%rsp), $a_ptr # t0 + lea 32+8*6(%rsp), $b_ptr # t1 + call __mulx_384 + + ################################# mul_384(ret->re, a->re, b->re); + mov 8*0(%rsp), $a_ptr + mov 8*1(%rsp), $b_ptr + lea -96($r_ptr), $r_ptr # ret->re + call __mulx_384 + + ################################# mul_384(tx, a->im, b->im); + lea 48+128($a_ptr), $a_ptr + lea 48($b_ptr), $b_ptr + lea 32(%rsp), $r_ptr + call __mulx_384 + + ################################# ret->im -= tx + mov 8*2(%rsp), $a_ptr # restore ret->im + lea 32(%rsp), $b_org + mov 8*3(%rsp), $n_ptr + mov $a_ptr, $r_ptr + call __sub_mod_384x384 + + ################################# ret->im -= ret->re + lea 0($r_ptr), $a_ptr + lea -96($r_ptr), $b_org + call __sub_mod_384x384 + + ################################# ret->re -= tx + lea -96($r_ptr), $a_ptr + lea 32(%rsp), $b_org + lea -96($r_ptr), $r_ptr + call __sub_mod_384x384 + + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_382x,.-mulx_382x +___ +} +{ my @acc=(@acc,"%rax","%rbx","%rbp",$b_org); # all registers are affected + # except for $n_ptr and $r_ptr +$code.=<<___; +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,\@function,3,"unwind" +.align 32 +sqrx_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $a_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + + ################################# t0 = a->re + a->im + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + mov 8*5($a_ptr), @acc[11] + + mov @acc[6], 
@acc[0] + add 8*6($a_ptr), @acc[6] + mov @acc[7], @acc[1] + adc 8*7($a_ptr), @acc[7] + mov @acc[8], @acc[2] + adc 8*8($a_ptr), @acc[8] + mov @acc[9], @acc[3] + adc 8*9($a_ptr), @acc[9] + mov @acc[10], @acc[4] + adc 8*10($a_ptr), @acc[10] + mov @acc[11], @acc[5] + adc 8*11($a_ptr), @acc[11] + + mov @acc[6], 8*0($r_ptr) + mov @acc[7], 8*1($r_ptr) + mov @acc[8], 8*2($r_ptr) + mov @acc[9], 8*3($r_ptr) + mov @acc[10], 8*4($r_ptr) + mov @acc[11], 8*5($r_ptr) + + ################################# t1 = a->re - a->im + lea 48($a_ptr), $b_org + lea 48($r_ptr), $r_ptr + call __sub_mod_384_a_is_loaded + + ################################# mul_384(ret->re, t0, t1); + lea ($r_ptr), $a_ptr + lea -48($r_ptr), $b_ptr + lea -48($r_ptr), $r_ptr + call __mulx_384 + + ################################# mul_384(ret->im, a->re, a->im); + mov (%rsp), $a_ptr + lea 48($a_ptr), $b_ptr + lea 96($r_ptr), $r_ptr + call __mulx_384 + + mov 8*0($r_ptr), @acc[0] # double ret->im + mov 8*1($r_ptr), @acc[1] + mov 8*2($r_ptr), @acc[2] + mov 8*3($r_ptr), @acc[3] + mov 8*4($r_ptr), @acc[4] + mov 8*5($r_ptr), @acc[5] + mov 8*6($r_ptr), @acc[6] + mov 8*7($r_ptr), @acc[7] + mov 8*8($r_ptr), @acc[8] + mov 8*9($r_ptr), @acc[9] + mov 8*10($r_ptr), @acc[10] + add @acc[0], @acc[0] + mov 8*11($r_ptr), @acc[11] + adc @acc[1], @acc[1] + mov @acc[0], 8*0($r_ptr) + adc @acc[2], @acc[2] + mov @acc[1], 8*1($r_ptr) + adc @acc[3], @acc[3] + mov @acc[2], 8*2($r_ptr) + adc @acc[4], @acc[4] + mov @acc[3], 8*3($r_ptr) + adc @acc[5], @acc[5] + mov @acc[4], 8*4($r_ptr) + adc @acc[6], @acc[6] + mov @acc[5], 8*5($r_ptr) + adc @acc[7], @acc[7] + mov @acc[6], 8*6($r_ptr) + adc @acc[8], @acc[8] + mov @acc[7], 8*7($r_ptr) + adc @acc[9], @acc[9] + mov @acc[8], 8*8($r_ptr) + adc @acc[10], @acc[10] + mov @acc[9], 8*9($r_ptr) + adc @acc[11], @acc[11] + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + mov 8*1(%rsp),%r15 +.cfi_restore %r15 + mov 8*2(%rsp),%r14 +.cfi_restore %r14 + mov 8*3(%rsp),%r13 +.cfi_restore %r13 + mov 8*4(%rsp),%r12 +.cfi_restore %r12 + mov 8*5(%rsp),%rbx +.cfi_restore %rbx + mov 8*6(%rsp),%rbp +.cfi_restore %rbp + lea 8*7(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +___ +} +{ ########################################################## 384-bit mulx +my ($a0, $a1) = @acc[6..7]; +my @acc = @acc[0..5]; +my ($lo, $hi, $zr) = ("%rax", "%rcx", "%rbp"); + +$code.=<<___; +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,\@function,3,"unwind" +.align 32 +mulx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + call __mulx_384 + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbx +.cfi_restore %rbx + mov 40(%rsp),%rbp +.cfi_restore %rbp + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,\@abi-omnipotent +.align 32 +__mulx_384: + mov 8*0($b_ptr), %rdx + mov 8*0($a_ptr), $a0 + mov 8*1($a_ptr), $a1 + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + lea -128($a_ptr), $a_ptr + + mulx $a0, @acc[1], $hi + xor $zr, $zr + + mulx $a1, @acc[0], $lo + adcx $hi, @acc[0] + mov @acc[1], 8*0($r_ptr) + + mulx @acc[2], @acc[1], $hi + adcx $lo, 
@acc[1] + + mulx @acc[3], @acc[2], $lo + adcx $hi, @acc[2] + + mulx @acc[4], @acc[3], $hi + adcx $lo, @acc[3] + + mulx @acc[5], @acc[4], @acc[5] + mov 8*1($b_ptr), %rdx + adcx $hi, @acc[4] + adcx $zr, @acc[5] +___ +for(my $i=1; $i<6; $i++) { +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : "%rax"; +$code.=<<___; + mulx $a0, $lo, $hi + adcx @acc[0], $lo + adox $hi, @acc[1] + mov $lo, 8*$i($r_ptr) + + mulx $a1, @acc[0], $hi + adcx @acc[1], $acc[0] + adox $hi, @acc[2] + + mulx 128+8*2($a_ptr), @acc[1], $lo + adcx @acc[2], @acc[1] + adox $lo, @acc[3] + + mulx 128+8*3($a_ptr), @acc[2], $hi + adcx @acc[3], @acc[2] + adox $hi, @acc[4] + + mulx 128+8*4($a_ptr), @acc[3], $lo + adcx @acc[4], @acc[3] + adox @acc[5], $lo + + mulx 128+8*5($a_ptr), @acc[4], @acc[5] + mov $b_next, %rdx + adcx $lo, @acc[4] + adox $zr, @acc[5] + adcx $zr, @acc[5] +___ +} +$code.=<<___; + mov @acc[0], 8*6($r_ptr) + mov @acc[1], 8*7($r_ptr) + mov @acc[2], 8*8($r_ptr) + mov @acc[3], 8*9($r_ptr) + mov @acc[4], 8*10($r_ptr) + mov @acc[5], 8*11($r_ptr) + + ret +.size __mulx_384,.-__mulx_384 +___ +} +{ ########################################################## 384-bit sqrx +$code.=<<___; +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,\@function,2,"unwind" +.align 32 +sqrx_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + push $r_ptr +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + call __sqrx_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_384,.-sqrx_384 +___ +if (0) { +# up to 5% slower than below variant +my @acc=map("%r$_",("no",8..15,"cx","bx")); + push(@acc, $a_ptr); +my ($lo, $hi, $carry)=("%rax", "%rbp", "%rno"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + adc $hi, @acc[5] + adc \$0, @acc[6] + + mulx %rdx, $lo, $hi # a[0]*a[0] + mov @acc[7], %rdx + xor @acc[7], @acc[7] + add @acc[1], @acc[1] # double acc[1] + adc \$0, @acc[7] + add $hi, @acc[1] + adc \$0, @acc[7] + mov $lo, 8*0($r_ptr) + mov @acc[1], 8*1($r_ptr) +___ +($carry, @acc[7]) = (@acc[7], @acc[1]); +$code.=<<___; + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + mulx %rdx, $lo, $hi # a[1]*a[1] + mov @acc[8], %rdx + xor @acc[8], @acc[8] + adox @acc[2], @acc[2] # double acc[2:3] + adcx $carry, $lo # can't carry + adox @acc[3], @acc[3] + adcx $lo, @acc[2] + adox @acc[8], @acc[8] + adcx $hi, @acc[3] + 
adc \$0, @acc[8] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) +___ +($carry,@acc[8])=(@acc[8],$carry); +$code.=<<___; + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + adcx $lo, @acc[7] + adox @acc[8], $hi + adcx $hi, @acc[8] + + mulx %rdx, $lo, $hi # a[2]*a[2] + mov @acc[9], %rdx + xor @acc[9], @acc[9] + adox @acc[4], @acc[4] # double acc[4:5] + adcx $carry, $lo # can't carry + adox @acc[5], @acc[5] + adcx $lo, @acc[4] + adox @acc[9], @acc[9] + adcx $hi, @acc[5] + adc \$0, $acc[9] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) +___ +($carry,@acc[9])=(@acc[9],$carry); +$code.=<<___; + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + mulx %rdx, $lo, $hi + mov @acc[10], %rdx + xor @acc[10], @acc[10] + adox @acc[6], @acc[6] # double acc[6:7] + adcx $carry, $lo # can't carry + adox @acc[7], @acc[7] + adcx $lo, @acc[6] + adox @acc[10], @acc[10] + adcx $hi, @acc[7] + adc \$0, $acc[10] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) +___ +($carry,@acc[10])=(@acc[10],$carry); +$code.=<<___; + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + add $lo, @acc[9] + adc \$0, @acc[10] + + mulx %rdx, $lo, $hi # a[4]*a[4] + mov @acc[11], %rdx + xor @acc[11], @acc[11] + adox @acc[8], @acc[8] # double acc[8:10] + adcx $carry, $lo # can't carry + adox @acc[9], @acc[9] + adcx $lo, @acc[8] + adox @acc[10], @acc[10] + adcx $hi, @acc[9] + adox @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + ######################################### + mulx %rdx, $lo, $hi # a[5]*a[5] + adcx $lo, @acc[10] + adcx $hi, @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} else { +my @acc=map("%r$_",("no",8..15,"cx","bx","bp")); +my ($lo, $hi)=($r_ptr, "%rax"); + +$code.=<<___; +.type __sqrx_384,\@abi-omnipotent +.align 32 +__sqrx_384: + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[9] + mov 8*4($a_ptr), @acc[10] + + ######################################### + mulx @acc[7], @acc[1], $lo # a[1]*a[0] + mov 8*5($a_ptr), @acc[11] + mulx @acc[8], @acc[2], $hi # a[2]*a[0] + add $lo, @acc[2] + mulx @acc[9], @acc[3], $lo # a[3]*a[0] + adc $hi, @acc[3] + mulx @acc[10], @acc[4], $hi # a[4]*a[0] + adc $lo, @acc[4] + mulx @acc[11], @acc[5], @acc[6] # a[5]*a[0] + mov @acc[7], %rdx + adc $hi, @acc[5] + adc \$0, @acc[6] + + ######################################### + xor @acc[7], @acc[7] + mulx @acc[8], $lo, $hi # a[2]*a[1] + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx @acc[9], $lo, $hi # a[3]*a[1] + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx @acc[10], $lo, $hi # a[4]*a[1] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[11], $lo, $hi # a[5]*a[1] + mov @acc[8], %rdx + adcx $lo, @acc[6] + adox @acc[7], $hi + adcx $hi, @acc[7] + + ######################################### + xor @acc[8], @acc[8] + mulx @acc[9], $lo, $hi # a[3]*a[2] + adcx $lo, @acc[5] + adox $hi, @acc[6] + + mulx @acc[10], $lo, $hi # a[4]*a[2] + adcx $lo, @acc[6] + adox $hi, @acc[7] + + mulx @acc[11], $lo, $hi # a[5]*a[2] + mov @acc[9], %rdx + adcx $lo, @acc[7] + adox 
@acc[8], $hi + adcx $hi, @acc[8] + + ######################################### + xor @acc[9], @acc[9] + mulx @acc[10], $lo, $hi # a[4]*a[3] + adcx $lo, @acc[7] + adox $hi, @acc[8] + + mulx @acc[11], $lo, $hi # a[5]*a[3] + mov @acc[10], %rdx + adcx $lo, @acc[8] + adox @acc[9], $hi + adcx $hi, @acc[9] + + ######################################### + mulx @acc[11], $lo, @acc[10] # a[5]*a[4] + mov 8*0($a_ptr), %rdx + add $lo, @acc[9] + mov 8(%rsp), $r_ptr # restore $r_ptr + adc \$0, @acc[10] + + ######################################### double acc[1:10] + xor @acc[11], @acc[11] + adcx @acc[1], @acc[1] + adcx @acc[2], @acc[2] + adcx @acc[3], @acc[3] + adcx @acc[4], @acc[4] + adcx @acc[5], @acc[5] + + ######################################### accumulate a[i]*a[i] + mulx %rdx, %rdx, $hi # a[0]*a[0] + mov %rdx, 8*0($r_ptr) + mov 8*1($a_ptr), %rdx + adox $hi, @acc[1] + mov @acc[1], 8*1($r_ptr) + + mulx %rdx, @acc[1], $hi # a[1]*a[1] + mov 8*2($a_ptr), %rdx + adox @acc[1], @acc[2] + adox $hi, @acc[3] + mov @acc[2], 8*2($r_ptr) + mov @acc[3], 8*3($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[2]*a[2] + mov 8*3($a_ptr), %rdx + adox @acc[1], @acc[4] + adox @acc[2], @acc[5] + adcx @acc[6], @acc[6] + adcx @acc[7], @acc[7] + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[3]*a[3] + mov 8*4($a_ptr), %rdx + adox @acc[1], @acc[6] + adox @acc[2], @acc[7] + adcx @acc[8], @acc[8] + adcx @acc[9], @acc[9] + mov @acc[6], 8*6($r_ptr) + mov @acc[7], 8*7($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[4]*a[4] + mov 8*5($a_ptr), %rdx + adox @acc[1], @acc[8] + adox @acc[2], @acc[9] + adcx @acc[10], @acc[10] + adcx @acc[11], @acc[11] + mov @acc[8], 8*8($r_ptr) + mov @acc[9], 8*9($r_ptr) + + mulx %rdx, @acc[1], @acc[2] # a[5]*a[5] + adox @acc[1], @acc[10] + adox @acc[2], @acc[11] + + mov @acc[10], 8*10($r_ptr) + mov @acc[11], 8*11($r_ptr) + + ret +.size __sqrx_384,.-__sqrx_384 +___ +} + +{ ########################################################## 384-bit redcx_mont +my ($n_ptr, $n0)=($b_ptr, $n_ptr); # arguments are "shifted" +my ($lo, $hi) = ("%rax", "%rbp"); + +$code.=<<___; +######################################################################## +# void redcx_mont_384(uint64_t ret[6], const uint64_t a[12], +# uint64_t m[6], uint64_t n0); +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,\@function,4,"unwind" +.align 32 +redcx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + +######################################################################## +# void fromx_mont_384(uint64_t ret[6], const uint64_t a[6], +# uint64_t m[6], uint64_t n0); +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,\@function,4,"unwind" +.align 32 +fromx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 
+ sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $b_org, $n_ptr + call __mulx_by_1_mont_384 + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[6], %rax + mov @acc[7], %rcx + mov @acc[0], %rdx + mov @acc[1], %rbp + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[7] + mov @acc[2], @acc[5] + sbb 8*2($n_ptr), @acc[0] + sbb 8*3($n_ptr), @acc[1] + sbb 8*4($n_ptr), @acc[2] + mov @acc[3], $a_ptr + sbb 8*5($n_ptr), @acc[3] + + cmovc %rax, @acc[6] + cmovc %rcx, @acc[7] + cmovc %rdx, @acc[0] + mov @acc[6], 8*0($r_ptr) + cmovc %rbp, @acc[1] + mov @acc[7], 8*1($r_ptr) + cmovc @acc[5], @acc[2] + mov @acc[0], 8*2($r_ptr) + cmovc $a_ptr, @acc[3] + mov @acc[1], 8*3($r_ptr) + mov @acc[2], 8*4($r_ptr) + mov @acc[3], 8*5($r_ptr) + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_by_1_mont_384,\@abi-omnipotent +.align 32 +__mulx_by_1_mont_384: + mov 8*0($a_ptr), @acc[0] + mov $n0, %rdx + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] +___ +for (my $i=0; $i<6; $i++) { +$code.=<<___; + imulq @acc[0], %rdx + + ################################# reduction $i + xor @acc[6], @acc[6] # @acc[6]=0, cf=0, of=0 + mulx 8*0($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5($n_ptr), $lo, $hi + mov $n0, %rdx + adcx $lo, @acc[5] + adox @acc[6], $hi + adcx $hi, @acc[6] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + ret +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,\@abi-omnipotent +.align 32 +__redc_tail_mont_384: + add 8*6($a_ptr), @acc[0] # accumulate upper half + mov @acc[0], %rax + adc 8*7($a_ptr), @acc[1] + adc 8*8($a_ptr), @acc[2] + adc 8*9($a_ptr), @acc[3] + mov @acc[1], %rcx + adc 8*10($a_ptr), @acc[4] + adc 8*11($a_ptr), @acc[5] + sbb @acc[6], @acc[6] + + ################################# + # Branch-less conditional acc[0:6] - modulus + + mov @acc[2], %rdx + mov @acc[3], %rbp + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + mov @acc[4], @acc[7] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + mov @acc[5], $a_ptr + sbb 8*5($n_ptr), @acc[5] + sbb \$0, @acc[6] + + cmovc %rax, @acc[0] + cmovc %rcx, @acc[1] + cmovc %rdx, @acc[2] + mov @acc[0], 8*0($r_ptr) + cmovc %rbp, @acc[3] + mov @acc[1], 8*1($r_ptr) + cmovc @acc[7], @acc[4] + mov @acc[2], 8*2($r_ptr) + cmovc $a_ptr, @acc[5] + mov @acc[3], 8*3($r_ptr) + mov @acc[4], 8*4($r_ptr) + mov @acc[5], 8*5($r_ptr) + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 
+.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 0($r_ptr), $a_ptr + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + xor %rax, %rax + mov @acc[0], @acc[7] + add @acc[0], @acc[0] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[0] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + not %rax # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,\@function,3,"unwind" +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$8, %rsp +.cfi_adjust_cfa_offset 8 +.cfi_end_prologue + + mov $a_ptr, $n_ptr + lea 48($r_ptr), $a_ptr # sgn0(a->im) + mov $b_org, $n0 + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + lea 0($r_ptr), $a_ptr # sgn0(a->re) + xor $r_ptr, $r_ptr + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, $r_ptr + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, $r_ptr + + mov @acc[0], 0(%rsp) # a->im is zero or not + not $r_ptr # 2*x > p, which means "negative" + and \$1, @acc[7] + and \$2, $r_ptr + or @acc[7], $r_ptr # pack sign and parity + + call __mulx_by_1_mont_384 + + mov @acc[0], @acc[6] + or @acc[1], @acc[0] + or @acc[2], @acc[0] + or @acc[3], @acc[0] + or @acc[4], @acc[0] + or @acc[5], @acc[0] + + xor %rax, %rax + mov @acc[6], @acc[7] + add @acc[6], @acc[6] + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + adc \$0, %rax + + sub 8*0($n_ptr), @acc[6] + sbb 8*1($n_ptr), @acc[1] + sbb 8*2($n_ptr), @acc[2] + sbb 8*3($n_ptr), @acc[3] + sbb 8*4($n_ptr), @acc[4] + sbb 8*5($n_ptr), @acc[5] + sbb \$0, %rax + + mov 0(%rsp), @acc[6] + + not %rax # 2*x > p, which means "negative" + + test @acc[0], @acc[0] + cmovz $r_ptr, @acc[7] # a->re==0? prty(a->im) : prty(a->re) + + test @acc[6], @acc[6] + cmovnz $r_ptr, %rax # a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and \$1, @acc[7] + and \$2, %rax + or @acc[7], %rax # pack sign and parity + + mov 8(%rsp),%r15 +.cfi_restore %r15 + mov 16(%rsp),%r14 +.cfi_restore %r14 + mov 24(%rsp),%r13 +.cfi_restore %r13 + mov 32(%rsp),%r12 +.cfi_restore %r12 + mov 40(%rsp),%rbx +.cfi_restore %rbx + mov 48(%rsp),%rbp +.cfi_restore %rbp + lea 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 +.cfi_epilogue + ret +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +___ +} } + +{ ########################################################## mulx/sqrx_mont +my @acc = (@acc, "%rax"); +my ($lo,$hi)=("%rdi","%rbp"); + +$code.=<<___; +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,\@function,5,"unwind" +.align 32 +mulx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $b_org, $b_ptr # evacuate from %rdx + mov 8*0($b_org), %rdx + mov 8*0($a_ptr), @acc[6] + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + mov $n0, (%rsp) + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_384,\@abi-omnipotent +.align 32 +__mulx_mont_384: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] + xor @acc[7], @acc[7] + +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 
8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], 16(%rsp) + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx $hi, @acc[7] # cf=0 + adox @acc[8], @acc[7] + adox @acc[8], @acc[8] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx 16(%rsp), $lo # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[0], @acc[6] + adox @acc[0], @acc[7] + adcx @acc[0], @acc[7] + adox @acc[0], @acc[8] + adcx @acc[0], @acc[8] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + mov @acc[2], @acc[0] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + mov @acc[3], $a_ptr + + mulx 8*5+128($n_ptr), $lo, $hi + adcx $lo, @acc[5] + adox $hi, @acc[6] + mov @acc[1], %rdx + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + lea 128($n_ptr), $n_ptr + mov @acc[4], @acc[8] + adc \$0, @acc[7] + + ################################# + # Branch-less conditional acc[1:7] - modulus + + sub 8*0($n_ptr), @acc[1] + sbb 8*1($n_ptr), @acc[2] + mov @acc[5], $lo + sbb 8*2($n_ptr), @acc[3] + sbb 8*3($n_ptr), @acc[4] + sbb 8*4($n_ptr), @acc[5] + mov @acc[6], $hi + sbb 8*5($n_ptr), @acc[6] + sbb \$0, @acc[7] + + cmovnc @acc[1], %rdx + cmovc @acc[0], @acc[2] + cmovc $a_ptr, @acc[3] + cmovnc @acc[4], @acc[8] + mov %rdx, 8*0($b_ptr) + cmovnc @acc[5], $lo + mov @acc[2], 8*1($b_ptr) + cmovnc @acc[6], $hi + mov @acc[3], 8*2($b_ptr) + mov @acc[8], 8*3($b_ptr) + mov $lo, 8*4($b_ptr) + mov $hi, 8*5($b_ptr) + + ret +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +___ +} +$code.=<<___; +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,\@function,4,"unwind" +.align 32 +sqrx_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*3(%rsp), %rsp +.cfi_adjust_cfa_offset 8*3 +.cfi_end_prologue + + mov $n_ptr, $n0 # n0 + lea -128($b_org), $n_ptr # control u-op density + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) + 
mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + lea ($a_ptr), $b_ptr + mov $n0, (%rsp) # n0 + lea -128($a_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 # as fast as dedicated squaring + + mov 8*3(%rsp),%r15 +.cfi_restore %r15 + mov 8*4(%rsp),%r14 +.cfi_restore %r14 + mov 8*5(%rsp),%r13 +.cfi_restore %r13 + mov 8*6(%rsp),%r12 +.cfi_restore %r12 + mov 8*7(%rsp),%rbx +.cfi_restore %rbx + mov 8*8(%rsp),%rbp +.cfi_restore %rbp + lea 8*9(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_384 + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + +.Loop_sqrx_384: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_384 + + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_384 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + lea -128($n_ptr), $n_ptr # control u-op density + + mulx @acc[6],@acc[0],@acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,\@function,6,"unwind" +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + lea -8*5(%rsp), %rsp +.cfi_adjust_cfa_offset 8*5 +.cfi_end_prologue + + mov $b_org, @acc[2] # loop counter + mov 8*0($a_ptr), %rdx + mov 8*1($a_ptr), @acc[7] + mov 8*2($a_ptr), @acc[8] + mov $a_ptr, $b_ptr + mov 8*3($a_ptr), @acc[4] + mov $r_ptr, 8*2(%rsp) # to __mulx_mont_383_nonred + mov 8*4($a_ptr), $lo + mov 8*5($a_ptr), $hi + + mov $n0, (%rsp) + mov %r9, 8*3(%rsp) # 6th, multiplicand argument + movq 8*0(%r9), %xmm2 # prefetch b[0] + lea -128($n_ptr), $n_ptr # control u-op density + +.Loop_sqrx_383: + movd @acc[2]d, %xmm1 + lea -128($b_ptr), $a_ptr # control u-op density + + mulx %rdx, @acc[0], @acc[1] # a[0]*a[0] + call __mulx_mont_383_nonred # omitting full reduction gives ~15% + # in addition-chains + movd %xmm1, @acc[2]d + dec @acc[2]d + jnz .Loop_sqrx_383 + + mov %rdx, @acc[6] + movq %xmm2, %rdx # b[0] + lea -128($b_ptr), $a_ptr # control u-op density + mov 8*3(%rsp), $b_ptr # 6th, multiplicand argument + + mulx @acc[6], 
@acc[0], @acc[1] # a[0]*b[0] + call __mulx_mont_384 + + mov 8*5(%rsp),%r15 +.cfi_restore %r15 + mov 8*6(%rsp),%r14 +.cfi_restore %r14 + mov 8*7(%rsp),%r13 +.cfi_restore %r13 + mov 8*8(%rsp),%r12 +.cfi_restore %r12 + mov 8*9(%rsp),%rbx +.cfi_restore %rbx + mov 8*10(%rsp),%rbp +.cfi_restore %rbp + lea 8*11(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +___ +{ my @acc=@acc; # will be rotated locally + +$code.=<<___; +.type __mulx_mont_383_nonred,\@abi-omnipotent +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + mulx @acc[7], @acc[6], @acc[2] + mulx @acc[8], @acc[7], @acc[3] + add @acc[6], @acc[1] + mulx @acc[4], @acc[8], @acc[4] + adc @acc[7], @acc[2] + mulx $lo, $lo, @acc[5] + adc @acc[8], @acc[3] + mulx $hi, $hi, @acc[6] + mov 8($b_ptr), %rdx + adc $lo, @acc[4] + adc $hi, @acc[5] + adc \$0, @acc[6] +___ +for (my $i=1; $i<6; $i++) { +my $tt = $i==1 ? @acc[7] : $hi; +my $b_next = $i<5 ? 8*($i+1)."($b_ptr)" : @acc[1]; +$code.=<<___; + mov @acc[0], @acc[8] + imulq 8(%rsp), @acc[0] + + ################################# Multiply by b[$i] + xor @acc[7], @acc[7] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($a_ptr), $lo, $hi + adox $lo, @acc[1] + adcx $hi, @acc[2] + + mulx 8*1+128($a_ptr), $lo, $hi + adox $lo, @acc[2] + adcx $hi, @acc[3] + + mulx 8*2+128($a_ptr), $lo, $hi + adox $lo, @acc[3] + adcx $hi, @acc[4] + + mulx 8*3+128($a_ptr), $lo, $hi + adox $lo, @acc[4] + adcx $hi, @acc[5] + + mulx 8*4+128($a_ptr), $lo, $hi + adox $lo, @acc[5] + adcx $hi, @acc[6] + + mulx 8*5+128($a_ptr), $lo, $hi + mov @acc[0], %rdx + adox $lo, @acc[6] + adcx @acc[7], $hi + adox $hi, @acc[7] + + ################################# reduction + xor @acc[0], @acc[0] # acc[0]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[8] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov $b_next, %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adcx @acc[8], @acc[6] + adox @acc[8], @acc[7] + adcx @acc[8], @acc[7] +___ + push(@acc,shift(@acc)); +} +$code.=<<___; + imulq 8(%rsp), %rdx + mov 8*3(%rsp), $b_ptr # restore $r_ptr + + ################################# last reduction + xor @acc[8], @acc[8] # @acc[8]=0, cf=0, of=0 + mulx 8*0+128($n_ptr), $lo, $hi + adcx $lo, @acc[0] # guaranteed to be zero + adox $hi, @acc[1] + + mulx 8*1+128($n_ptr), $lo, $hi + adcx $lo, @acc[1] + adox $hi, @acc[2] + + mulx 8*2+128($n_ptr), $lo, $hi + adcx $lo, @acc[2] + adox $hi, @acc[3] + + mulx 8*3+128($n_ptr), $lo, $hi + adcx $lo, @acc[3] + adox $hi, @acc[4] + + mulx 8*4+128($n_ptr), $lo, $hi + adcx $lo, @acc[4] + adox $hi, @acc[5] + + mulx 8*5+128($n_ptr), $lo, $hi + mov @acc[1], %rdx + adcx $lo, @acc[5] + adox $hi, @acc[6] + adc \$0, @acc[6] + mov @acc[4], @acc[8] + + mov @acc[1], 8*0($b_ptr) + mov @acc[2], 8*1($b_ptr) + mov @acc[3], 8*2($b_ptr) + mov @acc[5], $lo + mov @acc[4], 8*3($b_ptr) + mov @acc[5], 8*4($b_ptr) + mov @acc[6], 8*5($b_ptr) + mov @acc[6], $hi + + ret +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +___ +} } } +{ my $frame = 4*8 + # place for argument off-load + + 2*384/8 + # place for 2 384-bit temporary vectors + 8; # align +my @acc = (@acc,"%rax","%rdx","%rbx","%rbp"); + +# omitting 3 reductions 
gives ~10% better performance in add-chains +$code.=<<___; +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,\@function,4,"unwind" +.align 32 +sqrx_mont_382x: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$$frame, %rsp +.cfi_adjust_cfa_offset $frame +.cfi_end_prologue + + mov $n_ptr, 8*0(%rsp) # n0 + mov $b_org, $n_ptr # n_ptr + mov $r_ptr, 8*2(%rsp) + mov $a_ptr, 8*3(%rsp) + + ################################# + mov 8*0($a_ptr), @acc[0] # a->re + mov 8*1($a_ptr), @acc[1] + mov 8*2($a_ptr), @acc[2] + mov 8*3($a_ptr), @acc[3] + mov 8*4($a_ptr), @acc[4] + mov 8*5($a_ptr), @acc[5] + + mov @acc[0], @acc[6] + add 8*6($a_ptr), @acc[0] # a->re + a->im + mov @acc[1], @acc[7] + adc 8*7($a_ptr), @acc[1] + mov @acc[2], @acc[8] + adc 8*8($a_ptr), @acc[2] + mov @acc[3], @acc[9] + adc 8*9($a_ptr), @acc[3] + mov @acc[4], @acc[10] + adc 8*10($a_ptr), @acc[4] + mov @acc[5], @acc[11] + adc 8*11($a_ptr), @acc[5] + + sub 8*6($a_ptr), @acc[6] # a->re - a->im + sbb 8*7($a_ptr), @acc[7] + sbb 8*8($a_ptr), @acc[8] + sbb 8*9($a_ptr), @acc[9] + sbb 8*10($a_ptr), @acc[10] + sbb 8*11($a_ptr), @acc[11] + sbb $r_ptr, $r_ptr # borrow flag as mask + + mov @acc[0], 32+8*0(%rsp) # t0 + mov @acc[1], 32+8*1(%rsp) + mov @acc[2], 32+8*2(%rsp) + mov @acc[3], 32+8*3(%rsp) + mov @acc[4], 32+8*4(%rsp) + mov @acc[5], 32+8*5(%rsp) + + mov @acc[6], 32+8*6(%rsp) # t1 + mov @acc[7], 32+8*7(%rsp) + mov @acc[8], 32+8*8(%rsp) + mov @acc[9], 32+8*9(%rsp) + mov @acc[10], 32+8*10(%rsp) + mov @acc[11], 32+8*11(%rsp) + mov $r_ptr, 32+8*12(%rsp) + + ################################# mul_mont_384(ret->im, a->re, a->im, mod, n0); + #mov 8*3(%rsp), $a_ptr # a->re + lea 48($a_ptr), $b_ptr # a->im + + mov 48($a_ptr), %rdx + mov 8*0($a_ptr), %r14 # @acc[6] + mov 8*1($a_ptr), %r15 # @acc[7] + mov 8*2($a_ptr), %rax # @acc[8] + mov 8*3($a_ptr), %r12 # @acc[4] + mov 8*4($a_ptr), %rdi # $lo + mov 8*5($a_ptr), %rbp # $hi + lea -128($a_ptr), $a_ptr # control u-op density + lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + add @acc[0], @acc[0] # add with itself + adc @acc[1], @acc[1] + adc @acc[2], @acc[2] + adc @acc[3], @acc[3] + adc @acc[4], @acc[4] + adc @acc[5], @acc[5] + + mov @acc[0], 8*6($b_ptr) # ret->im + mov @acc[1], 8*7($b_ptr) + mov @acc[2], 8*8($b_ptr) + mov @acc[3], 8*9($b_ptr) + mov @acc[4], 8*10($b_ptr) + mov @acc[5], 8*11($b_ptr) +___ +} +$code.=<<___; + ################################# mul_mont_384(ret->re, t0, t1, mod, n0); + lea 32-128(%rsp), $a_ptr # t0 [+u-op density] + lea 32+8*6(%rsp), $b_ptr # t1 + + mov 32+8*6(%rsp), %rdx # t1[0] + mov 32+8*0(%rsp), %r14 # @acc[6] + mov 32+8*1(%rsp), %r15 # @acc[7] + mov 32+8*2(%rsp), %rax # @acc[8] + mov 32+8*3(%rsp), %r12 # @acc[4] + mov 32+8*4(%rsp), %rdi # $lo + mov 32+8*5(%rsp), %rbp # $hi + #lea -128($a_ptr), $a_ptr # control u-op density + #lea -128($n_ptr), $n_ptr # control u-op density + + mulx %r14, %r8, %r9 + call __mulx_mont_383_nonred +___ +{ +my @acc = map("%r$_","dx",15,"ax",12,"di","bp", # output from __mulx_mont_384 + 8..11,13,14); +$code.=<<___; + mov 32+8*12(%rsp), @acc[11] # account for sign from a->re - a->im + lea 128($n_ptr), $n_ptr + mov 32+8*0(%rsp), @acc[6] + and @acc[11], @acc[6] + mov 32+8*1(%rsp), @acc[7] + and @acc[11], @acc[7] + mov 
32+8*2(%rsp), @acc[8] + and @acc[11], @acc[8] + mov 32+8*3(%rsp), @acc[9] + and @acc[11], @acc[9] + mov 32+8*4(%rsp), @acc[10] + and @acc[11], @acc[10] + and 32+8*5(%rsp), @acc[11] + + sub @acc[6], @acc[0] + mov 8*0($n_ptr), @acc[6] + sbb @acc[7], @acc[1] + mov 8*1($n_ptr), @acc[7] + sbb @acc[8], @acc[2] + mov 8*2($n_ptr), @acc[8] + sbb @acc[9], @acc[3] + mov 8*3($n_ptr), @acc[9] + sbb @acc[10], @acc[4] + mov 8*4($n_ptr), @acc[10] + sbb @acc[11], @acc[5] + sbb @acc[11], @acc[11] + + and @acc[11], @acc[6] + and @acc[11], @acc[7] + and @acc[11], @acc[8] + and @acc[11], @acc[9] + and @acc[11], @acc[10] + and 8*5($n_ptr), @acc[11] + + add @acc[6], @acc[0] + adc @acc[7], @acc[1] + adc @acc[8], @acc[2] + adc @acc[9], @acc[3] + adc @acc[10], @acc[4] + adc @acc[11], @acc[5] + + mov @acc[0], 8*0($b_ptr) # ret->re + mov @acc[1], 8*1($b_ptr) + mov @acc[2], 8*2($b_ptr) + mov @acc[3], 8*3($b_ptr) + mov @acc[4], 8*4($b_ptr) + mov @acc[5], 8*5($b_ptr) +___ +} +$code.=<<___; + lea $frame(%rsp), %r8 # size optimization + mov 8*0(%r8),%r15 +.cfi_restore %r15 + mov 8*1(%r8),%r14 +.cfi_restore %r14 + mov 8*2(%r8),%r13 +.cfi_restore %r13 + mov 8*3(%r8),%r12 +.cfi_restore %r12 + mov 8*4(%r8),%rbx +.cfi_restore %rbx + mov 8*5(%r8),%rbp +.cfi_restore %rbp + lea 8*6(%r8),%rsp +.cfi_adjust_cfa_offset -$frame-8*6 +.cfi_epilogue + ret +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x +___ +} + +print $code; +close STDOUT; diff --git a/blst/asm/sha256-armv8.pl b/blst/asm/sha256-armv8.pl new file mode 100755 index 0000000..1de27c7 --- /dev/null +++ b/blst/asm/sha256-armv8.pl @@ -0,0 +1,541 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for ARMv8. +# +# This module is stripped of scalar code paths, with raionale that all +# known processors are NEON-capable. +# +# See original module at CRYPTOGAMS for further details. 
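+#
+# For orientation, the scalar step that both code paths below implement is
+# summarized here as a portable C sketch (illustrative names, not code this
+# script emits). The rotation counts are the @Sigma0/@Sigma1/@sigma0/@sigma1
+# constants defined below; in the hardware path the sha256su0/sha256su1 pair
+# advances four schedule words and the sha256h/sha256h2 pair advances the
+# state by four rounds at a time.
+#
+#   #include <stdint.h>
+#
+#   static inline uint32_t rotr(uint32_t x, int n)
+#   {   return (x >> n) | (x << (32 - n));   }
+#
+#   /* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], t >= 16;
+#      the caller keeps a 16-word ring buffer and stores the result at W[t&15] */
+#   static uint32_t sched(const uint32_t W[16], int t)
+#   {
+#       uint32_t w15 = W[(t-15)&15], w2 = W[(t-2)&15];
+#       uint32_t s0 = rotr(w15, 7) ^ rotr(w15, 18) ^ (w15 >> 3);
+#       uint32_t s1 = rotr(w2, 17) ^ rotr(w2, 19)  ^ (w2 >> 10);
+#       return W[(t-16)&15] + s0 + W[(t-7)&15] + s1;
+#   }
+#
+#   /* one round: S[] holds a..h, Kt is the round constant, Wt the schedule word */
+#   static void round1(uint32_t S[8], uint32_t Kt, uint32_t Wt)
+#   {
+#       uint32_t a=S[0], b=S[1], c=S[2], d=S[3], e=S[4], f=S[5], g=S[6], h=S[7];
+#       uint32_t T1 = h + (rotr(e,6)^rotr(e,11)^rotr(e,25)) + ((e&f)^(~e&g)) + Kt + Wt;
+#       uint32_t T2 = (rotr(a,2)^rotr(a,13)^rotr(a,22)) + ((a&b)^(a&c)^(b&c));
+#       S[7]=g; S[6]=f; S[5]=e; S[4]=d+T1; S[3]=c; S[2]=b; S[1]=a; S[0]=T1+T2;
+#   }
+#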
+ +$flavour = shift; +$output = shift; + +if ($flavour && $flavour ne "void") { + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + + open STDOUT,"| \"$^X\" $xlate $flavour $output"; +} else { + open STDOUT,">$output"; +} + +$BITS=256; +$SZ=4; +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; +$reg_t="w"; +$pre="blst_"; + +($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); + +$code.=<<___; +.text + +.align 6 +.type .LK$BITS,%object +.LK$BITS: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0 //terminator +.size .LK$BITS,.-.LK$BITS +.asciz "SHA$BITS block transform for ARMv8, CRYPTOGAMS by \@dot-asm" +.align 2 +___ + +if ($SZ==4) { +my $Ktbl="x3"; + +my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); +my @MSG=map("v$_.16b",(4..7)); +my ($W0,$W1)=("v16.4s","v17.4s"); +my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b"); + +$code.=<<___; +.globl ${pre}sha256_block_armv8 +.type ${pre}sha256_block_armv8,%function +.align 6 +${pre}sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1.32 {$ABCD,$EFGH},[$ctx] + adr $Ktbl,.LK256 + +.Loop_hw: + ld1 {@MSG[0]-@MSG[3]},[$inp],#64 + sub $num,$num,#1 + ld1.32 {$W0},[$Ktbl],#16 + rev32 @MSG[0],@MSG[0] + rev32 @MSG[1],@MSG[1] + rev32 @MSG[2],@MSG[2] + rev32 @MSG[3],@MSG[3] + orr $ABCD_SAVE,$ABCD,$ABCD // offload + orr $EFGH_SAVE,$EFGH,$EFGH +___ +for($i=0;$i<12;$i++) { +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + sha256su0 @MSG[0],@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + sha256su1 @MSG[0],@MSG[2],@MSG[3] +___ + ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG)); +} +$code.=<<___; + ld1.32 {$W1},[$Ktbl],#16 + add.i32 $W0,$W0,@MSG[0] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + ld1.32 {$W0},[$Ktbl],#16 + add.i32 $W1,$W1,@MSG[1] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + ld1.32 {$W1},[$Ktbl] + add.i32 $W0,$W0,@MSG[2] + sub $Ktbl,$Ktbl,#$rounds*$SZ-16 // rewind + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W0 + sha256h2 $EFGH,$abcd,$W0 + + add.i32 $W1,$W1,@MSG[3] + orr $abcd,$ABCD,$ABCD + sha256h $ABCD,$EFGH,$W1 + sha256h2 $EFGH,$abcd,$W1 + + add.i32 $ABCD,$ABCD,$ABCD_SAVE + add.i32 $EFGH,$EFGH,$EFGH_SAVE + + cbnz $num,.Loop_hw + + st1.32 {$ABCD,$EFGH},[$ctx] + + ldr x29,[sp],#16 + ret +.size ${pre}sha256_block_armv8,.-${pre}sha256_block_armv8 +___ +} + +if ($SZ==4) { ######################################### NEON stuff # +# You'll surely note a lot of similarities with sha256-armv4 module, +# and of course it's not a coincidence. 
sha256-armv4 was used as +# initial template, but was adapted for ARMv8 instruction set and +# extensively re-tuned for all-round performance. + +my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); +my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); +my $Ktbl="x16"; +my $Xfer="x17"; +my @X = map("q$_",(0..3)); +my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); +my $j=0; + +sub AUTOLOAD() # thunk [simplified] x86-style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; + my $arg = pop; + $arg = "#$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; +} + +sub Dscalar { shift =~ m|[qv]([0-9]+)|?"d$1":""; } +sub Dlo { shift =~ m|[qv]([0-9]+)|?"v$1.d[0]":""; } +sub Dhi { shift =~ m|[qv]([0-9]+)|?"v$1.d[1]":""; } + +sub Xupdate() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + &ext_8 ($T0,@X[0],@X[1],4); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ext_8 ($T3,@X[2],@X[3],4); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dscalar($T7),&Dhi(@X[3])); # X[14..15] + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T2,$T0,$sigma0[0]); + eval(shift(@insns)); + &ushr_32 ($T1,$T0,$sigma0[2]); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T3); # X[0..3] += X[9..12] + eval(shift(@insns)); + &sli_32 ($T2,$T0,32-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T0,$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T2); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T3,$T0,32-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T4,$T7,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T1,$T1,$T3); # sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T4,$T7,32-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T5,$T7,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T3,$T7,$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); + eval(shift(@insns)); + &sli_u32 ($T3,$T7,32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T4); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T3); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[0..1] += sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &ushr_32 ($T6,@X[0],$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T7,@X[0],$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T6,@X[0],32-$sigma1[0]); + eval(shift(@insns)); + &ushr_32 ($T5,@X[0],$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T6); + eval(shift(@insns)); + eval(shift(@insns)); + &sli_32 ($T5,@X[0],32-$sigma1[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl], #16"); + eval(shift(@insns)); + &eor_8 ($T7,$T7,$T5); # sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + &eor_8 ($T5,$T5,$T5); + eval(shift(@insns)); + eval(shift(@insns)); + &mov (&Dhi($T5), &Dlo($T7)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 (@X[0],@X[0],$T5); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + while($#insns>=1) 
{ eval(shift(@insns)); } + &st1_32 ("{$T0}","[$Xfer], #16"); + eval(shift(@insns)); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub Xpreload() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); + my ($a,$b,$c,$d,$e,$f,$g,$h); + + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_8 ("{@X[0]}","[$inp],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + &ld1_32 ("{$T0}","[$Ktbl],#16"); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &rev32 (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &add_32 ($T0,$T0,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &st1_32 ("{$T0}","[$Xfer], #16"); + + push(@X,shift(@X)); # "rotate" X[] +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'. + '&add ($h,$h,$t1)', # h+=X[i]+K[i] + '&add ($a,$a,$t4);'. # h+=Sigma0(a) from the past + '&and ($t1,$f,$e)', + '&bic ($t4,$g,$e)', + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))', + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past + '&orr ($t1,$t1,$t4)', # Ch(e,f,g) + '&eor ($t0,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e) + '&eor ($t4,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))', + '&add ($h,$h,$t1)', # h+=Ch(e,f,g) + '&ror ($t0,$t0,"#$Sigma1[0]")', + '&eor ($t2,$a,$b)', # a^b, b^c in next round + '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) + '&add ($h,$h,$t0)', # h+=Sigma1(e) + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. + '&ldr ($t1,"[$Ktbl]") if ($j==15);'. + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) + '&ror ($t4,$t4,"#$Sigma0[0]")', + '&add ($d,$d,$h)', # d+=h + '&eor ($t3,$t3,$b)', # Maj(a,b,c) + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);' + ) +} + +$code.=<<___; +.globl ${pre}sha256_block_data_order +.type ${pre}sha256_block_data_order,%function +.align 4 +${pre}sha256_block_data_order: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr $Ktbl,.LK256 + add $num,$inp,$num,lsl#6 // len to point at the end of inp + + ld1.8 {@X[0]},[$inp], #16 + ld1.8 {@X[1]},[$inp], #16 + ld1.8 {@X[2]},[$inp], #16 + ld1.8 {@X[3]},[$inp], #16 + ld1.32 {$T0},[$Ktbl], #16 + ld1.32 {$T1},[$Ktbl], #16 + ld1.32 {$T2},[$Ktbl], #16 + ld1.32 {$T3},[$Ktbl], #16 + rev32 @X[0],@X[0] // yes, even on + rev32 @X[1],@X[1] // big-endian + rev32 @X[2],@X[2] + rev32 @X[3],@X[3] + mov $Xfer,sp + add.32 $T0,$T0,@X[0] + add.32 $T1,$T1,@X[1] + add.32 $T2,$T2,@X[2] + st1.32 {$T0-$T1},[$Xfer], #32 + add.32 $T3,$T3,@X[3] + st1.32 {$T2-$T3},[$Xfer] + sub $Xfer,$Xfer,#32 + + ldp $A,$B,[$ctx] + ldp $C,$D,[$ctx,#8] + ldp $E,$F,[$ctx,#16] + ldp $G,$H,[$ctx,#24] + ldr $t1,[sp,#0] + mov $t2,wzr + eor $t3,$B,$C + mov $t4,wzr + b .L_00_48 + +.align 4 +.L_00_48: +___ + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); + &Xupdate(\&body_00_15); +$code.=<<___; + cmp $t1,#0 // check for K256 terminator + ldr $t1,[sp,#0] + sub $Xfer,$Xfer,#64 + bne .L_00_48 + + sub $Ktbl,$Ktbl,#256 // rewind $Ktbl + cmp $inp,$num + mov $Xfer, #64 + csel $Xfer, $Xfer, xzr, eq + sub $inp,$inp,$Xfer // avoid SEGV + mov $Xfer,sp +___ + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); + &Xpreload(\&body_00_15); +$code.=<<___; + add $A,$A,$t4 // h+=Sigma0(a) from the past + ldp $t0,$t1,[$ctx,#0] + add $A,$A,$t2 // h+=Maj(a,b,c) from the past + ldp $t2,$t3,[$ctx,#8] + add $A,$A,$t0 // accumulate + add $B,$B,$t1 + ldp $t0,$t1,[$ctx,#16] + add $C,$C,$t2 + add $D,$D,$t3 + ldp $t2,$t3,[$ctx,#24] + add $E,$E,$t0 + add $F,$F,$t1 + ldr $t1,[sp,#0] + stp $A,$B,[$ctx,#0] + add $G,$G,$t2 + mov $t2,wzr + stp $C,$D,[$ctx,#8] + add $H,$H,$t3 + stp $E,$F,[$ctx,#16] + eor $t3,$B,$C + stp $G,$H,[$ctx,#24] + mov $t4,wzr + mov $Xfer,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size ${pre}sha256_block_data_order,.-${pre}sha256_block_data_order +___ +} + +{ +my ($out,$inp,$len) = map("x$_",(0..2)); + +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,%function +.align 4 +${pre}sha256_emit: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[$out,#4] + lsr x4,x4,#32 + str w5,[$out,#12] + lsr x5,x5,#32 + str w6,[$out,#20] + lsr x6,x6,#32 + str w7,[$out,#28] + lsr x7,x7,#32 + str w4,[$out,#0] + str w5,[$out,#8] + str w6,[$out,#16] + str w7,[$out,#24] + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,%function +.align 4 +${pre}sha256_bcopy: +.Loop_bcopy: + ldrb w3,[$inp],#1 + sub $len,$len,#1 + strb w3,[$out],#1 + cbnz $len,.Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,%function +.align 4 +${pre}sha256_hcopy: + ldp x4,x5,[$inp] + ldp x6,x7,[$inp,#16] + stp x4,x5,[$out] + stp x6,x7,[$out,#16] + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +{ my %opcode = ( + "sha256h" => 0x5e004000, "sha256h2" => 0x5e005000, + "sha256su0" => 0x5e282800, "sha256su1" => 0x5e006000 ); + + sub unsha256 { + my ($mnemonic,$arg)=@_; + + $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o + && + sprintf ".inst\t0x%08x\t//%s %s", + $opcode{$mnemonic}|$1|($2<<5)|($3<<16), + $mnemonic,$arg; + } +} + +open SELF,$0; +while() { + next if (/^#!/); + last if (!s/^#/\/\// and !/^$/); + print; +} +close SELF; + 
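+# The loop below post-processes $code: sha256* mnemonics are rewritten by
+# unsha256() above into raw .inst words, and q registers are renamed to the
+# v form. As a sanity reference, the packing unsha256() performs is restated
+# here in C (illustrative, not part of the build; base opcodes are the values
+# from the %opcode table above):
+#
+#   #include <stdint.h>
+#   #include <stdio.h>
+#
+#   /* same packing as unsha256(): base | Rd | (Rn << 5) | (Rm << 16) */
+#   static uint32_t crypto_inst(uint32_t base, unsigned rd, unsigned rn, unsigned rm)
+#   {   return base | rd | (rn << 5) | (rm << 16);   }
+#
+#   int main(void)
+#   {   /* e.g. "sha256h v2,v3,v4" with base 0x5e004000 from %opcode */
+#       printf(".inst 0x%08x\n", (unsigned)crypto_inst(0x5e004000, 2, 3, 4));
+#       return 0;   /* prints .inst 0x5e044062 */
+#   }
+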
+foreach(split("\n",$code)) { + + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sha512\w+)\s+([qv].*)/unsha512($1,$2)/ge or + s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge; + + s/\bq([0-9]+)\b/v$1.16b/g; # old->new registers + + s/\.[ui]?8(\s)/$1/; + s/\.\w?64\b// and s/\.16b/\.2d/g or + s/\.\w?32\b// and s/\.16b/\.4s/g; + m/\bext\b/ and s/\.2d/\.16b/g or + m/(ld|st)1[^\[]+\[0\]/ and s/\.4s/\.s/g; + + print $_,"\n"; +} + +close STDOUT; diff --git a/blst/asm/sha256-portable-x86_64.pl b/blst/asm/sha256-portable-x86_64.pl new file mode 100755 index 0000000..eca0564 --- /dev/null +++ b/blst/asm/sha256-portable-x86_64.pl @@ -0,0 +1,337 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# Scalar-only version with minor twist minimizing 'lea' instructions. + +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +sub ROUND_00_15() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + # $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); + +$code.=<<___; + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 + mov $f,$a2 + + xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $g,$a2 # f^g + + mov $T1,`$SZ*($i&0xf)`(%rsp) + xor $a,$a1 + and $e,$a2 # (f^g)&e + + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g + + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 + add $a2,$T1 # T1+=Ch(e,f,g) + + mov $a,$a2 + add `$SZ*$i`($Tbl),$T1 # T1+=K[round] + xor $a,$a1 + + xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) + mov $b,$h + + and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + + xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) + add $T1,$d # d+=T1 + add $T1,$h # h+=T1 +___ +$code.=<<___ if ($i==31); + lea `16*$SZ`($Tbl),$Tbl # round+=16 +___ +$code.=<<___ if ($i<15); + add $a1,$h # h+=Sigma0(a) +___ + ($a2,$a3) = ($a3,$a2); +} + +sub ROUND_16_XX() +{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + +$code.=<<___; + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 + + mov $a0,$T1 + ror \$`$sigma0[1]-$sigma0[0]`,$a0 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + + xor $T1,$a0 + shr \$$sigma0[2],$T1 + ror \$$sigma0[0],$a0 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + + ror \$$sigma1[0],$a2 + 
xor $a0,$T1 # sigma0(X[(i+1)&0xf]) + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) + add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + + add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a2,$T1 + mov $a,$a1 +___ + &ROUND_00_15(@_); +} + +$code=<<___; +.text + +.globl $func +.type $func,\@function,3,"unwind" +.align 16 +$func: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg +.cfi_end_prologue + + mov $SZ*0($ctx),$A + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H + jmp .Lloop + +.align 16 +.Lloop: + mov $B,$a3 + lea $TABLE(%rip),$Tbl + xor $C,$a3 # magic +___ + for($i=0;$i<16;$i++) { + $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; + $code.=" bswap $T1\n"; + &ROUND_00_15($i,@ROT); + unshift(@ROT,pop(@ROT)); + } +$code.=<<___; + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: +___ + for(;$i<32;$i++) { + &ROUND_16_XX($i,@ROT); + unshift(@ROT,pop(@ROT)); + } + +$code.=<<___; + cmpb \$0x19,`$SZ-1`($Tbl) + jnz .Lrounds_16_xx + + mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) + lea 16*$SZ($inp),$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop + + lea $framesz+6*8(%rsp),%r11 +.cfi_def_cfa %r11,8 + mov $framesz(%rsp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbp +.cfi_restore %rbp + mov -8(%r11),%rbx +.cfi_restore %rbx +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size $func,.-$func + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + print $_,"\n"; +} +close STDOUT; diff --git a/blst/asm/sha256-x86_64.pl b/blst/asm/sha256-x86_64.pl new file mode 100755 index 0000000..22b3763 --- /dev/null +++ b/blst/asm/sha256-x86_64.pl @@ -0,0 +1,789 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# ==================================================================== +# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +# project. +# ==================================================================== +# +# sha256_block procedure for x86_64. +# +# This module is stripped of AVX and even scalar code paths, with +# raionale that +# +# a) AVX1 is [justifiably] faster than SSSE3 code path only on *one* +# processor, venerable Sandy Bridge; +# b) AVX2 incurs costly power transitions, which would be justifiable +# if AVX2 code was executing most of the time, which is not the +# case in the context; +# c) all comtemporary processors support SSSE3, so that nobody would +# actually use scalar code path anyway; +# +# See original module at CRYPTOGAMS for further details. 
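+#
+# Besides the block transform, this module (like the portable one above) also
+# emits three small helpers. sha256_emit converts the eight 32-bit state words
+# into the big-endian digest bytes; a portable C sketch of what it computes
+# (illustrative name, assuming the state is stored as native little-endian
+# 32-bit words, which is what the 64-bit loads and bswaps in the generated
+# code imply):
+#
+#   #include <stdint.h>
+#
+#   static void emit_sketch(unsigned char md[32], const uint32_t h[8])
+#   {
+#       for (int i = 0; i < 8; i++) {
+#           md[4*i + 0] = (unsigned char)(h[i] >> 24);
+#           md[4*i + 1] = (unsigned char)(h[i] >> 16);
+#           md[4*i + 2] = (unsigned char)(h[i] >>  8);
+#           md[4*i + 3] = (unsigned char)(h[i]);
+#       }
+#   }
+#
+# sha256_bcopy is a plain byte-wise copy and sha256_hcopy copies the 32-byte
+# state as four 64-bit words.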
+ +$flavour = shift; +$output = pop; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; + +$pre="blst_"; +$func="${pre}sha256_block_data_order"; +$TABLE="K256"; +$SZ=4; +@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", + "%r8d","%r9d","%r10d","%r11d"); +($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); +@Sigma0=( 2,13,22); +@Sigma1=( 6,11,25); +@sigma0=( 7,18, 3); +@sigma1=(17,19,10); +$rounds=64; + +$ctx="%rdi"; # 1st arg, zapped by $a3 +$inp="%rsi"; # 2nd arg +$Tbl="%rbp"; + +$_ctx="16*$SZ+0*8(%rsp)"; +$_inp="16*$SZ+1*8(%rsp)"; +$_end="16*$SZ+2*8(%rsp)"; +$framesz="16*$SZ+3*8"; + +$code=<<___; +.text + +.align 64 +.type $TABLE,\@object +$TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by \@dot-asm" +___ + +###################################################################### +# SIMD code paths +# +{{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. 
+# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.globl ${pre}sha256_block_data_order_shaext +.hidden ${pre}sha256_block_data_order_shaext +.type ${pre}sha256_block_data_order_shaext,\@function,3,"unwind" +.align 64 +${pre}sha256_block_data_order_shaext: +.cfi_startproc +___ +$code.=<<___ if ($win64); + sub \$0x58,%rsp +.cfi_adjust_cfa_offset 0x58 + movaps %xmm6,-0x58(%r11) +.cfi_offset %xmm6,-0x60 + movaps %xmm7,-0x48(%r11) +.cfi_offset %xmm7,-0x50 + movaps %xmm8,-0x38(%r11) +.cfi_offset %xmm8,-0x40 + movaps %xmm9,-0x28(%r11) +.cfi_offset %xmm9,-0x30 + movaps %xmm10,-0x18(%r11) +.cfi_offset %xmm10,-0x20 +.cfi_end_prologue +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x100-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*16-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr \$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... 
+ pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*16-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*16-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*16-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -0x58(%r11),%xmm6 + movaps -0x48(%r11),%xmm7 + movaps -0x38(%r11),%xmm8 + movaps -0x28(%r11),%xmm9 + movaps -0x18(%r11),%xmm10 + mov %r11,%rsp +.cfi_def_cfa %r11,8 +.cfi_epilogue +___ +$code.=<<___; + ret +.cfi_endproc +.size ${pre}sha256_block_data_order_shaext,.-${pre}sha256_block_data_order_shaext +___ +}}} +{{{ + +my $a4=$T1; +my ($a,$b,$c,$d,$e,$f,$g,$h); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub body_00_15 () { + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&ror ($a0,$Sigma1[2]-$Sigma1[1])', + '&mov ($a,$a1)', + '&mov ($a4,$f)', + + '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', + '&xor ($a4,$g)', # f^g + + '&ror ($a0,$Sigma1[1]-$Sigma1[0])', + '&xor ($a1,$a)', + '&and ($a4,$e)', # (f^g)&e + + '&xor ($a0,$e)', + '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] + '&mov ($a2,$a)', + + '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', + '&xor ($a2,$b)', # a^b, b^c in next round + + '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) + '&and ($a3,$a2)', # (b^c)&(a^b) + + '&xor ($a1,$a)', + '&add ($h,$a0)', # h+=Sigma1(e) + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + + '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h + '&add ($h,$a3)', # h+=Maj(a,b,c) + + '&mov ($a0,$d)', + '&add ($a1,$h);'. 
# h+=Sigma0(a) + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); +} + +###################################################################### +# SSSE3 code path +# +{ +my $Tbl = $inp; +my $_ctx="0(%rbp)"; +my $_inp="8(%rbp)"; +my $_end="16(%rbp)"; +my $framesz=4*8+$win64*16*4+8; + +my @X = map("%xmm$_",(0..3)); +my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); + +$code.=<<___; +.globl ${func} +.hidden ${func} +.type ${func},\@function,3,"unwind" +.align 64 +${func}: +.cfi_startproc + push %rbp +.cfi_push %rbp + push %rbx +.cfi_push %rbx + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + shl \$4,%rdx # num*16 + sub \$$framesz,%rsp +.cfi_adjust_cfa_offset $framesz + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + mov $ctx,0(%rsp) # save ctx, 1st arg + #mov $inp,8(%rsp) # save inp, 2nd arg + mov %rdx,16(%rsp) # save end pointer, "3rd" arg +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x20(%rsp) +.cfi_offset %xmm6,-0x78 + movaps %xmm7,0x30(%rsp) +.cfi_offset %xmm7,-0x68 + movaps %xmm8,0x40(%rsp) +.cfi_offset %xmm8,-0x58 + movaps %xmm9,0x50(%rsp) +.cfi_offset %xmm9,-0x48 +___ +$code.=<<___; + mov %rsp,%rbp +.cfi_def_cfa_register %rbp +.cfi_end_prologue + + lea -16*$SZ(%rsp),%rsp + mov $SZ*0($ctx),$A + and \$-64,%rsp # align stack + mov $SZ*1($ctx),$B + mov $SZ*2($ctx),$C + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + +$code.=<<___; + #movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*$rounds`+64(%rip),$t5 + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + mov $inp,$_inp # offload $inp + movdqu 0x00($inp),@X[0] + movdqu 0x10($inp),@X[1] + movdqu 0x20($inp),@X[2] + pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] + lea $TABLE(%rip),$Tbl + pshufb $t3,@X[1] + movdqa 0x00($Tbl),$t0 + movdqa 0x10($Tbl),$t1 + pshufb $t3,@X[2] + paddd @X[0],$t0 + movdqa 0x20($Tbl),$t2 + pshufb $t3,@X[3] + movdqa 0x30($Tbl),$t3 + paddd @X[1],$t1 + paddd @X[2],$t2 + paddd @X[3],$t3 + movdqa $t0,0x00(%rsp) + mov $A,$a1 + movdqa $t1,0x10(%rsp) + mov $B,$a3 + movdqa $t2,0x20(%rsp) + xor $C,$a3 # magic + movdqa $t3,0x30(%rsp) + mov $E,$a0 + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + sub \$`-16*$SZ`,$Tbl # size optimization +___ +sub Xupdate_256_SSSE3 () { + ( + '&movdqa ($t0,@X[1]);', + '&movdqa ($t3,@X[3])', + '&palignr ($t0,@X[0],$SZ)', # X[1..4] + '&palignr ($t3,@X[2],$SZ);', # X[9..12] + '&movdqa ($t1,$t0)', + '&movdqa ($t2,$t0);', + '&psrld ($t0,$sigma0[2])', + '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] + '&psrld ($t2,$sigma0[0])', + '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] + '&pslld ($t1,8*$SZ-$sigma0[1]);'. + '&pxor ($t0,$t2)', + '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. + '&pxor ($t0,$t1)', + '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
+ '&pxor ($t0,$t2);', + '&movdqa ($t2,$t3)', + '&pxor ($t0,$t1);', # sigma0(X[1..4]) + '&psrld ($t3,$sigma1[2])', + '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2)', + '&pshufb ($t3,$t4)', # sigma1(X[14..15]) + '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) + '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] + '&movdqa ($t2,$t3);', + '&psrld ($t3,$sigma1[2])', + '&psrlq ($t2,$sigma1[0])', + '&pxor ($t3,$t2);', + '&psrlq ($t2,$sigma1[1]-$sigma1[0])', + '&pxor ($t3,$t2);', + '&movdqa ($t2,16*$j."($Tbl)")', + '&pshufb ($t3,$t5)', + '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) + ); +} + +sub SSSE3_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 104 instructions + + if (0) { + foreach (Xupdate_256_SSSE3()) { # 36 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } else { # squeeze extra 4% on Westmere and 19% on Atom + eval(shift(@insns)); #@ + &movdqa ($t0,@X[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &palignr ($t0,@X[0],$SZ); # X[1..4] + eval(shift(@insns)); + eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t1,$t0); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,$t0); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t0,$sigma0[2]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[0..3] += X[9..12] + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrld ($t2,$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[3],0b11111010); # X[4..15] + eval(shift(@insns)); + eval(shift(@insns)); #@ + &pslld ($t1,8*$SZ-$sigma0[1]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + &pxor ($t0,$t1); + eval(shift(@insns)); + eval(shift(@insns)); + &pslld ($t1,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t2); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) + 
eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &movdqa ($t2,$t3); + eval(shift(@insns)); + eval(shift(@insns)); + &psrld ($t3,$sigma1[2]); + eval(shift(@insns)); + eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &psrlq ($t2,$sigma1[1]-$sigma1[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &pxor ($t3,$t2); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa ($t2,16*$j."($Tbl)"); + eval(shift(@insns)); #@ + eval(shift(@insns)); + &pslldq ($t3,8); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ + eval(shift(@insns)); + eval(shift(@insns)); + } + &paddd ($t2,@X[0]); + foreach (@insns) { eval; } # remaining instructions + &movdqa (16*$j."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &SSSE3_256_00_47($j,\&body_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &jne (".Lssse3_00_47"); + + for ($i=0; $i<16; ) { + foreach(body_00_15()) { eval; } + } +$code.=<<___; + mov $_ctx,$ctx + mov $a1,$A + mov $_inp,$inp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + lea 16*$SZ($inp),$inp + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + jb .Lloop_ssse3 + + xorps %xmm0, %xmm0 + lea $framesz+6*8(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0, 0x00(%rsp) # scrub the stack + movaps %xmm0, 0x10(%rsp) + movaps %xmm0, 0x20(%rsp) + movaps %xmm0, 0x30(%rsp) +___ +$code.=<<___ if ($win64); + movaps 0x20(%rbp),%xmm6 + movaps 0x30(%rbp),%xmm7 + movaps 0x40(%rbp),%xmm8 + movaps 0x50(%rbp),%xmm9 +___ +$code.=<<___; + mov $framesz(%rbp),%r15 +.cfi_restore %r15 + mov -40(%r11),%r14 +.cfi_restore %r14 + mov -32(%r11),%r13 +.cfi_restore %r13 + mov -24(%r11),%r12 +.cfi_restore %r12 + mov -16(%r11),%rbx +.cfi_restore %rbx + mov -8(%r11),%rbp +.cfi_restore %rbp +.cfi_epilogue + lea (%r11),%rsp + ret +.cfi_endproc +.size ${func},.-${func} +___ +} +}}} +{ +my ($out,$inp,$len) = $win64 ? 
("%rcx","%rdx","%r8") : # Win64 order + ("%rdi","%rsi","%rdx"); # Unix order +$code.=<<___; +.globl ${pre}sha256_emit +.hidden ${pre}sha256_emit +.type ${pre}sha256_emit,\@abi-omnipotent +.align 16 +${pre}sha256_emit: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + bswap %r8 + mov 24($inp), %r11 + bswap %r9 + mov %r8d, 4($out) + bswap %r10 + mov %r9d, 12($out) + bswap %r11 + mov %r10d, 20($out) + shr \$32, %r8 + mov %r11d, 28($out) + shr \$32, %r9 + mov %r8d, 0($out) + shr \$32, %r10 + mov %r9d, 8($out) + shr \$32, %r11 + mov %r10d, 16($out) + mov %r11d, 24($out) + ret +.size ${pre}sha256_emit,.-${pre}sha256_emit + +.globl ${pre}sha256_bcopy +.hidden ${pre}sha256_bcopy +.type ${pre}sha256_bcopy,\@abi-omnipotent +.align 16 +${pre}sha256_bcopy: + sub $inp, $out +.Loop_bcopy: + movzb ($inp), %eax + lea 1($inp), $inp + mov %al, -1($out,$inp) + dec $len + jnz .Loop_bcopy + ret +.size ${pre}sha256_bcopy,.-${pre}sha256_bcopy + +.globl ${pre}sha256_hcopy +.hidden ${pre}sha256_hcopy +.type ${pre}sha256_hcopy,\@abi-omnipotent +.align 16 +${pre}sha256_hcopy: + mov 0($inp), %r8 + mov 8($inp), %r9 + mov 16($inp), %r10 + mov 24($inp), %r11 + mov %r8, 0($out) + mov %r9, 8($out) + mov %r10, 16($out) + mov %r11, 24($out) + ret +.size ${pre}sha256_hcopy,.-${pre}sha256_hcopy +___ +} + +sub sha256op38 { + my $instr = shift; + my %opcodelet = ( + "sha256rnds2" => 0xcb, + "sha256msg1" => 0xcc, + "sha256msg2" => 0xcd ); + + if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { + my @opcode=(0x0f,0x38); + push @opcode,$opcodelet{$instr}; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } else { + return $instr."\t".@_[0]; + } +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; + + print $_,"\n"; +} +close STDOUT; diff --git a/blst/asm/x86_64-xlate.pl b/blst/asm/x86_64-xlate.pl new file mode 100755 index 0000000..62be619 --- /dev/null +++ b/blst/asm/x86_64-xlate.pl @@ -0,0 +1,1781 @@ +#!/usr/bin/env perl +# +# Copyright Supranational LLC +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by @dot-asm. +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. +# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM one. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required +# that is. If not, just leave the stack as is. +# 3. 
Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about latter). +# 6. Don't use [or hand-code with .byte] "rep ret." "ret" mnemonic is +# required to identify the spots, where to inject Win64 epilogue! +# But on the pros, it's then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling unified +# Win64 prologue copies %rsp value to %rax. [Unless function is +# tagged with additional .type tag.] For further details see SEH +# paragraph at the end. +# 9. .init segment is allowed to contain calls to functions only. +# a. If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); + +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $dwarf=$elf; +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | \${CC:-false} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +$dwarf=0 if($win64); + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... 
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /cmov[n]?[lb]$/) { + # pass through + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4" + && !$current_function->{unwind}) { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". + "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... 
+ $value =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + if ($value =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg) { + $self->{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + my %sifmap = ( ss=>"d", sd=>"q", # broadcast only + i32x2=>"q", f32x2=>"q", + i32x4=>"x", i64x2=>"x", i128=>"x", + f32x4=>"x", f64x2=>"x", f128=>"x", + i32x8=>"y", i64x4=>"y", + f32x8=>"y", f64x4=>"y" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + $self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi; + $self->{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... 
+ $self->{label} =~ s/\b([0-9]+)\b/unpack("l",pack("L",$1))/eg; + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig; + $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?:broadcast|extract|insert)([sif]\w+)$/) + && ($sz=$sifmap{$1}); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($current_function->{name} eq $self->{value}) { + $func .= "\n.cfi_".cfi_directive::startproc() if ($dwarf); + $func .= "\n .byte 0xf3,0x0f,0x1e,0xfa\n"; # endbranch + if ($win64 && $current_function->{abi} eq "svr4") { + my $fp = $current_function->{unwind} ? 
"%r11" : "%rax"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,$fp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + my $fp = $current_function->{unwind} ? "r11" : "rax"; + $func .= " DB 243,15,30,250\n"; # endbranch + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov $fp,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? ":" : "\tPROC $current_function->{scope}"). + "\n DB 243,15,30,250"; # endbranch + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +my @xdata_seg = (".section .xdata", ".align 8"); +my @pdata_seg = (".section .pdata", ".align 4"); + +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + # + # In addition the .cfi directives are re-purposed even for Win64 + # stack unwinding. Two more synthetic directives were added: + # + # - .cfi_end_prologue to denote point when all non-volatile + # registers are saved and stack or [chosen] frame pointer is + # stable; + # - .cfi_epilogue to denote point when all non-volatile registers + # are restored [and it even adds missing .cfi_restore-s]; + # + # Though it's not universal "miracle cure," it has its limitations. + # Most notably .cfi_cfa_expression won't start working... For more + # information see the end of this file. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcole, sleb128 + regx => 0x90, # uleb28 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_off, $cfa_rsp, %saved_regs); + my @cfa_stack; + + # [us]leb128 format is variable-length integer representation base + # 2^128, with most significant bit of each byte being 0 denoting + # *last* most significant digit. See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... 
+ last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DWP_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ? uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + + # Following constants are defined in "x64 exception handling" at + # https://docs.microsoft.com/ and match the register sequence in + # CONTEXT structure defined in winnt.h. + my %WIN64_reg_idx = ( + "%rax"=>0, "%rcx"=>1, "%rdx"=>2, "%rbx"=>3, + "%rsp"=>4, "%rbp"=>5, "%rsi"=>6, "%rdi"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + sub xdata { + our @dat = (); + our $len = 0; + + sub allocstack { + my $offset = shift; + + if ($offset) { + if ($offset <= 128) { + $offset = ($offset - 8) >> 3; + push @dat, [0,$offset<<4|2]; # UWOP_ALLOC_SMALL + } elsif ($offset < 0x80000) { + push @dat, [0,0x01,unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,0x11,unpack("C4",pack("V",$offset))]; + } + $len += $#{@dat[-1]}+1; + } + } + + # allocate stack frame + if (my $offset = -8 - $cfa_rsp) { + # but see if frame pointer is among saved registers + if ($cfa_reg ne "%rsp" and my $fp_off = $saved_regs{$cfa_reg}) { + $fp_off = -8 - $fp_off; + allocstack($fp_off-8); + $offset -= $fp_off; + push @dat, [0,$WIN64_reg_idx{$cfa_reg}<<4]; # UWOP_PUSH_NONVOL + $len += $#{@dat[-1]}+1; + } + allocstack($offset); + } + # set up frame pointer + my $fp_info = 0; + if ($cfa_reg ne "%rsp") { + my $offset = $cfa_off - $cfa_rsp; + ($offset > 240 or $offset&0xf) and die "invalid FP offset $offset"; + $fp_info = ($offset&-16)|$WIN64_reg_idx{$cfa_reg}; + push @dat, [0,3]; # UWOP_SET_FPREG + $len += $#{@dat[-1]}+1; + } + # save registers + foreach my $key (sort { $saved_regs{$b} <=> $saved_regs{$a} } + keys(%saved_regs)) { + next if ($cfa_reg ne "%rsp" && $cfa_reg eq $key); + my $offset = $saved_regs{$key} - $cfa_rsp; + if ($key =~ /%xmm([0-9]+)/) { + if ($offset < 0x100000) { + push @dat, [0,($1<<4)|8,unpack("C2",pack("v",$offset>>4))]; + } else { + push @dat, [0,($1<<4)|9,unpack("C4",pack("V",$offset))]; + } + } else { + if ($offset < 0x80000) { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|4, + unpack("C2",pack("v",$offset>>3))]; + } else { + push @dat, [0,(($WIN64_reg_idx{$key})<<4)|5, + 
unpack("C4",pack("V",$offset))]; + } + } + $len += $#{@dat[-1]}+1; + } + + my @ret; + # generate 4-byte descriptor + push @ret, ".byte 1,0,".($len/2).",$fp_info"; + $len += 4; + # pad to 8*n + unshift @dat, [(0)x((-$len)&7)] if ($len&7); + # emit data + while(defined(my $row = pop @dat)) { + push @ret, ".byte ". join(",", + map { sprintf "0x%02x",$_ } @{$row}); + } + + return @ret; + } + sub startproc { + return if ($cfa_rsp == -8); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", -8, -8); + %saved_regs = (); + return "startproc"; + } + sub endproc { + return if ($cfa_rsp == 0); + ($cfa_reg, $cfa_off, $cfa_rsp) = ("%rsp", 0, 0); + %saved_regs = (); + return "endproc"; + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { $dir = startproc(); last; }; + /endproc/ && do { $dir = endproc(); + # .cfi_remember_state directives that are not + # matched with .cfi_restore_state are + # unnecessary. + die "unpaired .cfi_remember_state" if (@cfa_stack); + last; + }; + /def_cfa_register/ + && do { $cfa_off = $cfa_rsp if ($cfa_reg eq "%rsp"); + $cfa_reg = $$line; + last; + }; + /def_cfa_offset/ + && do { $cfa_off = -1*eval($$line); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { my $val = 1*eval($$line); + $cfa_off -= $val; + if ($cfa_reg eq "%rsp") { + $cfa_rsp -= $val; + } + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_off = -1*eval($2); + $cfa_rsp = $cfa_off if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $saved_regs{$$line} = $cfa_rsp; + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $cfa_off = $cfa_rsp; + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + delete $saved_regs{$$line}; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . + join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + /remember_state/ + && do { push @cfa_stack, + [$cfa_reg,$cfa_off,$cfa_rsp,%saved_regs]; + last; + }; + /restore_state/ + && do { ($cfa_reg,$cfa_off,$cfa_rsp,%saved_regs) + = @{pop @cfa_stack}; + last; + }; + /offset/ && do { if ($$line =~ /(%\w+)\s*,\s*(.+)/) { + $saved_regs{$1} = 1*eval($2); + $dir = undef if ($1 =~ /%xmm/); + } + last; + }; + /restore/ && do { delete $saved_regs{$$line}; last; }; + /end_prologue/ + && do { $dir = undef; + $self->{win64} = ".endprolog"; + last; + }; + /epilogue/ && do { $dir = undef; + $self->{win64} = ".epilogue"; + $self->{value} = join("\n", + map { ".cfi_restore\t$_" } + sort keys(%saved_regs)); + %saved_regs = (); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return $self->{value} if ($dwarf); + + if ($win64 and $current_function->{unwind} + and my $ret = $self->{win64}) { + my ($reg, $off) = ($cfa_reg =~ /%(?!rsp)/) ? 
($', $cfa_off) + : ("rsp", $cfa_rsp); + my $fname = $current_function->{name}; + + if ($ret eq ".endprolog") { + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + + push @pdata_seg, + ".rva .LSEH_begin_${fname}", + ".rva .LSEH_body_${fname}", + ".rva .LSEH_info_${fname}_prologue",""; + push @xdata_seg, + ".LSEH_info_${fname}_prologue:", + ".byte 1,0,5,0x0b", # 5 unwind codes, %r11 is FP + ".byte 0,0x74,1,0", # %rdi at 8(%rsp) + ".byte 0,0x64,2,0", # %rsi at 16(%rsp) + ".byte 0,0x03", # set frame pointer + ".byte 0,0" # padding + ; + push @pdata_seg, + ".rva .LSEH_body_${fname}", + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_info_${fname}_body",""; + push @xdata_seg,".LSEH_info_${fname}_body:", xdata(); + $ret = "${decor}SEH_body_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + } elsif ($ret eq ".epilogue") { + %saved_regs = (); + $saved_regs{"%rdi"} = 0; # relative to CFA, remember? + $saved_regs{"%rsi"} = 8; + $cfa_rsp = $cfa_off; + + push @pdata_seg, + ".rva .LSEH_epilogue_${fname}", + ".rva .LSEH_end_${fname}", + ".rva .LSEH_info_${fname}_epilogue",""; + push @xdata_seg,".LSEH_info_${fname}_epilogue:", xdata(), ""; + $ret = "${decor}SEH_epilogue_${fname}:"; + $ret .= ":" if ($masm); $ret .= "\n"; + if ($gas) { + $ret .= " mov ".(0-$off)."(%$reg),%rdi\n"; + $ret .= " mov ".(8-$off)."(%$reg),%rsi\n"; + } else { + $ret .= " mov rdi,QWORD$PTR\[".(0-$off)."+$reg\]"; + $ret .= " ;WIN64 epilogue\n"; + $ret .= " mov rsi,QWORD$PTR\[".(8-$off)."+$reg\]\n"; + } + } + return $ret; + } + return; + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg,$unwind) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + $current_function->{unwind} = $unwind; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif ($dir =~ /\.size/) { + $self->{value} = "" if (!$elf); + if ($dwarf and my $endproc = cfi_directive::endproc()) { + $self->{value} = ".cfi_$endproc\n$self->{value}"; + } elsif (!$elf && defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... + +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + 
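# pshufb xmm,xmm is hand-assembled as 66 [REX] 0F 38 00 /r: the destination + # register ($2) goes into the ModR/M reg field, the source ($1) into r/m, and + # rex() adds a REX prefix only when either register index is 8 or higher. + 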
rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... + +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... +# However, it should not be used in functions' prologues explicitly, as +# it's added automatically [and in the right spot]. Which leaves only +# non-function indirect branch targets, such as in a case-like dispatch +# table, as application area. + +my $endbr64 = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} + +sub process { + my $line = shift; + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +while(<>) { process($_); } + +map { process($_) } @pdata_seg if ($win64); +map { process($_) } @xdata_seg if ($win64); + +# platform-specific epilogue +if ($masm) { + print "\n$current_segment\tENDS\n" if ($current_segment); + print "END\n"; +} elsif ($elf) { + # -fcf-protection segment, snatched from compiler -S output + my $align = ($flavour =~ /elf32/) ? 4 : 8; + print <<___; + +.section .note.GNU-stack,"",\@progbits +.section .note.gnu.property,"a",\@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align $align +2: +___ +} + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. +# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... 
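+# +# (To spell out the arithmetic behind the Win64 numbers above: the return +# address sits at (%rsp) on entry and the callee-owned 4*8-byte register- +# parameter shadow area occupies 8(%rsp) through 39(%rsp), which is why the +# 5th argument is found at 40(%rsp) and why the 32 bytes starting at 8(%rsp) +# are free for scratch use.)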
+# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. Consider following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# dips->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module. 
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. +# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads. References put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required prior [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't it why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. 
Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification). +# +######################################################################## +# As of May 2020 an alternative approach that works with both exceptions +# and debugging/profiling was implemented by re-purposing DWARF .cfi +# annotations even for Win64 unwind tables' generation. Unfortunately, +# but not really unexpectedly, it imposes additional limitations on +# coding style. Probably most significant limitation is that frame +# pointer has to be at 16*n distance from stack pointer at the exit +# from prologue. But first things first. There are two additional +# synthetic .cfi directives, .cfi_end_prologue and .cfi_epilogue, +# that need to be added to all functions marked with additional .type +# tag (see example below). There are "do's and don'ts" for prologue +# and epilogue. It shouldn't come as surprise that in prologue one may +# not modify non-volatile registers, but one may not modify %r11 either. +# This is because it's used as temporary frame pointer(*). There is one +# exception to this rule, and it's setting up frame pointer that is +# non-volatile or %r11. But it must be last instruction in the prologue. +# Constraints for epilogue, or rather on its boundary, depend on whether +# the frame is fixed- or variable-length. In fixed-frame subroutine +# stack pointer has to be restored in the last instruction prior the +# .cfi_epilogue directive. If it's variable-frame subroutine, and a +# non-volatile register was used as frame pointer, then last instruction +# prior the directive has to restore its original value. This means that +# final stack pointer adjustment would have to be pushed past the +# directive. Normally this would render the epilogue non-unwindable, so +# special care has to be taken. To resolve the dilemma, copy frame +# pointer to a volatile register in advance. To give an example: +# +# .type rbp_as_frame_pointer,\@function,3,"unwind" # mind extra tag! +# rbp_as_frame_pointer: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# mov %rsp,%rbp # last instruction in prologue +# .cfi_def_cfa_register %rbp # %rsp-%rbp has to be 16*n, e.g. 16*0 +# .cfi_end_prologue +# sub \$40,%rsp +# and \$-64,%rsp +# ... +# mov %rbp,%r11 +# .cfi_def_cfa_register %r11 # copy frame pointer to volatile %r11 +# mov 0(%rbp),%rbx +# mov 8(%rbp),%rbp # last instruction prior epilogue +# .cfi_epilogue # may not change %r11 in epilogue +# lea 16(%r11),%rsp +# ret +# .cfi_endproc +# .size rbp_as_frame_pointer,.-rbp_as_frame_pointer +# +# To give an example of fixed-frame subroutine for reference: +# +# .type fixed_frame,\@function,3,"unwind" # mind extra tag! +# fixed_frame: +# .cfi_startproc +# push %rbp +# .cfi_push %rbp +# push %rbx +# .cfi_push %rbx +# sub \$40,%rsp +# .cfi_adjust_cfa_offset 40 +# .cfi_end_prologue +# ... +# mov 40(%rsp),%rbx +# mov 48(%rsp),%rbp +# lea 56(%rsp),%rsp +# .cfi_adjust_cfa_offset -56 +# .cfi_epilogue +# ret +# .cfi_endproc +# .size fixed_frame,.-fixed_frame +# +# As for epilogue itself, one can only work on non-volatile registers. +# "Non-volatile" in "Windows" sense, i.e. minus %rdi and %rsi. +# +# On a final note, mixing old-style and modernized subroutines in the +# same file takes some trickery. Ones of the new kind have to appear +# after old-style ones. 
This has everything to do with the fact that +# entries in the .pdata segment have to appear in strictly same order +# as corresponding subroutines, and auto-generated RUNTIME_FUNCTION +# structures get mechanically appended to whatever existing .pdata. +# +# (*) Just in case, why %r11 and not %rax. This has everything to do +# with the way UNWIND_INFO is, one just can't designate %rax as +# frame pointer. diff --git a/blst/assembly.S b/blst/assembly.S new file mode 100644 index 0000000..a1a7c54 --- /dev/null +++ b/blst/assembly.S @@ -0,0 +1,123 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# else +# include "elf/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# else +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# else +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# if defined(__BLST_PORTABLE__) +# include "coff/sha256-portable-x86_64.s" +# else +# include "coff/sha256-x86_64.s" +# endif +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# else +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# else +# include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# else +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# define __sub_mod_384x384 __sub_mont_384x384 +# if defined(__ADX__) && !defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# else +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include "mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 
__sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif diff --git a/blst/blst.h b/blst/blst.h new file mode 100644 index 0000000..aaee107 --- /dev/null +++ b/blst/blst.h @@ -0,0 +1,480 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void 
blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specifc Fr operations. + */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specifc Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specifc Fp2 operations. 
+ */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specifc Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(); +#endif // SWIG + +/* + * BLS12-381-specifc point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(); + +/* + * Multi-scalar multiplications and other multi-point operations. + */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. 
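+ *
+ * blst_hash_to_g1/g2 implement the uniform hash_to_curve construction,
+ * while blst_encode_to_g1/g2 implement the cheaper, non-uniform
+ * encode_to_curve variant. |DST| is the domain separation tag, and
+ * |aug| is optional augmentation data hashed in along with |msg|.
+ * blst_map_to_g1/g2 expose the lower-level map from field element(s)
+ * to a group point, for callers that hash to the field themselves.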
+ */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. + */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. 
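+ *
+ * Conceptually the pairing value is final_exp(miller_loop(Q, P)).
+ * blst_precompute_lines caches the 68 line evaluations for a fixed |Q|,
+ * so that repeated Miller loops against the same |Q| can be performed
+ * with blst_miller_loop_lines instead of recomputing them each time.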
+ */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/blst/blst_aux.h b/blst/blst_aux.h new file mode 100644 index 0000000..41c2901 --- /dev/null +++ b/blst/blst_aux.h @@ -0,0 +1,79 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. + */ + +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. 
+ */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +typedef struct {} blst_uniq; + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); + +#endif diff --git a/blst/bulk_addition.c b/blst/bulk_addition.c new file mode 100644 index 0000000..81afc53 --- /dev/null +++ b/blst/bulk_addition.c @@ -0,0 +1,168 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. 
+ * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. + * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. 
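+ *
+ * Each pass of ptype##s_accumulate pairs the points up: ptype##_head
+ * chains all pair denominators into one running product, a single
+ * reciprocal_##field call inverts that product, and the backwards walk
+ * recovers each individual 1/denominator so that ptype##_tail can
+ * complete the additions. This is the Montgomery batch-inversion trick
+ * mentioned above: one field inversion is amortized over the whole
+ * pass, the working set halves every pass, and the final <16 points are
+ * folded into |sum| with ptype##_dadd_affine.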
+ */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + /* Performance with 288K scratch is within 1-2-3% from optimal */ \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 2048 : 1024; \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; iY, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } diff --git a/blst/e2.c b/blst/e2.c new file mode 100644 index 0000000..eafc486 --- /dev/null +++ b/blst/e2.c @@ -0,0 +1,632 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + lshift_fp(out[1], out[1], 2); 
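+    /* With B = 4+4*i, (a + b*i)*(4 + 4*i) = 4*(a-b) + 4*(a+b)*i, which
+     * is exactly the subtract/add/left-shift-by-2 sequence above. */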
+} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const 
POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinitiy bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinitiy bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } diff --git a/blst/ec_mult.h b/blst/ec_mult.h new file mode 100644 index 0000000..192f733 --- /dev/null +++ b/blst/ec_mult.h @@ -0,0 +1,289 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
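Unlike get_wval above, it accumulates bytes in a fixed four-iteration loop: once |top| bytes have been consumed the mask drops to zero, so |d| stops advancing and the remaining iterations contribute nothing. 25 bits plus a 7-bit intra-byte offset still fit in those four bytes, and no read ever goes past the last byte that actually holds scalar bits.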
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + + wval = (wval + 1) >> 1; + wval = (wval & ~mask) | ((0-wval) & mask); + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling agorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static void ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1< 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype temp[1]; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? 
get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table, wval); \ + if (bits > 0) ptype##_add(ret, ret, temp); \ + else ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... 
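+ *
+ * As a hedged aside (not blst code; booth_digit() and the test value
+ * below are ours, for illustration only), the signed-digit "Booth"
+ * recoding performed by booth_encode() above can be checked on an
+ * ordinary 64-bit integer:
+ */
+
+#include <assert.h>
+#include <stdint.h>
+
+/* signed digit for the SZ-bit window starting at bit |off| */
+static int64_t booth_digit(uint64_t scalar, unsigned int off, unsigned int sz)
+{
+    uint64_t wmask = ((uint64_t)1 << (sz + 1)) - 1;
+    uint64_t wval = off ? (scalar >> (off - 1)) & wmask  /* window + bit below */
+                        : (scalar << 1) & wmask;         /* bit below bit 0 is 0 */
+
+    /* (wval+1)>>1 is the window value plus the borrowed low bit; subtracting
+     * 2^sz when the top bit is set yields a digit in [-2^(sz-1), 2^(sz-1)],
+     * which is why a table of 2^(SZ-1) points suffices. */
+    return (int64_t)((wval + 1) >> 1) - (int64_t)((wval >> sz) << sz);
+}
+
+static void booth_selftest(void)
+{
+    const unsigned int SZ = 5;                 /* window size used by *_mult_w5 */
+    uint64_t scalar = 0x9e3779b97f4a7c15ull;   /* arbitrary test value */
+    uint64_t acc = 0;
+    unsigned int off;
+
+    for (off = 0; off < 64; off += SZ)         /* sum of digit_i * 2^(SZ*i) */
+        acc += (uint64_t)booth_digit(scalar, off, SZ) << off;
+
+    assert(acc == scalar);                     /* the recoding is exact */
+}
+
+/* The affine ladder discussed in the note above follows.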
+ */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/blst/ec_ops.h b/blst/ec_ops.h new file mode 100644 index 0000000..0d531f8 --- /dev/null +++ b/blst/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
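+ *
+ * As a hedged, self-contained illustration (not blst code; the toy
+ * prime, curve and helper names below are ours), the same unified
+ * formula can be exercised over a small field where the results are
+ * easy to check by hand:
+ */
+
+#include <assert.h>
+#include <stdint.h>
+
+#define TOY_P 1009u     /* toy prime standing in for the 381-bit modulus */
+#define TOY_B 3u        /* toy curve y^2 = x^3 + 3; (1,2) lies on it     */
+
+typedef struct { uint32_t X, Y, Z; } toy_pt;   /* Jacobian coordinates */
+
+static uint32_t addm(uint32_t a, uint32_t b) { return (a + b) % TOY_P; }
+static uint32_t subm(uint32_t a, uint32_t b) { return (a + TOY_P - b) % TOY_P; }
+static uint32_t mulm(uint32_t a, uint32_t b)
+{   return (uint32_t)((uint64_t)a * b % TOY_P);   }
+
+static uint32_t powm(uint32_t a, uint32_t e)   /* for Fermat inversion */
+{
+    uint32_t r = 1;
+    for (; e; e >>= 1, a = mulm(a, a))
+        if (e & 1) r = mulm(r, a);
+    return r;
+}
+
+/* the unified add/double from the table above, a4==0, no infinity handling */
+static toy_pt toy_dadd(toy_pt p1, toy_pt p2)
+{
+    uint32_t Z1Z1 = mulm(p1.Z, p1.Z), Z2Z2 = mulm(p2.Z, p2.Z);
+    uint32_t U1 = mulm(p1.X, Z2Z2),   U2 = mulm(p2.X, Z1Z1);
+    uint32_t S1 = mulm(p1.Y, mulm(Z2Z2, p2.Z));
+    uint32_t S2 = mulm(p2.Y, mulm(Z1Z1, p1.Z));
+    uint32_t zz = mulm(p1.Z, p2.Z);
+    uint32_t H = subm(U2, U1), R = subm(S2, S1), sx = addm(U1, U2);
+    uint32_t HH, HHH;
+    toy_pt p3;
+
+    if (H == 0 && R == 0) {       /* same point: switch to the doubling column */
+        U1 = p1.X;  S1 = p1.Y;  zz = p1.Z;
+        H  = addm(p1.Y, p1.Y);                   /* H' = 2*Y1   */
+        R  = mulm(3, mulm(p1.X, p1.X));          /* R' = 3*X1^2 */
+        sx = addm(p1.X, p1.X);                   /* sx = X1+X1  */
+    }
+    HH = mulm(H, H);  HHH = mulm(HH, H);
+    p3.X = subm(mulm(R, R), mulm(HH, sx));              /* X3 = R^2-H^2*sx        */
+    p3.Y = subm(mulm(R, subm(mulm(HH, U1), p3.X)),
+                mulm(HHH, S1));                         /* Y3 = R*(H^2*U1-X3)-H^3*S1 */
+    p3.Z = mulm(H, zz);                                 /* Z3 = H*zz              */
+    return p3;
+}
+
+static void toy_affine(uint32_t *x, uint32_t *y, toy_pt p)
+{
+    uint32_t zi = powm(p.Z, TOY_P - 2), zi2 = mulm(zi, zi);
+    *x = mulm(p.X, zi2);
+    *y = mulm(p.Y, mulm(zi2, zi));
+}
+
+static void toy_selftest(void)
+{
+    toy_pt G = { 1, 2, 1 };
+    toy_pt D = toy_dadd(G, G);      /* exercises the doubling column */
+    toy_pt T = toy_dadd(D, G);      /* exercises the addition column */
+    uint32_t x, y;
+
+    toy_affine(&x, &y, D);
+    assert(x == 440 && y == 930);   /* 2*(1,2) on y^2 = x^3 + 3, computed by hand */
+    toy_affine(&x, &y, T);
+    assert(mulm(y, y) == addm(mulm(x, mulm(x, x)), TOY_B));   /* result is on the curve */
+}
+
+/* The constant-time macro below implements the same formula over the
+ * blst field types, replacing the data-dependent branch with vec_select.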
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
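+ *
+ * Note that when the doubling path is taken the two inputs represent
+ * the same point, which is why the right-hand column is written in
+ * terms of X2,Y2: the affine copy in |p2| is the cheaper one to double
+ * from, since its Z is implicitly 1.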
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
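+ *
+ * Unlike the _dadd variants above, this formula does not tolerate
+ * p1 == p2: in that case H and r both vanish and the result collapses
+ * to infinity, so callers must only pass independent, known-distinct
+ * points.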
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
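+ *
+ * The |one| macro argument is the field element 1 in Montgomery form
+ * (BLS12_381_Rx.* in the instantiations); it is substituted for Z3
+ * when |p1| is infinity, so the affine |p2| comes back unchanged as a
+ * Jacobian point.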
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
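+ *
+ * The bracketed [-a*...] and [+a*...] terms in the step comments drop
+ * out because a==0 on BLS12-381, and mul_by_4b_* multiplies by 4*b of
+ * the curve in question (b=4 on E1, b=4*(1+i) on the E2 twist).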
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
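+ *
+ * The point of the recovery is that, given Y1 and the x-only results
+ * for k*P and (k+1)*P, the curve's addition law determines Y2
+ * rationally, so no square root (and hence no sign ambiguity) is
+ * involved.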
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
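+ *
+ * The numbered comments here and in the two routines above refer to
+ * the step numbers in the paper, which is why they do not always run
+ * consecutively after the rescheduling described above.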
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
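+ *
+ * XYZZ ("extended Jacobian") coordinates carry ZZ=Z^2 and ZZZ=Z^3
+ * alongside X and Y, so that x=X/ZZ and y=Y/ZZZ; infinity is encoded
+ * by ZZ==ZZZ==0, which is what the vec_is_zero checks below test for.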
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
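+ *
+ * The |subtract| flag below folds a conditional negation of the affine
+ * input into the routine, so the same code path serves both adding and
+ * subtracting precomputed points.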
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/blst/elf/add_mod_256-armv8.S b/blst/elf/add_mod_256-armv8.S new file mode 100644 index 0000000..57476aa --- /dev/null +++ b/blst/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size 
add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel 
x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/blst/elf/add_mod_256-x86_64.s b/blst/elf/add_mod_256-x86_64.s new file mode 100644 index 0000000..2f41781 --- /dev/null +++ b/blst/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 
8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 
16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/add_mod_384-armv8.S b/blst/elf/add_mod_384-armv8.S new file mode 100644 index 0000000..55e0888 --- /dev/null +++ b/blst/elf/add_mod_384-armv8.S @@ -0,0 +1,931 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
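+ // note: 3*a mod p is formed as (2*a mod p) + a, i.e. one __lshift_mod_384
+ // followed by __add_mod_384_ab_are_loaded with the original operand reloaded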
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
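+ // note: conditional negation — p-a is kept only when the flag in x2 is
+ // non-zero and a itself is non-zero, otherwise a is copied through unchanged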
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
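+ // note: multiplication by (1+i) in Fp2 — the real half becomes re-im and the
+ // imaginary half re+im, via the __sub_mod_384/__add_mod_384_ab_are_loaded calls below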
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b 
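+ // note: same constant-time pattern as vec_select_48 above — cmeq turned x3 into
+ // an all-ones mask when x3==0, so BIT swaps in the [x2] stream only in that case;
+ // loads and stores are interleaved, presumably for throughput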
+ st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch diff --git a/blst/elf/add_mod_384-x86_64.s b/blst/elf/add_mod_384-x86_64.s new file mode 100644 index 0000000..df61986 --- /dev/null +++ b/blst/elf/add_mod_384-x86_64.s @@ -0,0 +1,1809 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + 
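+# note: the interleaved cmovc/mov sequence restores the saved raw sum whenever
+# subtracting the modulus borrowed, so %rdi receives (a+b) mod p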
movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type 
div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 +lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
__lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 +mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 
8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 
+.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 
72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: 
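+# note: the SSE2 vec_select_* family is a branchless two-way select — pcmpeqd
+# builds complementary masks from the %ecx selector and pand/por merge the
+# [%rsi] and [%rdx] streams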
+.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por 
%xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + 
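+# note: once %rdi would step past the last byte, cmova clamps it to the end and
+# zeroes the stride, so the remaining prefetches just re-touch the final cache line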
prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/add_mod_384x384-x86_64.s b/blst/elf/add_mod_384x384-x86_64.s new file mode 100644 index 0000000..084f3d8 --- /dev/null +++ b/blst/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: 
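+# note: the 768-bit add/sub entry points below only set up the frame and
+# dispatch to the __add_mod_384x384/__sub_mod_384x384 helpers defined above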
+.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ct_inverse_mod_256-armv8.S b/blst/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 0000000..347eb31 --- /dev/null +++ b/blst/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,784 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
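+ // note: |a|, |b|, |u|, |v| live in this 512-byte-aligned scratch area, so the
+ // later "eor ..., #256" flip-flops between the source and destination halves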
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/blst/elf/ct_inverse_mod_256-x86_64.s b/blst/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 0000000..c4d8d6d --- /dev/null +++ b/blst/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1185 @@ +.text + +.globl ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call 
__ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 
+ adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + 
andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + 
cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ct_inverse_mod_384-armv8.S b/blst/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 0000000..d7eca17 --- /dev/null +++ b/blst/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,717 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extenstion + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 
// corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
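+// Roughly speaking, __smul_383x63 below computes |u|*|f_| + |v|*|g_| for the
+// 383-bit |u|/|v| columns and the ~63-bit factors passed in x20/x21: each
+// operand is conditionally negated according to its factor's sign, the two
+// products are accumulated limb by limb, and the low six limbs of the sum are
+// stored at the destination, with the carry handed on in registers.
+// __smul_767x63_tail then folds the remaining limbs of |v| into the high half,
+// extending the result to the full-width |u|/|v| accumulator that the
+// inversion keeps growing across iterations.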
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/blst/elf/ct_is_square_mod_384-armv8.S b/blst/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 0000000..ce670b7 --- /dev/null +++ b/blst/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,324 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // just load + mov x14, x9 // ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr 
x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x15, x15, #1 + and x21, x14, x24 + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x14, #2 + lsr x8, x8, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/blst/elf/ct_is_square_mod_384-x86_64.s b/blst/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 0000000..fec1493 --- /dev/null +++ b/blst/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,479 @@ +.text + +.globl ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 
+ movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl 
%ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ctq_inverse_mod_384-x86_64.s b/blst/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..b702262 --- /dev/null +++ b/blst/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1195 @@ +.text + +.globl ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq 
%r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq 
$256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type __smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq 
%rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + 
addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + 
mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/ctx_inverse_mod_384-x86_64.s b/blst/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 0000000..25a5fa5 --- /dev/null +++ b/blst/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1574 @@ +.text + +.globl ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq 
%rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq 
%rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 
48(%rsi),%r10 + + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + 
adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq 
%r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq 
%r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __inner_loop_62,@function +.align 32 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_62: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/div3w-armv8.S b/blst/elf/div3w-armv8.S new file mode 100644 index 0000000..a2b1d67 --- /dev/null +++ b/blst/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + specilative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... 
and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/blst/elf/div3w-x86_64.s b/blst/elf/div3w-x86_64.s new file mode 100644 index 0000000..00ae569 --- /dev/null +++ b/blst/elf/div3w-x86_64.s @@ -0,0 +1,123 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mul_mont_256-armv8.S b/blst/elf/mul_mont_256-armv8.S new file mode 100644 index 0000000..8bb1197 --- /dev/null +++ b/blst/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/blst/elf/mul_mont_384-armv8.S b/blst/elf/mul_mont_384-armv8.S new file mode 100644 index 0000000..c048e81 --- /dev/null +++ b/blst/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/blst/elf/mulq_mont_256-x86_64.s b/blst/elf/mulq_mont_256-x86_64.s new file mode 100644 index 0000000..37abd43 --- /dev/null +++ b/blst/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,714 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + 
addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 +from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + 
+ + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq 
%rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulq_mont_384-x86_64.s b/blst/elf/mulq_mont_384-x86_64.s new file mode 100644 index 0000000..fa9dd35 --- /dev/null +++ b/blst/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3620 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq 
%r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + 
movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 
+.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq 
%rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 
32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq 
%r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 
40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq 
%r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp 
+.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq 
$0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + 
mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq 
%r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redc_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq 
(%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 
16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + 
movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 
24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulx_mont_256-x86_64.s b/blst/elf/mulx_mont_256-x86_64.s new file mode 100644 index 0000000..20a0207 --- /dev/null +++ b/blst/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,627 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 
24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax 
+ adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/mulx_mont_384-x86_64.s b/blst/elf/mulx_mont_384-x86_64.s new file mode 100644 index 0000000..9f9f740 --- /dev/null +++ b/blst/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2968 @@ +.text + + + + + + + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__sub_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384,.-__sub_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __add_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __sub_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __sub_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx 
+.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __add_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __sub_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq 
%r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __sub_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __sub_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __sub_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 
48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 
128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 
0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redc_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + 
adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redc_tail_mont_384,@function +.align 32 +__redc_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + 
movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 
40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 
16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + 
movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + 
movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 
24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type 
sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff 
--git a/blst/elf/sha256-armv8.S b/blst/elf/sha256-armv8.S
new file mode 100644
index 0000000..7341dec
--- /dev/null
+++ b/blst/elf/sha256-armv8.S
@@ -0,0 +1,1077 @@
+//
+// Copyright Supranational LLC
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+//
+// ====================================================================
+// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
+// project.
+// ====================================================================
+//
+// sha256_block procedure for ARMv8.
+//
+// This module is stripped of scalar code paths, with the rationale that all
+// known processors are NEON-capable.
+//
+// See original module at CRYPTOGAMS for further details.
+
+.text
+
+.align 6
+.type .LK256,%object
+.LK256:
+.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+.long 0 //terminator
+.size .LK256,.-.LK256
+.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
+.align 2
+.align 2
+.globl blst_sha256_block_armv8
+.type blst_sha256_block_armv8,%function
+.align 6
+blst_sha256_block_armv8:
+.Lv8_entry:
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 
v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + 
eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor 
w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + 
eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + 
eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + 
lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/blst/elf/sha256-portable-x86_64.s b/blst/elf/sha256-portable-x86_64.s new file mode 100644 index 0000000..20b5c41 --- /dev/null +++ b/blst/elf/sha256-portable-x86_64.s @@ -0,0 +1,1754 @@ +.text + +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp +.cfi_adjust_cfa_offset 16*4+3*8 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl 
%r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + 
movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + 
rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl 
%r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl 
%r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl 
%r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + 
xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 
0xc0000002,4,3 +.align 8 +2: diff --git a/blst/elf/sha256-x86_64.s b/blst/elf/sha256-x86_64.s new file mode 100644 index 0000000..47fdc5b --- /dev/null +++ b/blst/elf/sha256-x86_64.s @@ -0,0 +1,1446 @@ +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + 
nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $40,%rsp +.cfi_adjust_cfa_offset 40 + leaq (%rsi,%rdx,4),%rdx + movq %rdi,0(%rsp) + + movq %rdx,16(%rsp) + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,8(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 
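+# Note: the .byte sequences in this file hand-encode SSSE3/SHA-NI instructions
+# (66,0f,38,00 = pshufb, 66,0f,3a,0f = palignr, 0f,38,cb/cc/cd = sha256rnds2/
+# sha256msg1/sha256msg2), presumably emitted as raw bytes so the code still
+# assembles with toolchains that do not recognize these mnemonics.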
+ movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + 
andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d 
+ addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + 
addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + 
andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl 
%r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 0(%rbp),%rdi + movl %r14d,%eax + movq 8(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq 16(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + leaq 40+48(%rbp),%r11 +.cfi_def_cfa %r11,8 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq 40(%rbp),%r15 +.cfi_restore %r15 + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbx +.cfi_restore %rbx + movq -8(%r11),%rbp +.cfi_restore %rbp + + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/blst/errors.h b/blst/errors.h new file mode 100644 index 0000000..425daeb --- /dev/null +++ b/blst/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/blst/exp.c b/blst/exp.c new file mode 100644 index 0000000..55c5c5a --- /dev/null +++ b/blst/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/blst/exports.c b/blst/exports.c new file mode 100644 index 0000000..833c18a --- /dev/null +++ b/blst/exports.c @@ -0,0 +1,584 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" + +/* + * BLS12-381-specifc Fr shortcuts to assembly. 
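+ *
+ * These are thin bindings of the Montgomery arithmetic to the scalar-field
+ * order r; operands are vec256 limb vectors held in the Montgomery domain.
+ * An illustrative sketch of how they compose (not a prescribed calling
+ * sequence; x, y, xy are caller-provided values):
+ *
+ *   vec256 x, y, xy;
+ *   blst_fr_to(x, x);       // enter the Montgomery domain
+ *   blst_fr_to(y, y);
+ *   blst_fr_mul(xy, x, y);  // xy = x*y mod r, still in Montgomery form
+ *   blst_fr_from(xy, xy);   // back to the canonical representation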
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 a_fr, b_fr; + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(a_fr, a, sizeof(a_fr)); + limbs_from_le_bytes(b_fr, b, sizeof(a_fr)); + a = (const byte *)a_fr; + b = (const byte *)b_fr; + } + mul_mont_sparse_256(a_fr, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(b_fr, (const limb_t *)b, BLS12_381_rRR, + BLS12_381_r, r0); + mul_mont_sparse_256(a_fr, a_fr, b_fr, BLS12_381_r, r0); + from_mont_256(a_fr, a_fr, BLS12_381_r, r0); + le_bytes_from_limbs(ret, a_fr, sizeof(a_fr)); + + return (int)(vec_is_zero(a_fr, sizeof(a_fr)) ^ 1); +} + +void blst_sk_inverse(pow256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 
32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
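+ *
+ * An Fp2 operand is a vec384x pair {c0, c1} standing for c0 + c1*u with
+ * u^2 + 1 = 0 (see the tower description in fp12_tower.c); all reductions
+ * below are modulo the base-field prime BLS12_381_P.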
+ */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deseriazation + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char 
little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + while (n > 32) { + limbs_from_le_bytes(t.digit, bytes, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + bytes += 32; + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_le_bytes(t.digit, bytes, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + struct { vec256 out, digit, radix; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + vec_copy(t.radix, BLS12_381_rRR, sizeof(t.radix)); + + bytes += n; + while (n > 32) { + limbs_from_be_bytes(t.digit, bytes -= 32, 32); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.radix, t.radix, BLS12_381_rRR, BLS12_381_r, r0); + n -= 32; + } + + vec_zero(t.digit, sizeof(t.digit)); + limbs_from_be_bytes(t.digit, bytes -= n, n); + from_mont_256(t.digit, t.digit, BLS12_381_r, r0); + mul_mont_sparse_256(t.digit, t.digit, t.radix, BLS12_381_r, r0); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(t.out, 2*sizeof(t.out)); + + return (int)(ret^1); +} + +/* + * Test facilitator + */ +static unsigned char nibble(char c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'a' && c <= 'f') + return 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + return 10 + c - 'A'; + else + return 16; +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +void blst_scalar_from_hexascii(vec256 ret, const char *hex) +{ limbs_from_hexascii(ret, sizeof(vec256), hex); } + +void blst_fp_from_hexascii(vec384 ret, 
const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/blst/fields.h b/blst/fields.h new file mode 100644 index 0000000..3e451c4 --- /dev/null +++ b/blst/fields.h @@ -0,0 +1,211 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +#ifndef __CUDA_ARCH__ +/* + * BLS12-381-specifc Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specifc Fp2 shortcuts to assembly. 
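+ *
+ * Arithmetic here is modulo u^2 + 1, so a product needs only three Fp
+ * multiplications via the Karatsuba-style identity
+ *   (a0 + a1*u)*(b0 + b1*u)
+ *     = (a0*b0 - a1*b1) + ((a0 + a1)*(b0 + b1) - a0*b0 - a1*b1)*u,
+ * the same schedule followed by the mul_fp2 implementation in the
+ * __CUDA_ARCH__ branch further down in this header.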
+ */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! 
*/ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#else + +extern "C" { +__device__ void mul_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void sqr_fp(vec384 ret, const vec384 a); +__device__ void add_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void sub_fp(vec384 ret, const vec384 a, const vec384 b); +__device__ void cneg_fp(vec384 ret, const vec384 ap, unsigned int flag); +__device__ void rshift_fp(vec384 ret, const vec384 a, unsigned int cnt); +__device__ void lshift_fp(vec384 ret, const vec384 a, unsigned int cnt); +__device__ void mul_by_3_fp(vec384 ret, const vec384 a); +__device__ void from_fp(vec384 ret, const vec384 a); + +#pragma diag_suppress 3151 +__device__ void mul_384(vec768 ret, const vec384 a, const vec384 b); +__device__ void sqr_384(vec768 ret, const vec384 a); +#pragma diag_default 3151 +__device__ void redc_fp(vec384 ret, const vec768 a); +__device__ void add_fpx2(vec768 ret, const vec768 a, const vec768 b); +__device__ void sub_fpx2(vec768 ret, const vec768 a, const vec768 b); + +__device__ void vec_load_global(limb_t *ret, const limb_t *a, + unsigned int sz = 48); +} + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ lshift_fp(ret, a, 3); } + +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + add_fp(ret[0], a[0], b[0]); + add_fp(ret[1], a[1], b[1]); +} + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + sub_fp(ret[0], a[0], b[0]); + sub_fp(ret[1], a[1], b[1]); +} + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ + mul_by_3_fp(ret[0], a[0]); + mul_by_3_fp(ret[1], a[1]); +} + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ + lshift_fp(ret[0], a[0], 3); + lshift_fp(ret[1], a[1], 3); +} + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_fp(ret[0], a[0], count); + lshift_fp(ret[1], a[1], count); +} + +static inline void cneg_fp2(vec384x ret, const vec384x a, limb_t flag) +{ + cneg_fp(ret[0], a[0], flag); + cneg_fp(ret[1], a[1], flag); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ + vec384 aa, bb, cc; + + add_fp(aa, a[0], a[1]); + add_fp(bb, b[0], b[1]); + mul_fp(bb, bb, aa); + + mul_fp(aa, a[0], b[0]); + mul_fp(cc, a[1], b[1]); + + sub_fp(ret[0], aa, cc); + sub_fp(ret[1], bb, aa); + sub_fp(ret[1], ret[1], cc); +} + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ + vec384 t0, t1; + + add_fp(t0, a[0], a[1]); + sub_fp(t1, a[0], a[1]); + + mul_fp(ret[1], a[0], a[1]); + add_fp(ret[1], ret[1], ret[1]); + + mul_fp(ret[0], t0, t1); +} +#endif + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/blst/fp12_tower.c b/blst/fp12_tower.c new file mode 100644 index 0000000..037b7db --- /dev/null +++ b/blst/fp12_tower.c @@ -0,0 +1,771 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. 
In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! |ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void 
redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, 
a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... 
moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], 
t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + 
TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specifc Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } diff --git a/blst/hash_to_field.c b/blst/hash_to_field.c new file mode 100644 index 0000000..42733b1 --- /dev/null +++ b/blst/hash_to_field.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
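+ * The loop below produces the remaining blocks b_2..b_ell,
+ * each one computed as
+ * SHA-256(strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime)
+ * over the template already laid out in |b_i.c| above.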
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/blst/keygen.c b/blst/keygen.c new file mode 100644 index 0000000..de749ac --- /dev/null +++ b/blst/keygen.c @@ -0,0 +1,182 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt[32] = "BLS-SIG-KEYGEN-SALT-"; + size_t salt_len = 20; + + if (IKM_len < 32) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + do { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt, &scratch.ctx.ctx); + salt_len = sizeof(salt); + + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. + */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + } while (vec_is_zero(scratch.key, sizeof(vec256))); + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} +#endif diff --git a/blst/map_to_g1.c b/blst/map_to_g1.c new file mode 100644 index 0000000..6613d68 --- /dev/null +++ b/blst/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... + */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + 
TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... + k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
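+ * The k coefficients in these tables are stored pre-multiplied by
+ * 2^384 mod P, i.e. in Montgomery form, so they can be combined
+ * directly with the Montgomery-domain coordinates by mul_fp/add_fp.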
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
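+    /*
+     * Each remaining power is derived from previously computed ones with
+     * a single sqr_fp or mul_fp; the "m+n" annotations record which
+     * exponents are combined, e.g. the next line forms ZZ^6 as (ZZ^3)^2,
+     * annotated 3+3.
+     */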
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
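+    /*
+     * Exceptional case of the simplified SWU map: xd can vanish only
+     * when Z*u^2*(Z*u^2 + 1) == 0, in which case the constant-time
+     * select below substitutes Z*A for the denominator, as in the
+     * referenced hash-to-curve draft.
+     */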
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
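+    /*
+     * vec_select above sets P.Z to one in Montgomery form
+     * (BLS12_381_Rx.p) for an ordinary affine point, or to zero (the
+     * point's own X) for the all-zero encoding of infinity, so
+     * POINTonE1_in_G1 sees a well-formed Jacobian point.
+     */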
return (int)POINTonE1_in_G1(&P); +} diff --git a/blst/map_to_g2.c b/blst/map_to_g2.c new file mode 100644 index 0000000..90fd86e --- /dev/null +++ b/blst/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
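For reference, the computation in this function is the fraction-free form of the simplified SWU map from the hash-to-curve draft, applied to the isogenous curve y^2 = x^3 + A'x + B' with the non-square Z = -2 - i chosen above. Informally, and up to the exceptional case handled by the ZxA constant:

$$t = Z^2u^4 + Zu^2,\qquad x_1 = \frac{B'(t+1)}{-A'\,t},\qquad x_2 = Z u^2 x_1,\qquad g(x) = x^3 + A'x + B'$$

$$(x,y) = \begin{cases}\bigl(x_1,\ \sqrt{g(x_1)}\bigr) & \text{if } g(x_1) \text{ is a square in } \mathbb{F}_{p^2},\\ \bigl(x_2,\ \sqrt{g(x_2)}\bigr) & \text{otherwise,}\end{cases}\qquad \text{with the sign of } y \text{ fixed so that } \operatorname{sgn0}(y)=\operatorname{sgn0}(u).$$

The code never divides: x is carried as the fraction xn/xd, and the square root and the inversion of the denominator are merged into the single recip_sqrt_fp2 call.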
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
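As a usage sketch (not part of the patch), hashing a message to G2 with the blst_hash_to_g2 wrapper defined just below reduces to one call. The DST shown is the conventional ciphersuite tag for the "signature in G2" basic scheme and is only an example; applications must choose their own domain-separation tag. Points produced this way are already in the subgroup; the blst_p2_affine_in_g2 check further down is what you would run on untrusted, deserialized points instead.

static void hash_msg_to_g2(POINTonE2 *out,
                           const unsigned char *msg, size_t msg_len)
{
    /* example DST only; pick a scheme-specific tag in real code */
    static const unsigned char DST[] =
        "BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_";

    /* aug/aug_len carry optional message augmentation; NULL/0 when unused */
    blst_hash_to_g2(out, msg, msg_len, DST, sizeof(DST) - 1, NULL, 0);
}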
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/blst/multi_scalar.c b/blst/multi_scalar.c new file mode 100644 index 0000000..d0b3dee --- /dev/null +++ b/blst/multi_scalar.c @@ -0,0 +1,414 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = strideZ, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
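The batch to-affine conversion above relies on Montgomery's simultaneous-inversion trick. Isolated from the point bookkeeping it looks like the sketch below; batch_inverse_fp is a hypothetical helper, out and in must not alias, and all inputs must be non-zero. One reciprocal_fp plus roughly 3*(n-1) multiplications replace n separate inversions; the macro above applies the same idea to the Z coordinates and folds in the 1/Z^2 and 1/Z^3 computations.

static void batch_inverse_fp(vec384 out[], const vec384 in[], size_t n)
{
    size_t i;
    vec384 acc;

    vec_copy(out[0], in[0], sizeof(vec384));
    for (i = 1; i < n; i++)
        mul_fp(out[i], out[i-1], in[i]);    /* out[i] = in[0]*...*in[i]        */

    reciprocal_fp(acc, out[n-1]);           /* acc = 1/(in[0]*...*in[n-1])     */

    for (i = n; --i > 0;) {
        mul_fp(out[i], acc, out[i-1]);      /* out[i] = 1/in[i]                */
        mul_fp(acc, acc, in[i]);            /* acc = 1/(in[0]*...*in[i-1])     */
    }
    vec_copy(out[0], acc, sizeof(vec384));  /* out[0] = 1/in[0]                */
}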
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
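A possible calling sequence for the precomputation path, once both macros are instantiated for blst_p1 further down, is sketched here. It is illustrative only: wbits = 8 and nbits = 255 are example choices, error handling is omitted, and the two-element pointer arrays use the convention visible in the code above, where a NULL entry means the remaining points/scalars are laid out contiguously after the first pointer.

#include <stdlib.h>

static void msm_with_precomputed_table(POINTonE1 *ret,
                                       const POINTonE1_affine points[], /* [npoints] */
                                       const byte scalars[],  /* npoints * 32, little-endian */
                                       size_t npoints)
{
    const size_t wbits = 8, nbits = 255;
    const POINTonE1_affine *points_arg[2] = { points, NULL };
    const byte *scalars_arg[2] = { scalars, NULL };

    POINTonE1_affine *table =
        malloc(blst_p1s_mult_wbits_precompute_sizeof(wbits, npoints));
    POINTonE1 *scratch = malloc(blst_p1s_mult_wbits_scratch_sizeof(npoints));

    blst_p1s_mult_wbits_precompute(table, wbits, points_arg, npoints);
    /* |table| can now be reused for any number of scalar sets */
    blst_p1s_mult_wbits(ret, table, wbits, npoints, scalars_arg, nbits, scratch);

    free(scratch);
    free(table);
}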
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
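The bucket "integration" below computes the weighted sum of all buckets without a single multiplication, by accumulating suffix sums. The same idea on plain integers, for clarity; integrate_buckets_int is a hypothetical stand-in using 1-based indexing, so B[i] plays the role of the bucket for digit value i.

static long integrate_buckets_int(const long B[], size_t m)  /* uses B[1..m] */
{
    long acc = B[m], ret = B[m];
    size_t k;

    for (k = m - 1; k >= 1; k--) {
        acc += B[k];     /* acc = B[k] + B[k+1] + ... + B[m]                 */
        ret += acc;      /* after the loop: ret = sum of i*B[i] for i=1..m   */
    }
    return ret;
}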
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); } + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/blst/no_asm.h b/blst/no_asm.h new file mode 100644 index 0000000..4f12f53 --- /dev/null +++ b/blst/no_asm.h @@ -0,0 +1,1287 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) 
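Referring back to the Pippenger implementation in multi_scalar.c above, a caller might look like the sketch below. It is illustrative only: msm_pippenger_sketch is a hypothetical wrapper, the points/scalars pointer arrays follow the same NULL-terminated convention as elsewhere in that file, and allocating 1<<window buckets is a safe upper bound on the scratch space the tile routine uses for its chosen window.

#include <stdlib.h>

static void msm_pippenger_sketch(POINTonE1 *ret,
                                 const POINTonE1_affine *const points[],
                                 const byte *const scalars[],
                                 size_t npoints, size_t nbits)
{
    size_t window = pippenger_window_size(npoints);
    /* one bucket per Booth digit value; 1<<window entries is always enough */
    POINTonE1xyzz *scratch = malloc(sizeof(POINTonE1xyzz) << window);

    blst_p1s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch);
    free(scratch);
}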
& 1; + } + + mask = carry - borrow; + + for(i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + for(i=0; i> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + + for(i=0; i> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) 
+{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... + */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... 
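mul_mont_384x above uses the familiar three-multiplication formula for arithmetic in Fp2 = Fp[i]/(i^2+1), and the accompanying sqr_mont_382x the corresponding two-multiplication squaring. For reference:

$$(a_0 + a_1 i)(b_0 + b_1 i) = (a_0 b_0 - a_1 b_1) + \bigl((a_0 + a_1)(b_0 + b_1) - a_0 b_0 - a_1 b_1\bigr)\,i$$

$$(a_0 + a_1 i)^2 = (a_0 + a_1)(a_0 - a_1) + 2\,a_0 a_1\, i$$

The _382x/_383 variants additionally rely on the modulus being short of the limb boundary, as the comment above mul_mont_nonred_n notes, so intermediate conditional subtractions can be skipped and fixed up once at the end.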
*/ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#define MSB(x) ((x) >> (LIMB_T_BITS-1)) + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. + */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, 
i=0; i> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); + carry = a_[i+1]; + ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; 
+ g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? 
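ct_is_sqr_mod_n above decides quadratic residuosity with a constant-time, binary Legendre/Jacobi-style computation, accumulating the symbol's sign in L as the GCD-like loop runs. The property it reports is the one given by Euler's criterion:

$$a^{\frac{p-1}{2}} \equiv \begin{cases} +1 \pmod p & \text{if } a \text{ is a non-zero square mod } p,\\ -1 \pmod p & \text{otherwise (for } \gcd(a,p)=1). \end{cases}$$

Evaluating the symbol via the GCD-style loop avoids the full modular exponentiation that a direct application of the criterion would require.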
*/ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ +#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + 
K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; iZ); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + 
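Stepping back to the SHA-256 primitives just above: they process whole 64-byte blocks with the caller owning the 8-word state, so a complete digest needs the standard initial values and padding supplied by the caller. A minimal sketch for the empty message follows; it is illustrative only, and real callers would go through the higher-level helpers declared in sha256.h instead.

static void sha256_of_empty(unsigned char md[32])
{
    /* standard SHA-256 initial hash values */
    unsigned int v[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                          0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
    /* one padded block: 0x80, zeros, and a zero 64-bit big-endian length */
    unsigned char block[64] = { 0x80 };

    blst_sha256_block_data_order(v, block, 1);
    blst_sha256_emit(md, v);    /* e3b0c442...7852b855 for the empty message */
}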
sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. */ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t 
i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. */ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
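With the Miller loop in place, and the final exponentiation defined a little further down, a pairing-equation check e(P1, Q1) == e(P2, Q2) can be sketched as below. This is illustrative only; a production verifier would rather accumulate both pairings in one multi-Miller loop and do a single final exponentiation, much like the aggregate-signature path is set up to do.

static bool_t pairings_are_equal(const POINTonE1_affine *P1,
                                 const POINTonE2_affine *Q1,
                                 const POINTonE1_affine *P2,
                                 const POINTonE2_affine *Q2)
{
    vec384fp12 e1, e2;

    miller_loop_n(e1, Q1, P1, 1);   /* optimal-ate Miller loop */
    final_exp(e1, e1);
    miller_loop_n(e2, Q2, P2, 1);
    final_exp(e2, e2);

    return vec_is_equal(e1, e2, sizeof(e1));
}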
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? 
P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/blst/point.h b/blst/point.h new file mode 100644 index 0000000..4d041b0 --- /dev/null +++ b/blst/point.h @@ -0,0 +1,61 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/blst/rb_tree.c b/blst/rb_tree.c new file mode 100644 index 0000000..207becd --- /dev/null +++ b/blst/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +/* + * Red-black tree tailored for uniqueness test. 
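A usage sketch for the uniqueness tree defined below: all_msgs_unique is a hypothetical caller that sizes one context for a known message count and rejects the batch as soon as a duplicate shows up. It relies on the three blst_uniq_* entry points at the end of this file; allocation failure handling is omitted.

#include <stdlib.h>

static int all_msgs_unique(const unsigned char *const msgs[],
                           const size_t lens[], size_t n)
{
    struct rb_tree *tree = malloc(blst_uniq_sizeof(n));
    size_t i;
    int ok = 1;

    blst_uniq_init(tree);
    for (i = 0; i < n && ok; i++)
        ok = blst_uniq_test(tree, msgs[i], lens[i]);  /* 0 means duplicate */

    free(tree);
    return ok;
}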
Amount of messages to be + * checked is known prior context initialization, implementation is + * insert-only, failure is returned if message is already in the tree. + */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0<len1 ? len0 : len1; + long a, b; + + for (i = 0; i < len; i++) { + if ((a = ptr0[i]) != (b = ptr1[i])) + return a - b; + } + + return (long)len0 - (long)len1; +} + +#define PAINT_BLACK(p) ((p)->len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/blst/recip-addchain.h b/blst/recip-addchain.h new file mode 100644 index 0000000..e4e436a --- /dev/null +++ b/blst/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base.
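 + * That is, by Fermat's little theorem x^(BLS12_381_P-2) == x^-1 (mod BLS12_381_P) + * for any non-zero x.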
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/blst/recip.c b/blst/recip.c new file mode 100644 index 0000000..e0c7006 --- /dev/null +++ b/blst/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. + */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + 
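/* Constant-time inversion: |ct_inverse_mod_383| leaves a wide (768-bit) result, + * which is reduced with |redc_mont_384| and scaled by RRx4, i.e. (4<<768)%P, so + * that the output lands back in the Montgomery domain. */ +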
ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/blst/server.c b/blst/server.c new file mode 100644 index 0000000..52c1812 --- /dev/null +++ b/blst/server.c @@ -0,0 +1,24 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "aggregate.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" +#include "rb_tree.c" diff --git a/blst/sha256.h b/blst/sha256.h new file mode 100644 index 0000000..77ddb6d --- /dev/null +++ b/blst/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. + */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) { + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/blst/sqrt-addchain.h b/blst/sqrt-addchain.h new file mode 100644 
index 0000000..4e7f0be --- /dev/null +++ b/blst/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). + * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); 
*/ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* 
sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 
680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* 
sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ 
/* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 
5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/blst/sqrt.c b/blst/sqrt.c new file mode 100644 index 0000000..cf149fd --- /dev/null +++ b/blst/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
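+ *
+ * For context: either path here (the generic exp_mont_384() call above or
+ * the RECIP_SQRT_MOD_BLS12_381_P addition chain included below) computes
+ * out = inp^((P-3)/4) mod P. Since BLS12_381_P ≡ 3 (mod 4), for a
+ * quadratic residue x the power x^((P+1)/4) is a square root of x, hence
+ *
+ *   x^((P-3)/4) = x^((P+1)/4)/x = sqrt(x)/x = 1/sqrt(x).
+ *
+ * A plain square root then costs one extra multiplication, e.g. (sketch,
+ * mirroring sqrt_fp() further down in this file):
+ *
+ *   recip_sqrt_fp_3mod4(t, x);   // t = x^((P-3)/4) = 1/sqrt(x)
+ *   mul_fp(t, t, x);             // t = x^((P+1)/4) = sqrt(x)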
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/blst/vect.c b/blst/vect.c new file mode 100644 index 0000000..1834a48 --- /dev/null +++ b/blst/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
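+ *
+ * Here z is the BLS12-381 curve parameter, z = -0xd201000000010000 (the
+ * code operates on its absolute value), so the zz[] constant below holds
+ * |z|^2 = 0xac45a4010001a402_0000000100000000 as two 64-bit limbs,
+ * least-significant limb first, while div_by_z() further down divides by
+ * |z| itself.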
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/blst/vect.h b/blst/vect.h new file mode 100644 index 0000000..11b5836 --- /dev/null +++ b/blst/vect.h @@ -0,0 +1,483 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Bolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g. 
-march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, const vec384 mod, + const vec384 
modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define sqr_n_mul_mont_384x sqrx_n_mul_mont_384x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384x(vec384x ret, const vec384x a, size_t count, + const vec384 p, limb_t n0, const vec384x b); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if defined(__CUDA_ARCH__) +# define inline inline __device__ +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ return (v[i/8] >> (i%8)) & 1; } + +static inline bool_t byte_is_zero(unsigned char c) +{ return ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); } + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void bytes_zero(unsigned char *a, size_t num) +{ + size_t i; + + for (i = 0; i < num; i++) + a[i] = 0; +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask = (limb_t)0 - cbit; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? 
a : b */ +#ifdef __CUDA_ARCH__ +extern "C" { +__device__ void vec_select_48(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_96(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_192(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_144(void *ret, const void *a, const void *b, + unsigned int sel_a); +__device__ void vec_select_288(void *ret, const void *a, const void *b, + unsigned int sel_a); +} +#else +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +#endif +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ +#ifndef __BLST_NO_ASM__ + if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi, *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ return (~l & (l - 1)) >> (LIMB_T_BITS - 1); } + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) && !defined(__NVCC__) + asm volatile("" : : "r"(ret) : "memory"); +#endif +} + +static inline void limbs_from_be_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= *in++; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restict'-ed storage... 
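+ * (Only the store issued when n % sizeof(limb_t) == 0 carries the
+ * completed limb; the earlier stores to the same slot are harmless
+ * because subsequent iterations overwrite them.)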
+         */
+        ret[n / sizeof(limb_t)] = limb;
+    }
+}
+
+static inline void be_bytes_from_limbs(unsigned char *out, const limb_t *in,
+                                       size_t n)
+{
+    limb_t limb;
+
+    while(n--) {
+        limb = in[n / sizeof(limb_t)];
+        *out++ = (unsigned char)(limb >> (8 * (n % sizeof(limb_t))));
+    }
+}
+
+static inline void limbs_from_le_bytes(limb_t *restrict ret,
+                                       const unsigned char *in, size_t n)
+{
+    limb_t limb = 0;
+
+    while(n--) {
+        limb <<= 8;
+        limb |= in[n];
+        /*
+         * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper
+         * to perform redundant stores than to pay penalty for
+         * mispredicted branch. Besides, some compilers unroll the
+         * loop and remove redundant stores to 'restrict'-ed storage...
+         */
+        ret[n / sizeof(limb_t)] = limb;
+    }
+}
+
+static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in,
+                                       size_t n)
+{
+    const union {
+        long one;
+        char little;
+    } is_endian = { 1 };
+    limb_t limb;
+    size_t i, j, r;
+
+    if ((uptr_t)out == (uptr_t)in && is_endian.little)
+        return;
+
+    r = n % sizeof(limb_t);
+    n /= sizeof(limb_t);
+
+    for(i = 0; i < n; i++) {
+        for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8)
+            *out++ = (unsigned char)limb;
+    }
+    if (r) {
+        for (limb = in[i], j = 0; j < r; j++, limb >>= 8)
+            *out++ = (unsigned char)limb;
+    }
+}
+
+/*
+ * Some compilers get arguably overzealous(*) when passing pointer to
+ * multi-dimensional array [such as vec384x] as 'const' argument.
+ * General direction seems to be to legitimize such constification,
+ * so it's argued that suppressing the warning is appropriate.
+ *
+ * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm
+ */
+#if defined(__INTEL_COMPILER)
+# pragma warning(disable:167)
+# pragma warning(disable:556)
+#elif defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic ignored "-Wpedantic"
+#elif defined(_MSC_VER)
+# pragma warning(disable: 4127 4189)
+#endif
+
+#if !defined(__wasm__)
+# include <stdlib.h>
+#endif
+
+#if defined(__GNUC__)
+# ifndef alloca
+#  define alloca(s) __builtin_alloca(s)
+# endif
+#elif defined(__sun)
+# include <alloca.h>
+#elif defined(_WIN32)
+# include <malloc.h>
+# ifndef alloca
+#  define alloca(s) _alloca(s)
+# endif
+#endif
+
+#endif /* __BLS12_381_ASM_VECT_H__ */
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..84bcc77
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+CFLAGS=${CFLAGS:--O -fno-builtin -fPIC -Wall -Wextra}
+CC=gcc
+AR=ar
+
+${CC} ${CFLAGS} -c blst/server.c
+${CC} ${CFLAGS} -c blst/assembly.S
+${AR} rc libblst.a server.o assembly.o
+
+${CC} ${CFLAGS} -o ctm ctm.c fstoken.c debugprint.c libblst.a
\ No newline at end of file
diff --git a/ctm b/ctm
new file mode 100755
index 0000000000000000000000000000000000000000..0932e57164693ea3a0459e2dbc8ae0848a59f37b
GIT binary patch
literal 190000
zcmeEvdwf;Jwf_k(gFsJI(6qinJ@$rc(OAV23zBG%ee|TFK&6TjBouGyWPXK0d7*)!Uiklsp%X&g
z0PTW*c_y{>^L
zPV;`ulqtPvq4t^grIHQ`ATY#H`lBh`NmVZj5dN&aw03H-Q6|;K9$I*jfiGXEDYJ0< zzT_?qr|vW+PVg@ez{M{RDC!NsL9qW<9e~?5GqENB=h(G>?+4&kS2L|P0Jpx4lri)l zh8M#E0jCGxmiJ7^2*7Da>|bU8uI+_R$qK+(hyBYAz%B2XmJ@)ZWBk8?0r=rQG!)7U zz>NzgsZIdiGmw9H0B%gaq>c{2!-4!o0eDc4Ob)<<_NF)hKPrG{b^uQMZ2#s3;C(C* z&jkVa(E<3v0Q{H${QdwuGXP&4fd4Q6epZ~{!|Kq^_ap3n3+4R*zSDHo&y)AN`A(BnUo7vp^PQ%u zzDVAG%XgZn`r-0^Bj0J7>ht7%2H$Ct>T~4%8otvM)o023ReYzZug{S8%lJ-{R3DP} zOZZMxRKNQm04HC-ce()eweo&0-)WNSYvlcGzS9)dSIGO%_)Zg4zeL{uo9{F|^^4{G z$9$*Bsb47X=ib~YG-)B;59#mpol$q=m5%$F6W_Lb_{gEBSDe1V@dj^316%sMg&@LL zlTHnvIA3tF|B$u;6*ykkHGJgmE9ryoe-;_p{oPw?!za#T=1TKK{y#|m0eAD!@1W!E zc2@5i;;i1+*-2aPY&=-f8wkn`g7lW!N#PRjQk`mw44O0JqKD^0Qij$ z_*4Vd_<$M>NZVqIyrKblKHymb+$}(Jw6d8V@L|P%zFP(Mq0U<7gQxphH~Fy3eOPp> zMLEoez2L*n@L{Ld+I%Pauu>n^!-sA3Va?yQv|L`9e`)@&@<-*595sT2IO?KnzK*(^ zqVDVH-NrfSt;9g}GO{{uWz20>f5*?X$^!4EUWMaZVs4Ad5Y$82^WpiNU=sK(az$MO zd>TOSrXM)tThu=V-X$WL9|Rng+$NReIElF-brdVY562r&w>>SSzBKPKcdx3G-=(1_ zGw%478Tm3OqCSxqDfJRdtz<14%is9^jJywu0PCOl{xII9`>CZazaoE3{uTLG=8vsh zP6gDkLqGnhA0MYMXkgUc8+A83?rSl3o#VwKnT|IylI3_Mk!wI9Q+6nsIf=HLb^1UV9htA&?E?)pFh2<@L+ZitB^xSdZce<J}1&86fQp*KhdPu&?MHFsa^n{sJmX6yPIV|Ux1jTy z)jN1PPvkRFSPNA*(@aItThsoJ1^=zJgp^ zm4*Ki<8{OcYSPs#{Hp&s1|0^*gOF@>D*(dO`o+i$l8!KFv(&dp(v+9{*l|~DltqHF zMV){|qW%dHeg!52;f?-K$+1`)3IV^{hiCfm0f0Z}!$W|x$vvt%&m$S-J4+6$8Y=Up zq2XoQ@NegNUctl=WMg1JHk`7?`=GVL68xmd;$yiY&cm@yJ2A(W{O=%bv-`BZxIC78l2lizqckzlWo%`#5 zBKKv;oq}T@p#{Wo1d12>l^u)XiII`?L`kG)zFXsdfz*q9cLCx4&p^ylz}OS%eotgN?pBpU z;q(UVMiz=-hI)($-Ac;p5XYMX#6j6oFG%uM^+q$YfmP_!!7re&9wVCpm0$H2Xd&+Q zu}ylJNo-5~X-Eoa2+N!Uj{9Z4`)~k**m8e;grAghz;=rrP8T_b^ z_xBp_ID?nz>c{}zMF2Q%osYGM8TuNmxdy9l?BHvBdz`1S_ApqPu3jcgs4cmoz^yNU zBjXKN2}|cIyt5@olltsy=n%SGU~0DyEe9F_70%VK(@DqrNgMs7zv`qMKWUktbQhE8 z1AARxETzfJ0s5`JxOm7Y4sp`ZI^`{S;YHx@-teM|y!3UYc^FYq_bYWV@2@GJO*UFil z^w4L*66A~TYbnW%jaSjtpLc$Vbj7>@C!-*w=`KMo2BI5s$^b(vvV*GD8!!SC!y%jx z#PC1#c$|gouTZ?zPLd+p;8I_`^b=TePpO zfO-7R{PhJtJpenr9SzI(iR?vOUe^;eH`OlY#b^lOPExr7{HPw)he@GGr4EaFD zP+oCh44r}WSCAZ1PvTcJ(VIT;^*ZA*!HkC^Su|q9IpShhf}ag%=z(}7%xuGdfw-H0 z7^Rs7io?+iwSKoWZBa#dS%q7rW+0ne2|MIFQuH&LK0?kp$mx4h=|loQdO%Mk=^9=~ zI)o}cuF?j#fZFhFY&0M}lMok>_<=6rJNH3YqKPpy?pxK7#2nw37oN}ADv)l}jXQgK zfD8SRNvr!SA>W1z%ikAv9TCjf#m_iE2EyRikV%vHVw1)3Bb}6vigeQd_(?4y-Rz<~ zQsZt>KjCbNUZ8Le?sHGNY3fB-J-4#{Ey+Ki9wvrDR0>4UTtmR>TlyI?^L2Z!nHFfz z6@G(QcbFIkj7b?#OV;H2cObl{c{rQ-rntCb-#w+Ms(ib zkl&-P?}S1t-*$69$89Jazb%&bdCWU`6q3Ws&Olpqw?B?d&^KugbWI(K_w|c=ND;c} zBdB{RFL8v&WXhdQPufKTn$&}!Ub~C1ZD{qIIwMSDr4)SK7OXn#>h0aHsT$U${MoBt zsv4F)wcan-QicTDd3VAZh9r#^_EST*dpAHGq0xl=VP z3yc9fE<&xobt~sU*1xL3yt5xUlvMz@?t83F8caO7r z2VQ$0<8-fye|nhnj;f3ztkTT|?goR|arXoFJ3C+})0~Y>j(2X0(|4cKywz#` z(CNF`>AtTL;F7as(&4xWIj&*8(yyHO{+`!&b>gZM_@>hEn)V`fo)iB7jNjkY37@-K z&-E+4VKsxgUw@1@?2$oTuRp>Y_Da0z=YzT}rzr`aTZQB&D{XirN<*<1B0lfzoRX8p z+bVRwDooo}a%yl{MCY7g0K!yB&8^ zEDWZ3qVl}4QDw0UTx?G1hIZ^JLPYO=Z`{C%#D3q9e$)@LBVkg;w_rSL;SuT|4Lu4>idLHKFzFN9i+L*h( zw7%hiXkrAc=>=&~_ri2%^=Dn9UhJrhSlWk9)w!9%j;eFn6{!B)EDYlQX^wkMwpeBD zMUDhf%Sw)0;J7{aK=K}6S1`wd-F5X!t>u}VK47sL3mlL%dHnA=B%wQaNmsY zP0I@}+v_BHH=~6mk-g_;rj?-MF3JRZ$7LRL;#XxJj0(Q{r1px2r_hFt1!u>)+*Wc_ zEYal)aG)mU_WUC1^0}IznA-(`p+R2+@$Bn#eQ(HMXD2aoEl4fsLQ>Z>6hz$~4W?Iz zI*DOm!f>?hQZS*mD-7dd8BW?hrz+1dAy1g#xW$$Ule2*!n&|Ari!u+2j|%O`eub(? 
znE+tSihBdaoOWqeEHQmD8+=(If*zytU&%w`3i~z1a`(>|7xQLiU+4`wHRkSDZ(NcV zDopz>>Ww+RVg;~*{}B8pb?I!R6B}es3;C6{_T)i2=9G_2@Q0dl{^I3F7W1@ zmaS$0wUc(mMgQ$?FTcWzW;2Xy3qD*E*OYtyWo%TaO_(aKVK|`d7|^3&){B z&g!P@yahL1wcxz*u3D`!7o0bN&x{4-)vb77&o0PZUH&;8}%xvKUwerGqNvXJ< zqb|lzzm=6SZ<@{))B1Z7a>hV_H=+%@>c?ZiiWP%X>wX#Y{<1{aT&Ipjg5y2EknKTj zSD^~i1u%;6Z*=YKjm^W;v^A8OCvaPL8T^X;t>7VP%_bKv6wdn=(|Lz zKp>boF)wl$_}9(xj_ZU;+C3;7U4EeDJN);hOHGAd()N0^izFzZXuL58iU|K2a-vX1 zrEl3zt`svRBOXI*z2o+-GCk3=bQctS5hyH5Vh61e((3gYyaztvz6FL}%@!ENuO07} zKz^9quXd875S=}2;d++vULvcKSS1-M=%9LB5UvAHUaIII%Y2PtcL&K6kt}vQDRpA4 z5g>opG;`!SFuOMzPQAUaXX#GID=l)on^NfRKS=1VkGY?zBc#d~q<`DeE?IWnMozB< z;V+QzcIcOERmg@cBpz8>w?VQz#wBe*`-Qjp+ zBE^n3KSD6#v(cB6qVcKY^`FoP{4h0$IYT!0-a_x|pD>V`)XP$GX`}|#=#nz%4>Iaodn=ncKT?507VCZ)VFh>JEtd!|*`0 zJ6Q8TQCo;)GuI!GE9N~EsSW00&@U_Ij)^!i(0{#TSsKX+)U-q5f}~@#sb+WUR7VF+ zP{#ntaz`W&tc|*_S!zMdURE-kl^kK{nAe()bY{tBmR;o}by0*9JRb+?0D_T`yqLRL zy=Jm3X^jO4Mn*E3YN zmN_$}7Vz2Da%>8+00%myBm~=^Nab3v{U&L0TLPXpNkt|pK*Ie6` z&Iu;TG3Wveji#UrP?uxM~7duikBrl zkhH1eq!ma?GAY2lM@^YL)2aY{x03c3IZmq>!;UO>EZTOkXOa6(6Ax|%ZW z@NNgpB)QNIZEu+*+oAwlmm(=S`~wLOrX&Q~^V^ORrbvPvE&-_;5%ev`4cX}iU8uTVH?rtK(^HMP5&bO1xf!RwS0gJ_loVkLkw$b zrhupvn@sdYQZofGU1%~bZj;Fl3B)uMX)B-4J$BN+BkDbH>@wS1l_Qj5;pIl$oIHmJ8xne?+`z^BR6aV z@|ZpfxM9x&O6@S*sDOYQc7I$aS&bS{0&`80HE#hyzse+K8H9m>GR!1d^A|s$vwakhgx&KrAF|puc9Ti6avJEs z6(-5HDUkGtNwV|>I5FQOStbP-JH;ef(gR7q>^SKRBqh5ez`Y-tGI^#|0s2}v`3+d~ z0F$;MDTyvX!fTRXxfc+x=UO2PkZ@l|Wo|ZQ+Tq>!R_Fq>IVQ=rD8SbKQj?v91rmCv zBm~;?E#{`}DvZx0!Rm#8)NPQ2b~vzv(_FCA(iPyqy(zT>x?oO9LV&LE9VJ{~YDqW! z8KCSmla$pqsdoyx00H09EQ08E1hE!L?Xsq-ql9NWO1M`i1iTur|D#wLY~_YcYm+V2 z4g2N4NIzB5J9fi3|DYA!2#!rD>r8nD%!X5M+pNPh(KFc!yc)vjNwri!im`QTtwW^{%e@ z$e17h{6xpy=Xm!mgs0Vd4Ng|3v5V>S55mhfIPv;4Qi9N-iNhBE8ZF~8W7#?>9sl-- z(v1>fs#AOFb?w~G8d$hWp`eSQAp9RwuxoT{3cC9g6bC6_jiLkly%!EG@{Sztz(Tn9m%cC*oOyMVt5lf zN8r>s-t|QeoUR(EN(a%d47!lT$bzj_&dQ_!)jQttcWp)$F2df})62-=iNB$^z8uLD zjP>FYW4XFTb(4gpkqp739QOlpiDj)B8Jg9}q@D}n&?p#V)|qZnuV2H2$0AFR5XA}s zOL8|L*Tr{Vr>mP6D1`7~i@Fbm=oJo^LK|#39bJU#h`Dd6=TVq>vLw%|wo*`oP#vRp zmk1I}4ls%=hR@)}s~e=+rI89G*kaq+h%BM?>Bz!B2{`^pYf=SL-OG`pK%s+b*Ax=* zawI2NA~AvVp`wE-!;}dH%4}CFP)5@uQAMr_VH{_u&x$ptLYQ{Eh<&b*21KO?T=*8i zU{+QgS4-w=RpPEQ7`#5e*obr>fH#kg0afZjpm7jVm3bAp4UROap~KQbFJdbxP?5Bl zJC#(TS=gju;CoQn!Pj+V&u?dCi)8M?3Cdy)ULun?^}h3gjP}tYYgXT3BW$u!Su#kQ zRT#9q$fF=^DFkuA|7_6sM*x2eYre&PGe%0yK;NW3z{WK8>|#hTV#`wR81^X^$D}c{ zSwByl*!r8)9Mm7f7)XzLm*!}sCUXMVQfN4Py{6%<&q>3aNFFMXo-lKw7L^&m?*o67 z(!%T*`WelHY}Bc2RAKAX+-*_4*@y)8zs0OR8tkAn9Cm^gXYUb5R9C|>ts7C^YF<8LO7L@=D=7CDW>W|hF^C!ScDDSBIHTh8- zU1hZdSH-;Jr@R9UWPXb}I0=<{Lvy&zB+ssy6+r25S)@sAMKbnML@<3d%V~~_6A+Nf zbxffEamk!nm*PNtAXk(6lg)t;yyVz(p%lUT8ZZc?)(p?K zQq~2eqaN`Qn4Y^6fGC!+3elb-$^ms#lR5@?3%O}P1Op>W#4ewP7TPj#CJ1U)ccVI% zL${z^+GaJWLBObs>YxG9rAtBJ>K!qtjwT^(HZ2G22i5~KqANQ+mKI8TZ0OTqTl8gg z5>*4Zwq2cdKS@~=^JYTd_aFTx-#}xtnu`SFmgZsn>1@Kjt!fsnnH1UGi@g zEQ$3?U2Hd67c7>+VzQ>!;mR)ez7(}u*?qAlNTV-g9C+uTBD1nv1lx*#4MlBMcE8t@ zccm#`3{yU$RV@Ki{{CooLco+iel8m#rW{qaGUfg$e1jx)vp)ID=7S z7h8c>jTYddNCo+ovU1AQv!yb-AtBXO;B7V7-Ggczmj`BGDYRkNw6-c1nYj#O><}_j(SdvFNY>0|A8f$`JKV6t5zao<~{qVq0ikfi+D+3y+8SxH9 z^{s>b29ONGpcT+_Z9$I!l;k9+0~y59tcJ2GuGdq?NoWrgG^=ST1xNv*naC`aQC|t< zd|Q_fITMhWkv&br;S|)MpSKC~8qIwq@CV z*+Xca=1sGjjuf)Wc$6*be>7Ixw@s=)NHH32(K?EG)YVUa8^dsQ zaj}7*gSCj91|&k6$oV)Z2946%M~yORAIJI#jD7qi08!n`h4AjlK#?G^k{buJUs_wq zk48wH)=J)iO09KGTFJe>flXS;KT5%%m8_w8FI&qV=%{xT)CzaYL)G9!=X)H+jLJv- zi4HU(T$sPSY8ZhLW_C%87)ipeERv16qDb_tMLf+dG!~~zj#biVxTB1GN8JrXNQDEI zV&#&)jPXmBjKE2ReCTtS5%0f`a&1wno*tSXjt(TaF*!DN8Mp`Sa 
zK&<-tKtLd~C;Y589dLLU8sKRCJG)2(aE-90MLk3D($h-qkXxU5*3pGei9vClO(0wT!*K1E8$(+$Rs6HZG^FiZ0oCv(RMV3q) zm=&Slu9ggE%1Uz1B+@4BRreEKwwAITlzZP%!2a{7`-bEFElcESBfM7C2M7G3-WAwH zyR!rlmnOtan5rd2Rb%o3Uk4kvzHKS}G`LTSVWP4|jfng6tUHKp(Y>+hR`z;jmhRGE zuh)UKAT7~*wc{>YD?M&^gYAgA+jM1=pwX<)TS>Gpq7`5WM$C4VD?9#H&cl58MW|d0 z{;>v!Fr}1frODkxgY$6q|HZ6iZhb6vWl=7`8AaCdN_$&Q z%cvG?L4n(hiRjnM_-m_r#)D5RoB6OuZ{B)@x^AN2!0Ak$n{nWnMX_()>ow$Qq@+r%p)gP%|7@ST8au~Ve~Qhvy?bNd_<|AWU2gy0*HZcc zWG%;8VW)6t>yyN?_DZQ0=X`@6ArJCduhaB=>NdY_r8^7V&k@B-g9C~Q0Vf5$Mgt)x z8LAit08@}6zhn^{%1U^+A~oGg8{wQTB;jU8;PpDelspS1O(|oJ-9?oZ(neDj$GXgZ zzo&Ks9>e)!h+fS7My8NW>MIO0?9WWQ$opyJLT}{BA#SS1$*(a_p`B@Kl5-R(BHAav z5FL{CIZoB&g#!e_BtVZU7$MeKC3zgO;Np=Py5{Ln2%^wxJIdcR5$J;+Ea89s%${z6>w%A9{o zj)BM<(6|Q0>*&YG*(#rG>MDdf+f?^5^Z#5(-}P^i-iquJ;K8I;HPUsBud|e|>xd1b zQrK`)QOXGXFB0|H<$fbl_x)nGZ#QI+u9n@tnP(%X9y7ZC*}I(q6vbwv1m7QS7g?*q z$f4daM_&F#xV`UrBv8WbQaTuJ=ecleiw-f|{vS~A$hoa4NQT?1F=CPw{BKt`u(Bbm z8_zt3o>_oNLhIFyUz_T*z)d##nF5CpSx$qCk1H%Q2FsbT3 z1Vr#U)NFAmXIhy%#Gz+YZENbDdIi+|L}M)Kcto^Ep3E0hNxXvC-lxl0R!P ziQ)J3Fym_{(;7)v()wW>99$*iK*CMxh@T^qu{szB8u;B=0xW_lD)NDK0u0(3Kk0Ry zRKaneRlT3|cb!D@f+VprwwB)`Nd}u7I4G+US;io|2*ZFn)H(-cjnym-6UqQ$%B%ye z$&uC|GKjbY&g08mTcLXT;MzBQ7V}jCygmgIL=)q3Lk^obvk2k=e-%!ZdK^W?gE0rd zYMn&v-6H3Vo;6N)ob|n0^suos#~VeO8M$mt z@GUDzjwSdActY??QwSa`p)C^pr6@=tSe+^aQx_Y8--8F+hTxo3g1PCw4Z$K4VxO#< zLl1xxT0S@#!Mx%xA8@f@Tfhc-{nx%s(lNb(`y~@Ai68mSl#Pkjsb8d$mgxDeb4F}a zD9On7=+b-#)6vGB?--85H-an_C6$tO(2cg>*=^8Ur}n-NdXtWX*Z-IRu`+$z{wr`W zs0}o|>duQ&`1rpj76VQ*H@SWoZ**h={UxqKig~N_Fu;QCos@aq?j9N1gjD0eAr2o> z#CAZ$YDC0bgvjh69?S$C@16>XT(G$(XT$J7icIW#KcWsXo(oPYQJFYM{QBWvN>+(; zp(8nNcAUZdrQjq64nziX^JcXPz(5^Dt~B}Da1gu&m*jW zXb6LH-hiJh=L_!N0T6v^d~@yB1z9ya$f{Z}D8{wzJM%EdIb>F8U7}evFV$D`PCoD= z39+oY3-+)>R^1JAO`b3$Xjt`#W>rxNt8VX@Rj1M3dLx+Mux+$YPXYzL3r$UjM$*Hgr>+T6QeDFBnEkj?ZK1rMtU1mc2fAzQD?4 zh*XXst*~w;k^Q}0Jp-tmz}2$}Z@?JT1(x7Q(HS_G)qSHs89~3l=`_D5yA1liDOa*# za)#(^j_JAKIOj!2sk#*RFSM5O?z>AOX2ODLaq<{cHy-y2=m62{Th-jxaBsl&lvR2v zV!3ndE$=dY*#@E>*p7i9O(ypz;HTH6f``?xoY9Rj{~X(<1>gdrA?tAh(@x1*Z+T02 z{$fa;Q&mw6yX2$GUY>)xh!CgVbi50fDP#c_isD8$ec%l{1294msA0%Fz2OoVoSH&+ zpNt5wYO5N-=m=c)XxYw%M2F>zQcU7MizOlv#$vL0co9#W-(16Img6}%5RTAx$?05N z$YRv5#=&vS#8k8p#Y-}9?C!XKuty^iY)olLM9Uh9yWthwX&_;yfsS_#%7|5wyXLAY zYW$|fE2K4}mkT{)-J0b^Bu#U1xW1i9)+PXrx$80emi;e~lrI`OV~w)VdxB+S-gQN< z;E5~My6kh@F5k$x>v)Tt-NRY|4%g;kV1jn21w6g}lSJqzFmhAC1wY{%0%~li6LSZl zIaP9|69)x-MbDW(vZr&-u9wGJV0%&*SF}1x;AXc!$r@Qfq{ze!;3Gg~>Fz@eZ!L>* zrBX)}^g00I7XVujBVD3I89zea9IU;VT~<^Q8EPKkIAR{v`)Q)_^+mB1?M5gho$XzLI5n(NNeM0yMO&SRS>q ztaZhjrIakh=IYd?T$<%^)LzdBkc+e2WRb4s!l3Gudn5(%HqkB<0jICY2p{$k+ycI_ z_%vNyozG+pFl?D(6s_+E+^B?XMVPu@0JVmVRG@#a+HyJvO^Q;K_}cvxBlp{&rW*X* z-;p;lay|SwoE}ZpRMlc+u^+0u`i3Jp8h?rFIly7!;T;M376smqiIz2NX@{wZK+tmJ zwc^t84?WM8U?4ZE)9H}PYz8AWL!2kT#aBHVNfKwZMz!n7WUkobqjC2ZxdV2H4yN3| z0O9^0*lr%?3?xc#bO#){#eX;KKI;^KQu!2ULs{ed!Y?Zu%PR1y#e>GPUGXS3;_(G| z(yn;?={a9KMk98l`}a-(CdK0f%u94iiKJNZ=*Ia9MxParBAw|XU0l^+Zr+Y~yh6Gm zAr7R-@fPY<>{Yorcr9DYHU^BR$Vs-Som4rw{cp&gShSwSQ$?jGF5kfA7WBuosPtWq zJjg63lI=-dL~l_WF()NQZt;TTywzc?y$`)s5ztLqKvS#9wgTE=HP1YI$c>X~4)ErY z)~#91n`T;#gR(}Fa!3XjOZwX2nn0#E;25DQ#o*F5(BM9)4KD9h3mDuYvRw>r(Z6AE z7pEB9KZ@^JSp6LXS4M z6{e*yYypEyXa@$DWnG!3pQ&bd?^)CnQpRrTX;z;CNE6L14>plzcLU+t?2;MLvh`$! 
zvAk#h?=0{A9AS1m__rTIJJIE_miH4}k(pw7%No&e@cKZ^dt7KiQ@p!K+@Z~4xOFch z#ec^d>$%T>nX=BqSoK%Jq217mc5>IGDh8lOy~kt`9rQ_NI4pIl3Yi^mzP~SV6$7)btkYwz4B(}mRI%-f@GcULV$E-5OY>`nNOLqUx57tlnJSv zR4~)m#Lf5@>V;_`t*S2y1}!))MTK5oTZ7I_oouA)<-XN2Pfi>cak}%SO0qtn_fzcN z0%{koHZ_~Y>6rZZeHJq0G4jqEBrAoa3@*Z;R%f?&LfJfP3ccdWp1S0Eb4uDzZqvKUG_uh>TY zuIn`Y(O#LZsZJV16OEaTwcL#`wv@RdVI-qzCi^>*@YFoQDO9XGn|PZ3DYXd- zK|F=_+Iy+i$G~v>NNvQ~VI6hmWTeSeO7O(~!9EZd=8%UO=7ojc%Mo30Uld)P1qu#j zfKpsInkY=ZlS~GB$3OfeyAN(2TsZ`8FtNB0GkynaREvR}H(YVjLN}1x8_*YTssEM? zFGp(80<&FL55irli7dwm$+B=xLAI;+0j474hVVQ=OK(va+N55RM4r+_8Ex&GG8(T znQpY{F2(zM+UdxP0OmxJ9l0dXRTx}N>SvP3kStm@+-`Bk&Y%``43paQVp;&|_hKV9 z-?h_=^V;-c`%1OlUSv;o)Qj5iVM0qi>bUPk5fy?pD-HxBy_0ACh3W^>f+qD-8T>d# zAu%#6&%ssNs?Netblguk(J02eKPUpR+}cuX&)5`g-o!9zwC|?q>Ye%mEKDd@{s{%m z{s}Q0daaznd(75~*$3$IUVHyc0`{nBnXIQh{F|fAn>GH;(bY{uWTzx}hShFdCWnaT zSHH(H%UdVGP3eB15OIJM^!Ew+{pv2Uaz;0!>O53^ld1ZsW9$-rkIsk#rN?sr$?$Kb z(~S5x_8Ro9bXGU`_0lv;y~nZMOHpsM`zOIVk3R``wH8pzkH$}`9Cor)OYN{ng+Pv; z!Soo4>ESxm`8k7eZR&hlTnMQXd>i20OVmxO_Qz6fz#H^>J&grEnmt+;hjEr)^e1c8 zzc9qX2`H}y%1cJhxS_y1c_i?Ph-_8$a1&$RwHOB_6ccO{&^D%7y~Py9Rv|aC_hVHb zQX@a##CZ;+LD`C5(yTg(&GQ$BwyNJTGva{7h9}YuPVPJG206VY-LP8DOhFLQc~Uqle=_km48qNOE-W4F+F&{!`2dSw|j^<9NO9W)^G7 z%8@3R;IlNT#*c{Xg?UmX&inu;64fSv?F<_+m_=*nQGf#@7`_rtUI`TS)FKsO@Bc4qbWMY@uIlONp8~m8qV6nb>H!wT$6Wl<`c9Ag>qLKI`oGa!C;<4r?-Zh0TRKqeM!v z$yvJ$_ai6I^)Hs34@I)jCDKB<_zCh1i5Nrex>a3kkn;PfQRoP=$L5YjM#tP&&2T7? zjCV-);vxV+hi&h&!CIPnlF7i{)7voVp-ct_)5=aJ1BC*|h73V)-t}TDW-QbZAOD2B zfZL2djk?v6s~|EP{;HE;>trYQKTIZee6p@g)|G%vVWt~{=kz<1t3q;FlCwavX;U#X z1RXHrTVr%&t254|EW9$DGG@H)z3K!F6u}c9^;U*DOoNyBc^Mj3U%)W~1{oNB=5+O; z24|rAG-6Pr)&YpNQr_Vu;VyH^R?W$6fX(VA4TQKdA3W(gHQ9n>i|9a314bo_7BOEQ zhkS-QCkdWRup%3730m{cuDQJe%Or6qxTK;F;kE`n^T>Z0|pzgbLSD36|e&n z-#3s0L(OXPJ5=VNw^?N`!GMK*@$SuI4+OX1tSQ}X)yG=wlv`(YM`r0x12>y0u}57* zz^az*IHQtw{YBVwLDhj6Yf(OSQBXEP0LL3E-uAXn!()HrPwlKUsoN2-W&9yfyfF7b z={mHQE$XPX`qn)7yUn7TaXvyMN_1+K+VCD#*36=)cTo{l@or#}D(&>LN&N*`s0bXh zCd@+`d4+aX`j}kG0ya>_Vw^F`ob)=ut2f$XzOwTnii1r1<$OUlMPhUqK!k1JdPxcp zlu#4cgBSymP7s6~?&v`d`yUA{@G_?IYB|h;MdiH(he1w7Kp_lA>!5hVMHxxW^tRC} ze?=yNV$Ff0LKj^`D2gp)had<`4+e{1$$9Tg!To0~dvtHa%-5;Q1^x^>QVMB!lj^Li zF%}8^+N5?fnt?iae+p*x$jSxsd2vURn( z)xH`MmE2vUlFky``@kG&EFuwtxy}Hfy+jJ~*v?CO`R&XRT@y}&xI=vnFV^vJhM77{Z-qhuwZ(uz z0Kjrzs41}4q|DPPEN4>wYEo)-3bmq1S;&;KwWL=oiQv}Zvc}OeVtC@-@or^}oYChH zi}(rfKCq!;1DLO!HwhZjNs)Pvcl?pkNQ4@i)zwF_z5XC+QrGuj^Gx_nqo7Wx6vnFg)gl(QNS(UsNX=3i(U=kz z*>S<4IMgIk&Dp4orkJh#FQ!UVg_Qh3-q;#5R?;oN>Ye5zq`hXWi)+1&Ae=<>c=eI5pw2(PB~^jyQ*jzv_E|F zli+}auSJl@9e6&OSmTGfz_|?6IE@{eWk$h2u>`}f zcwNAlN_q@Sf*C7Onp_d{1PWYem?^16AzpH@{R|L#cG`9)nDgM()nmA?L z)_xxa5ex`gE4$sqo?pO_!hHS>TrtdkHjAXEu$3-2FVmEpQ5la&VDa`^>>*l?S)+)iY*>SrK+`Bf=MDHVX2vp` zdXK|?>0$C_7DYwN^@VzlVw_{H{&HkEuuJYe%0*V$T1W9wmxC&9HSq3f<6q!b`+$Gp zpXT+CZy-9xF2&5HQizP3n|$$=rpJUzj8qaUM>fRNZB_SifdNV#PPH7_FklWSfc+yI z20TTB{UaL&e2E78M>Y)j7X(AW-AChGww5#{8JgU7Rn|zS433xVs>d`@&xSa?Lqxku zosTa{nBDapnrs2BCMSw|bjx-WhQ*JQocdc{HbmU4Zd}W@+{<$KHYAFO#9{*RmKuvB znP?(SOgK)5Notd-*~?PL#}@fR8Ma6I@rEA(1R{fzGU^i0o0puFjWD@5DU$#-M!v~~ z>6V?~^+zsVAQPOFiQd7fQkgtqzM>~(oh9cYnUuxIB5@j^g{(8Po(>A;E>-IN9}?+9 z5l+e^3Tv~D;e-nsSv1mf$VGxUWiTy!L<)i$&Y-DwQzl?#k_rlJDnb`_{wBUtox0N2 zj46{K7{jR-R^9xm*C2z0Q!mLel2Q4$gY&K*8YG-|86+@^mh5j4GE23mOp@&%1Jx>- zvaQsEi=h^%FZHAbQvAg5fgDb{?$+QX0WhW%b2PLz0Ogcoq6Tx)rE!3XYB+(KgW{o} zAx-LX3F&FSl@$Tlth#9+We*^6V!mXsM%zSy@|deugM9A^vnx)zHYCB~@L_rdP5*on zT1=#)^`z^eBzQK#MYLbH0cgn<59arnnmIT}WPy9td<{W%iBO?rhAI}=N=~{g7#zAx zL#b&jlpH!wgMIgc{iRMN@cSoS?}O7c{n?Jo?NB2TSMbg+;$T9c^iI|tjJ|UFUVEEV zVGKErGk`2rIK2M1FC?RP&`Zt`YlizDI0#6BmMo8s63|!O$i^}B 
z;m89-xqq9Wc-5&J;phs`9^-FPbpooF?P>5(`tlPHLqwTi$Us)J)ybR`A{L8Ywk9^X zVV6tJ?|Q#%$zO>Kq3AoGiywjSFdn47-}b9c?CwscpH zf@f!=$o!Nqrd@OI=TQQ;6d#>=&v6TvyQPZ@yz}8)yj6fu`8(JqHFx1EHVsE4yU#jY zPMP*Y!uI@EM&VP^;bn!@(RZ5Qz%FN|Sh!%HlUuqx{M#$>D%(v3O|9b&t zMw}WN`WUKRP|s!;B!)hwKO>tMx&(WQ@}jG^b#)R$@5fGAbR4jU%gMlof!KM8=qN(S zFg2ys?9>@>h2Uz#-yO=Yf)wU9mHZ%@=!S)tBKX>+Z#G~X?c680x|0}%V8cPRSRzjQ zK>1_bc$9`3`^|-uJ{12-7dW}QXN*PGg#zdYZ(?7fbTL+Uku_1c#7V1ma@U08BhU}{ z1`gpkzq?t!fpZpN?KU9ac=U6hs2_(+Y%Yjej?K%4J6;!F90#H+GMs*EIb{ty@$I{@ zZ>nT$zQpFD@6<){*(p6kaZ4*2PEBEF!{2Ogq30xw6!zYdVU(PGr#Ig+O)~)z|I^6@@Z!{4n!S6*Cy@E3*Q?97kKDPeW$?yiQMq@5W{ql8riZi8;$&U``# zrR{|(cBXzvcE+EtdcE$PPwM2lflA7IaIwuCr^fufNnNlaU}`uo_Du~pY8q2>m^L*W z&74RdEoH>i$V^&anDGD8)ch3}@O5Zv#(^x^X!);9%{}0Qn3|nrNK#XG!|$M}p=NFP zMO#xdy{)M^f_k+~a~Rg-Wj(ySsn()fI$J-sYGQSXY zTLGB0il7!t>H^*X(_$uK(Cc8Og!Jw~X0j+pn2o&qk)Q^D#ldghNW&d8RLEA*iUu`8 z$}f`g+U?c*V4U0!+ua9)Wp$}Zj7vJ_aG<0~#n3p~JK|@+G|$>fklw7G#T!r>vG^e{OFreXb2}52Qt+p8>=& znfWgOOl6!R8R!?sO1YphJ_u2?)(FLYE_UNwisH#jN^iIsZ3FYJJIKtk-l$C!GAVipIV-jNq;O!OYhkthrqJzP;`qJ}i!I(eZDHCCeY$NSv2+^O7U+j|bQrO9g-1k@y%sv&jKNUirg`Be6%Q?MUSJ z#canlsqQ8y!)~_2hU3^DjMZ?Ph`SbK`5A(P@o{7@R+TP|TFqj`_n1ptQHZofu@9UW{8%l={44-KZxXsx2*3(L zQdBccV00vxJ~pcvs4|%d5SRh`%C5Z>N#tm;bS+uM_DOk6-I~+K@M?*YyOD^@B9&uW!DT?g)amCqbEPWezpZ zb&Wn_v>9ETWb$Owlg}gzTC@59KuNx&n~ViQN1LgezVwgQ^dvKYN7KlHglw9fBGr>% z99qF34Gv&j+Ja!O*2Xw)7)@Rlhr5HLusIwQ;^RGynHjnwk7EJfHOC!qSi0n_O#C!# za^a~neN`Nl{XFDH)Q{Os43Ha-UuWHIcOOtzi)h2MXv3j4^uhpj1_N@3X30foaO}0# z8Q0BjqcaXN(8|_q$C=AlpuEL=4X{|6CW|4{% zg)tjcP`qK8e0giDFz#l&!YB;T6Hpkl8AF1?s6crt47LW+pHvvLb{hbsSRA-+_3Sq7 zOebY)Ams$Cr^1K`&WA*aQDM-dwUxG~FuqDgCsGu~cTJ?}j>uw^??7SPp!{(WP#EJ6 zXQRGPVU(fDB=IQU)&IRR`i=r@2Zw<(@0IM*< z09Ynjh4E1a`&%$ug`udfQ$d@Oprpbms`DA6yX~SRl~!Rqlq_f!h6_-#(Nx8?^6H@ai~7mH?wY+`?s;mx=+0 zzC44Q41uOr=+=j((Cu~VL+gQ|mu2v5_?#`n-se8AT*tBiV*dw#=7@~f=W&d2V;nx_ zCJ;F~Bg0VysaiIM3@s9CYZFOJ({z%Hb%ZMQ7#Dcl_oJRi7PO=JHcHfv_x8y5Uc`In zq}UrU1z5NW+!r1>TP!VV#h8gR4Py|@ZC1^2ofupQp8COxsDyg!)DHtq;C!vSw60mD z0Um=AnFj*l`HS_1L2OY^Xx~%TSS46|-!{t}VS>OadnFk&LGQ%$228u2q~k^?B+A$W z&n5RLU?}t^VuB~TpP;8Q*f-~(Cc||)1de= zN0ve8^{KtqlxbFdkmz`e^n$bLPU*|IYOYftYCeZfYaey?TLSuCJYe?tb+PZHXHpUe#mtxq<|Tc_i~cbcCS@% zI}V)zxqT3gryUZ9vz5S{2~n3GMtlIXe;G-10jy|4E-o(8o%bYWjn-SUasgg7sU;|A zdsQeN`%gQPN~f_bfOYBu0R1jFAfrY!YO`{xh_>r^2kNNXIRXWUqlETidY zuO z29-b?fT$Lg!wD{YjT)53F%QFLug^ea5`n>KYR*L*iH_K>lp9Rv8B7>A64KhN`qJ7a zusIT{1?5EG-e^D-1_pOY$difDQjg86;4~0O7QRQM-Dr_X)l476gtqg!pn8W6z%tdT zvK1n4{>~f5<3<9UbIIGgF0W)t{PQ8Q_J=dAm~P?X$u&hUB7f-cbXl9rtttIF@GUpN z;gr|&N^)*ltw7H9kHjEgi?qNL(~%X4FGQeY-Y-u>T)^pvd)sPJ(dx8<@Ur}tSoaF- zFTlj6;LIW?_nXp>W8S&u4wv|vu?{|W*Y6u$peilAY*vfYeYX?;l=*w+#B!@jKXI1B zt>oW{PT%>F1td=P+2bZguh)!gFd)3HJtp%pJV$l<7+YYG#l#?Gqb@hdoH25JPRG|b zg8kU4-TF`Yf2r zxsab`}n@jvo3Hps?|HY z@&@wL*P)3YNd_1^8{OJbOa7X(4urE!n*bEissAGu_8k5F!{+zv7bV{sb_0w!L`y@R z-A$zL>Vh)hoHi>;gK{RWcH7M8Lb}79S*bmm2r4O-?sYVC*Y53MEJ#`>Go zHVJnl-Xw$ey)77)xWq$7G8zJ4ohk#0sP{+(anqCkcW#Q{HR3S11S z^z`fW0*Tz}>u+JIQ}ZVL?VXRb!2>%3!IK5e-x3oVpMjB+o>Ov5Lkmw%Gs5Kmz8ry4_Q`Pz{TtBNTP&8+so`dIBx+j?bGY!@;PP zw-MJ(Q76_%ZBm=j6tTDH$JF>ovwD!>VARbX45W!GbPLj~HM67UH-ORnvb{FHaQ+fW zz+PDjoWySE9%h?G%aPJAXq9z7#$!loE*SlF3CE~}%^dUsoK8`2-@!){Zdf&V3d9cyox{C^^Y7y*NR&$!>e<`y9!tEyOfTVQNN=-?xz9WS2)_OqbIW;MRKp-_lzSU? 
zvAe%@qJSIqYIi$sISV&D3c=zKIT~Mh;3zuhGxqh2xIUs#CusQ0Ths?LGaPzk?kTeYey zTCSeryNjLF!VM15^|;pTC2$t9Qm*e!EGJuVjY~Z|tx$e=*`P>Ti!;8(IcX(p(s;(V z!E-b4#N3r^j-aa#vAe*9sJqrl+&d3&)Job&*WzpOdXlijr!A~;(z{NqBn|4nfm1vS zAkajK08hfTUA&Lqu;0mjm|b1cn_ZCb$k%Av>)ahWuc7=UHrp7N!S}LULt1GQvXK(U zyZBqDN~jouY|EL~iGS0jWDq1wNKHtCkX?i+_qCaSi+N{$6kWZ&J7AQFXd3Pr$8jOI zoGq-s0cJCR(Mn-%WjOv23GoYyK*iE_)3^0jEa`phcxW zVmKtzr5DH~euA+8AH8XZL&rrELoqS}EGqc}HrKL?8Wy(Rm}9r5l{Mx#GVpMa?!^g=VJVumeP+o$rlL3EcZ@Q)}(I!1D?>4S;(ob zo@^`S@xh8GoLMuYI}LjQr(!AYuiAIQouu4v$$3 zs#am14|@5>-0!*7<5r@kf)~AV z-k*B0D_q%n38*lH!zn$d_p|npO&+`i;C|EG4y)Z+GqoRu95{U<;=(dA=gzA^eyC{9 z{Xe>njI`?%O6Lt!G4dN$p|4ldu>p$?H59sUF)omUJb;=Rr^Fqvc#j<^cg6`lrnc=c zsF>Ls6nZB19L%VOVTU7VlJXCWZIzEj*~|yk0zKVNy?qu_lidth`S*%aV{d``&DkB8 zZo8xP-j{4_LX@*~I4k-Cy+yr*GSFHq63yLOS_6+D{#j@5vb1QwcT1wYKA-0tp$afiCZivqsm&M4 z^_AFF4lXz9yLlBwRBbgmw7AcE5UAbl_4AMf)iDN|*d|O3Cw4 zvi>h9#=5^3pBHN)ACuzZ3Sa=mrK-3TA_Vc5NoFSb{@53pcA?VJ7I`6=?hI;(GqrosMf9O@%!6!LfscYhGV{9we1A+u@d-Z z?`nJPVrF$vtwhc!E{TF~wymU(>BE0`nil!wS_3iYPszYt;>Rr1eMmPSjYFSrRanMq zS%I;o1q!!6@f`wa-^=)(m3)W(YFUwyd>@B*T&bLHKdDmI$l-^|$V5Fbp<(J&=LN`T z4n8-jDJW&EN0XX(ACusF)TuLYg9sW=6heB+3^^Mtm-xS2*%ZPkP#@PJfB8oxCmTEU zI^Ur!n22fI`d>>lg3MaDoB9Nc*OG#bmv~aIR8V(7A*m^Qo?+7L}Oe%v=GhDjr8U~G)pZ8 zZgaYtD`R(^2gINl8Jg7(k;aLPj(-C?O2hKVWCL|$hC!P%hKRu`5X2cJCl;A1ilT;E zR0khhGQY|EvD%cDKAg{{m>M6yl`X}sQNZUn_Q7*aW1Ce5%Cd35nIl86S$(phRl7*} ztH48k6rn3|g^FdngvH-B1=nI)8sPZXL%?y+o}qp$M8K|qh-77GaEmH#TUm>`0{N3| z#V!?DixB%>ggTN*=x{Nq7x+*UYsZ2)gXlkN4M*QLspFDR=0XwURd+^$U7b1@LmHOP zTU0NwC|MYDuY?OI zxnxc(A%|^^$%2A@tJ(;Vsj0s$kPOf~&r}#LC;R%NCud=WwYZfi>b^fqxFTVU=1H9MRo=pQle_J{hE@M?mw;|7GcDuWT%q~1mJ zCIz1C;1?O#35L|FUu&H=8B)8W4IXMpBh1r@2|bo2GN~sJThtHN8vI~`41>wQPtDA1 zhZa&J&qQ+lkkDgqYi{axw+^Y@ic~8)fmY-GtBYj_n2_2l2tIWXQk(EO@#8$dje+TC zU9(DX7om}iK$^tUrptmO7DV)NF@_KBr(%H1G}h*qadoaF%$FW(8&BJ|OoXac25pR(ec>kMt8u13>X<1T_j;E>1P)Q)3cKPLG z8{=tg5Xa|X*C*p?c_hg#7kb-^r?DpOd>_>Y&nu@ica|ji-^X?Z?wMYw0IikR&rm} zzmBJI7RGOwB;#p>K+NjNd&~;5*D+{oK#JDp8CTd$pz=}444c;mPQ4gUEB4G3w z{#68Q&M3&;_eQ`zF*$VbC5)N>Ap~DA7YnIBrIg%M90V$$lKvuSvthSxd zK?Lka$k`?WcK=LGOGgo~*+~EY8v)aP{Q;T?JuneE+Nx&3+VR7b9JqDK2pC@;x|E$P z)8S)B^QZX#8TcuyEnC$NSyNd{#AFFWdS#9G3;z+-+eAUU4Vr-SvMhsW7T;y-PY9}v zF)B>1>UR@}>jy;>_6P4x?hm1TO*dd=TB={ky#BY}Qr+8l<3QYPvlw^KR3IoRm&6uU zJMP3Mu*SOqi&tG=%ph^`im;yVHh%$N7dol##O26y3-h#}yu^vOq=n}%q_&ICtqz6f zR}xj?iY2&cV0p}Yq5`k6#N3K}v38g|=1H6k6Avs*ZB=hCLvF9vDFBK+HLdvtPQI91 z4qtabwLX2n_G@&g1wn? 
zeGM-=^6soa!FHk;@5xtAxzYbUe&sZVk6>ceuc)1zZ9j4vo_`$V&$2iJ!9v`1u(%bQ z9hgVZRDU&inYgG6_9I*)!@%5YyZ5lHl3yTh>D@P*s>S<cKzqEKWNwF1 zaLltUEmh~yQqeEb_smwdDeB#lsk)(Hyhm2U#-J%TACQFC@-_Dm#P^58w{DT;DY;A> z_q5bs!B@=qM?2mFq74vjMgC^hxSAXu6B%7U6j{KeAb(=H`?z!V9r@M;Ir)x!>jJJv z0*((qbitslDn<_kLo@w{P$mOY)NHkrp#u%cX0@Myx2ViXU@E&%2*NimS>4Y1Li!3x zhtKQv1=-Iv7ts-MOrPzVnhwlEuo<6+nDgPparhDA6(?BGVu@wrh(3<0)}lul+)fX= zaq^WC#H519&|gPd)_YaO+9&_Vu za2d#Uy3mg^0vO(Q>kXCcACMi%M;a_yWGa|*P_Yl-b*A>j?v(rd%4@<4E|LZXPbRq^ zI?Zo8-g#)}JNz(7_tygbm2UbgM^!*kF(iTdKodG~Dp%omsV6KPw*nz5|HE_0_a^l_ z3)c4uA?1;i@o|$I2?m1)a&u$(1t6w=0x1G$I5a9h;fotVrb6V>);Zq8+%I8N->5sN zd=SLM4Yc9;!`R<$qvJNjyun@bSSRK-_$01~MOlM%s`nJj$GqnuY60nx_zPm2P)b{f*U!+6g-|(r@h>f0en0Ql0WOEzR5vPiH<-bFItWQ&T9m=Ow7$tFU^6PXBQM;XB<|| z-Jy$P-k{F7GYxIT?Z&G`2&>o;XjTP2i@myZHyW~_uzYWLKF;=p^1Z@VRF_Xi1^H%e zqP&Q;C5Bel6Vq~8&&XZvgwKb;maE!0PUv~GikJ<}sIQO_gQdJu9Kf8rKpno{1mGI3 zFuQp4&32{WOP!kJpq`V4K7ppAM73JNcU=1C4f@DgUDws=JQr9-RWo9~{w8#tb`W^c z-r{IlRSehsm1H`(S1c*HL+=_Ig*%kq<*urG8t$j3Kv4oEa1$1A4nG$`Bwir>L(!%( zH)>X|VKyLfliW{A8)N;htpIlG7UA;hk|{B4u+sdPhd`M~0cO_;PnhHuqOZObAI2TJ z5Jhp<-TIlg#ay^4iD#I(9$_(tT1&=zk{2>amJpOm622ufusN|2)^2+y>~(SE|w-$9%}BzSF^ zT5>No@25kg=W#C<7>YZ~lO5d`6t(hn0_)E0S^KwBt^LS=wHJrqTKl=n*aBMnLI1I} z?*!AGV(l+Jn>srgkj_?H-=G!~NH(o~cnNu0tF65dvc^ExJB*%D~S1j z+XHM3tQCp=p3{qFbaFYJA0 z&tuJ+HEXR|vt~{5uK5u4P!yD?zleFj>LV$lemU}@iZAIZBCe<(gu0JG)Jx!Sn7+J# zbH^y^uj1k(ji~>oldeSl*-XyZf6ZHu2Nx>wd zo(hl_^?PReMLiS>Pt<=F-dT>Q7n+#eCd@YZXO@>cRa`AFAm{5>vQLhj9}W%uHUioPQXZMb7^+Am>HE-@HPo#3$#+^sZ7uek)j{7V=ZN3Hd)9Rmhh*LY_UclGjy1 z!n2Q%@B}nP!pB?*|Egg1s1lxFL&8fZl!QNr@TN$3X;7IG=l+e|`IIDl1ReOlD&fB$ zMQ>6he7&Tlk?@BgGZt0>={-q!qA73|>tB@c*{H6YgwHxA3C}C#MV_XW@N+OP{1SdS zf4kx+%P-;kpwsE((eX(5csdFHdKC!?J0t%uN%&XOOZZ_U(3hZu|2~s~DAf}F8SERS z!{|RJ;mSjz1>Bj@UzhMbN2hr z5H1RyAw&Qw9w0;&Jn2{ZRWSNw?`2=%0|yA}{|FZ-z@>c%B%>T4Jn~7ufx=2tPx`Gc z<+cb<-+dYNYd!S$uani@h+}^Ks8ceM*f`o@k61!e)BeCQzFS#FDk{X8LcKCN1)%EJ zFVd{f?eJf@kf$8u>nF$fN{kmi>G~M|?~7xle6E1E!;ZmfA;Uh0bu3 zCkxbp&Srql4k|hu)33@LUUwuqdrb(e_w{FKI=lK0$4Y0*Xg0yVtDY{)!to;D(02y* z4pntur|QG=d#Jkm8QAW1FXb@7hbwqw^+X|G9$CHaOuf@5_O4a00n6BDtnzC&%0=g} z{VYF1ZySQWFqp^|Vq7uZSBB@%5~F5scOp$oVT*g9umV(MR~>@Jmq)PEa1rM=zt#?@ zF)r2Z4^Z7iQr*2+a=27CB9-c9H`bxZ(RmpNPdr2Y2c?gb@Nh_aDUYj+RY$f#sT1KM zY7A*I>bD5t;Z${bmz9IvZBnX5N^NF=`>87k>#~wR`Z3<6*g92g5f5XJ^ol)rK0Cyc zZ&}Iv`7WI^W;eddbaK@6l>Tr((cD1n8;*D|`z_E3`0BJTYkI%=cmD!WJ|RTu^6Yz5 z-KRKp_wT9hZalj?y>pR*+BAljbUpPcVzTme#K>l!}XlFTyvn{@#eRzd=_ZHfz zK*Re_GD{0D4;9>uby|q^)WVp?s+2?P-K&2H)zj%RFk#QQ22JoRLGrApiqRMi0kZA> z1-Mvto3LAC!E)H{Eg#j%UH^o3zHftMGO)K!a!L1n3BJ<;J?FNR3BL<#lC%@vxm2@t z8FJVKuVN37M%y1AvZMh_BoL9_OZ^j z>Qe5tCbnK`?lF^&2q80QnZAnJY5|+0>c3~zD+pv#dclxR~pbFeL-k0Mo=w9>)!aGz?mBGTp0Li zKl^ww;qE`k>5tZ2&XDf(KMcT>;z`$%4kQ9J)XsCe$^I=uZ;N!1mH0}p+N6rnLqZGS z*WCw8>UclM$o?2}*WNwSaPH=j>w;`rR~gZMt{t9tGjJm+BZhhQ;yRY_+d7*)7{V=yMgHv~vp$JpIBu&m*swkj16P~6F@X<>mep5DdrCVS^1^g~Z#l}j68 zFsnN4*X1iB^8yaeI+y z)#TbA)x*GFE$oHVlRT?8=$Kwm3i!3+SoA3IA{WX8or}cuNJyqauwd5*m3Ih0a3I2t z4%iWK8zF9ggm%PjTxDVP_zs;DVQuiq0BMlkB>)1tgA0tXG2Uct> zLuHdaXe1!M97K{vm9o@z**n1HTh9RJGC{NPWuk208i7?U+)a4YDD0>SEyhJTU>+6a zpUD0cu18q~H0`O~65`I=2~c!+c*wuII(wJhi!JfA5;r`O_W=^1?1!^3L6k@j%r|d0 zc@dXjHI+tkt}3Cggk52rNG$_<3E0y!vAaQL*%~qnmy=oIO2$OR8nl=EB%hI@%X~!{-3QJAm~s|E(r$i&1=Nl8C{s-DojPDE zP(awHjO0_uZlZo3#Hi|)johAu`a|R*Y%cBYu%E+t>m;-*Wbzo2jO4=tA_d8bSw--H zKn$qWHL$Ta(kCD}8p^^Pb~JyIArEqEc37Uu4$nMY>j=UQ-w_1ODVmRga#@0AhL8|V_HX~p2GtPrlz3UoTps!?^DE)veJJ;IBX+Iw4&dmx~0e z&bZh>Ed2n;_G9r*|M+}@mAd#sqA7KFSvw4Dz7kb<}jg%HNBJ7;ac7Ul}jsiN3RBN^Wn<#5o1+gkZ)ud#D)*i!1eQ 
z7Lj92hwTuPngBogT)P-tx(#(FE8%+fK67Eyv*vFGrzULK5((^zjKNEH8QxPWVFW5CHQ5jaZ~rS~4kBn_@|++FC4C z5k)Ctsht;|bQ4R@17#3PC9YU1k)l0|rFl}o)$BzqWw4ApdeR7H9%kwe`)Xu|SSlg^ zZ3Zrb*)TgqES-Z~DPpPo=Wde2H_t*6iG?5@SuC+7>hc6kRP-Z=rQb3St-9$|bjqZ= zk06sCJ=HIhzWfv`lZ#YZ93C#(xd_N3sBZC zkX}tn0yzvS7lzhP$tWt=D&Y`VH41qGX(tjqA>knQ8Ej2LAVJf4MhE97@KQ4c(j(lt z^b4fNL;{F5?N#Opr0+0)?cyr$n!$fTa~e`xVFDIuM1qDi8j4$9 zG*^viX|(V`e(6~nJ^B{_qekQiqUdjbXDv?eos1=dNx;}?|3ospBtxcTjVO0rb{XpT zMA0n|@fRhP7Da>QbE+uHB+!0QbOmG+L=n_`M{0PY=oB{X7e!|vbE+uX^NdrY7DX9S zqeQavD%GN>0h!f8%4to@p+Dn8Ya&$+%>(j}D2Mv=B!}GF*R&j}16^x5l<&$RN%X6M z0XZaJl^o)hHV!K|a!CF3$)T@kIm9Ab4xQSw9AYk44)F(B%V>7hF@LecIC^rs9hX@xz%i}9_%im>Ta z-B-sK^V&duwlP53Xbf4ytBZ{49}p;TRIc_CSrUL3wXM-LpbaQNfj*Xgp{pzWtW{JY zzHeSH??k#F$T%Hf!qH8(Lw>+T^A0$5(qa>8dZiAdXb*ti*m^m?C!l|t2}DpM%&!^Y z3S1oze0)o-N^b5zkCIoRS_${ z0BNdaFf5R>FcO?!ffaj-1u{Gbix%WcwL*d;XeAiz{9A*Bozsw!O;(#cYbS1gpcn1+ z6mH2EImN*G) z|8Lkh^-Pr&SRac93Uh~RtxRPd{b#Zar8DJ69*V%~c#BR$cVz|fJqQ9vb;jpQozJwR zOptH=&qJDL#?1KL%$ibo_kY%GbZ7sj`S3tT)H2@ zjQL&3`?ER>hf?pA7E_Hm8($0bgn3QIl|wvSDhY=JnE7bX1I*W9FeAhzE|ZXVkpFhYyTbm;HXNMvyZG=O@_y z4Zx+-c0PGQL>o0XX(O~7jiVp(dwDUhM16c#WfPqX2|MgV ziyngz|2-Ll2ytwM9!ZFg?OBNH-IF89@iJ-VpO)h{e>S}w|1;A4avbODVD3EtKRxX{8#A@|m!#K=8W%tDhYOlTs5(5EJQMI~AijaOKi z>^HUB7lDYFy+tHAqNzX+M@TBY!&i*2mfoEJ>#hTo&hoR}o)xUH-9GpeRaYcf7t~rb zzezr7D+!q__M$ABdEd0Z0I*GUD$v9c4#Ehd=k^$H(P*-ZW};R&M3dhOZymq{Z0?tf zr9lpTK{0B#H^MT@j3W2cIQ|KZT1l*CV2I42nADot@xr~{}>-|~_$1q3`3Nn$`FHF7$Pm-c32 zYB-Hu|2ltyq4Rq(-RGF2y1`^~ax*uu*`6GCUSbEy+uLvDEOk@k| zlqq5)WadgAE){qHt*|z=y8sZn?2}nL8j<*SSyH3ora~z@?9a-Q1*C{^_he<3EIbg45ztCqVnc=^EG%X_wCI#o zgMzZ%Zi6aqB3g=biNkjLEm5vRy}HQS%vK)m-fCNrd$aH{&iNd84}m-}Y{o6bUB&j+~r zz{!XzBCuTtR5}Yw3O#gY8F>dN1%!x8dL$nP5%t8t8mh!~P1gF@*Qa?cV)2HZc6$@_ zI4shBl0^dW8Gfr~5-l{n>jh`~bV(4T2)Vt31i7+AExS-5=En~EORAs#c;0H3ZG!rt#m_|1GfFJ!wa&aUHXo{f4$!QZDe^u$2i zBKNSmyHYuD8Z|qG9e_`9&>u%7WYpz?`FBW^H{oc6h$r9@?r77S>r3NcdbX=H?Up#ghbemTJZ0>30S ziLR;4ZGvNPH}gVy!r&_ET%7ofVDO_wk?@=v5Wa-Hm@I0{A%h2_xyXV>*~rnCndy!` zHsy=dTy3}igJ-sqSHi;q@VDD}0+YP{7l`(G)ZrWvIXC$UE7X;p<99mlN0;E`66*tj z(+Y7j*6tD1EI6qtoT>nptoedAU(ZX6mu`%JZx~rM@*-~ZM~iZ+zlT8WwFg7h{b~>P zs{Xokw`~)Gc#IcHl^`C&FCq9ZNq3${Yaqd$a7(>D1h>=)&%@6)ms2HBuOBuXTJpxFkOI(#gszZQ@nq%79gR&v2pfXa%u8Oa45 z@@5BC#5SEKy2``%UxzXzrl04Km>?sBn?K&4zd+rHOMVhxZR3Tn>HmyUo;SjxTS8^* zu)l?pFO*9g!p(0+S(=%rq5}J&Dprrux9I=;rK7lE0cusoP1|#B;XC$cGv@x~rWg7C zDBh>-yW@Pm(Kx>MBN3>DhfG~?MyOBGSFYS1NeStjAJsD*r@0x1AIuSTVfQCppGBBzP zWbaZF_YO`D;1?DL=9A(VHKL$a$aEJm$M>5E2s7ADsB(T2= z%A=8_3{P|R$6cs2{J`7z5rf`!3ukWa_BtbJ4I9nK1 zAUglRKJx~`(XbAu`WkAmLDo_+@3Aa?Yn+qyH#}zHiJd|Dja^oG{#`$x)lcmHOS;sJ z{4#qlNoH-!MYqDwzCQ24VKDz^%$qX|3T_5*_sXMVH{e-x^E1zX2G8POjyvaiJkMPA z%J1*Ov*ahMw>^mGunVtWS%+uY&I5;E!1Mg||9M_Mqhpkpqav<-401XcLxbiXc(RK* zN96!PFbDsm76T0$%3PT7W0$;VBm9 zFlQN1pZSp`X0il`jEfD-9NRh(!Fw4k_|8#cJ^9jYqzt6T|v8(#o0&M;88$^8V7KpPP-l%qAx{Is&ySCF(|vW z5R#=^NE7f*UNeP&AP6OqZ`5z^V+_G6X{#CEgws?}Wikv`EHmbInc<2>qsLJJ8i5+& zU#pIgUj@H;Y#{qx0cdYjpJU0pVE@BSRak*wFcz2VU=@Bt%}yC|EMP_$u$Bz*owdZS zcMdkXx9m6KbNWkb!#C7yKm?F-+BcGeR_;foiNC8L6+7&$SU@qnaI*jN{E!^<)5Agx zMEnCk1O3)El+dr0aKNzLRa4t@sEOIT$S(AH4!kJQHyYm9FKz@ntRb=4P8EFkzu|1<4Qz^q z?B6^E8bFMce~Z2X)ghy`jqS(1?EMkiuWeZE(%g6@0C8H*LCfK}tC9DT#GH>=(IxRY zpAZAs=Et#>u9IuF3$y+WvLHNLHt;i}AHQ~e*$P+e$JBNgg28}YJW&bEqy+2=&#mGx z0uk96)l+fuM;)(uxWR{D@o8CaoY=>xZR`=jitVUHDNry~VSCR$Au#*qYBT~wW!tsC z5wwKoVEZrQqsol(@EkQVs)DYX+~_z92p1b$Vos4k)e*m`tI=I#Jf5!rWGFICw4Yrn zQ|)1OY=!4+U_!9tyqbfPE(Nb3chCT~;y7N9z{;MuCkuPs$$ryuPOlHoJ%w5|0_r6s zo6=2*faE~rdAGL_f{mr(ZB#I4gT!uwnI5r``qi{1Cx&Xf`iAFF+Qq+}Re{Yvuq;68 zV_9T&sT2jQ*OQsTH@OowOb?AXFAj1`D|hj-0iolFnJG%9!7w}cFjcY!tHDA 
zD=O_h`I66`flRfrj7V#T^a`u6j8rr9f+@0J zczYFF%x2q-=O=J~0>|``MTYWY4h+K$^>QO|dG({4Z zP69FF0Y-vs z%?iO~U-R_%!$H=D8&ec(vqJBJwQa&M^~Bn6_$nic{f?{zXb{|ub2f#$`DvSEdF#XA zE%TvRIb_?01KDask9A0ZeuueD_y`>ZQtFW8^0zrKQwe{&73{^t%q9;rq4$%CMH8nM zk&B(}Ogxv1t@}^440YJpU#@{T_*PNBR_u$*`cyNr1;`hz?N=0O4dDa3A-wDW8siV3 zAft5`egri~s7Iu}_}?=K6r}FLk=B)GU1?r?mhr-+l~-R}dD&So;LR<;S0nxixA<{a zWH}yDYs^`AHSmg>A=eL`HvQ{hb)o}8!q-J#8i;h<6ge7MtYpnYM*L+P2)5$C-H(S6 zPx7#REWV`mr-Y!XFRL+{E++TpGgm+J`D}d-*d5uJH5C6;3=LshVQwy}ZOqCuB`MqOrqQTfm7aytGCHrY36{RS9U6K>e!ynX zWlutF!TQIrer^-(u=gEi#rMpTo4aTuu%zn8vk6VI@*xCy6Tp(+ZVo8bICYA>eZnKS zG7>7=>?Yi!gpue`bFS$BkTrZSqDR2U(uGbDt7oj=Nyyduq0>I$cNmFO;m1eO5O#*GAe>`ZtD;4Mx1n{KDoU5b7t})X!buZ=RiR? z%)=?X`kjdR3YY1&wPsT#pyE`(^`wX`)DD%Gw4bsz3C=$_WLU}RrDkGR~CbSj1OcP4^1;r*6eHtec~kM@M5X-i+6YsX`$H9h^-3gYyH5 z>XG!?P_|B3Niw7J#`G5088sFQ%)RJ9Ce~1Wrw>-t=vb9_Xon|Jh3874>_GHwsaJuM{N`Aj zL_4Oj$~b3WQ%alz38l%)Cgv`^yHL+2T5PeHcVzc&@c&1FKG;MJ2I+3ME7@dxw*u2& zsL59d0zY|KmBMJi(Kc1=*EUsQlKpiD#NHD(ku1C8r)$&at+YJGfryN!&dCsDCi&MY97-Y*nlB!^bc_x&44`5PqX+^8o6`U zyW{EEGkY8?*FrwG`R>mcv?z?LYE*NVK}|kr>|3tL@C5|RQACMQK8VNN1GbO3zLn?b z`bY{A;yW!d`Rmi+txEjxEvy4$oR@hmNO!aSyTYz6u)5lo$O)OXOJ}w12n4J@Hnh=C#dr$WNI^hD}FK$03&%LD$({#*H_SqNgntijs=-JgXxq6jZNcE(^(%r> zS3TUImLy!O2#Z*wDXRz-L(#0z_ju!lT7{(K%L`1c*y{olcTe!)((Yda;zjlac(i^+ zKJK@Y4>9^L*S*l|s^s8yJ;&pOl-ws)Y z2&!z4z#?Q^l`*@C4QLYbOOX`jh}^v&fVjl-Vn=p;#;9$A&}pL3aoLSx1I#{NvU47O zLW3891kuS=6yRIU@Y2E*j*hsVJIcWG0vA#RAXRAqFk1So9${<%?wIx0(J*e|x7&2L;8(KDw_ z72BAt5Jwcq zptv*x5&Bs*yV;IkO1V<>H!e<+_cIeZ$bIrO9Xy^|iyUmSyT=Tx6D`sYXh59K{G7|C zl{l5^j|cFhHnHODlqLl4*UG&<7(V&+qrdto=P!PFCSz$2z005Bq=5L_JHh`gcYy!b z`Hp7IW|a6#kRcA4<m;bZn1|F4{j{J&P8=l@lGs{hwF zR``G2cZ&XXL`}&^M<#5LSxpENrIE)Ub}E zMXrjSl7aA8s?HbRDf>(qk(CyZ9MlDBY6CXUJ>_k@Y!)2-Vx9AyRAvQMEB9+(#Kms- zQP&d@%hn$gzMvdU-6d6B$BiP@Ct;j?o^J36^t&IPHi(A9$7H)eukS@G?u&G96>n7V z7yO{tV-??hX^s)_)-S5%xy$Z;K!~RgAO)3xYIfUy8>kwSy$hv%e_<_cePSwjW)&NO z2Oq5dbKVw5zjLzQGI9`JS9U86`o+1bf>K&rPwRr=lzTpl;idOTUHI8LT+$Qo+jKE4 z+)z0l;5TFVqJ>}P&u~Sv5g!NnH!63rk+?V;y0m<+-T!5LYvM5GV#_6;My<7G+kSoapnX%<6uH&C!=Q zLU=D%Z?)gg%BUGuk@!wlCA_-kB6e^F92^i5I;T0F8DFHG7YpFL7@qU*D2d5drAk)B zTeyf^oVY3@USGK4qx#&Mp>%kR(n}#zkz?ZFCEPf~UfyDPDs2T`32g;Xl_e?p2p?+J zV{fbw94>JV>+C3OhLt5=AD%lG?bf~B+f`Vs1&<|*p3;qIA1G4M$N8h z{PorwyHG<4OivyaCTwc9VdkWT>1>!o}mJp6))(bo6zD*UC` z;lUzs4C(%d8e;ImI1FqG4ZQY^_KMB+4l>-h<5GJCY8ETTX1i@=iu>yFmF#$`jV?V) z9mAE?Gs0qmOm%Gor9uK1VvqgO4d5_Cl@9KwG;=+6wiyeRUM^V6WVuEhG!*2Sw zuCDlE?7-uOpceR2rnxQ)xmUt{{gV-a>xCETEztI~ z+cUKoQQIKWvl|}MJ)0suQ^!bxfd4iY)1P2{rGa-KeZr{ z81cUTiPC%2{uTN9CyGOSukyA|p4+(!O4hyv8mDf|kNV4?m)GLUTh14m8S^OG;`{&f7#31C$aB zbw3$bwn?>9ZJw+HJ-$LB<=8K57M#hY>EVj?ZEyb10Qvd#M*;Gyejo>=0x7Jr0E&r= zacF6yK^%4hex+`r{oGKU#a-=*K?~lTCBe{($umwyY1tq>Otd_91FA{hSppX8#Hxa2 z35oiYpz?s28D)aZp#iF0efC_4!2Z6Y2@ANl2Y#(D@Bwcx3;fE<@qf#7eqBW~-ngh- zx=i~X1j#*1ab^qAW)@~-1K#0_aG@LSa*#_8vmht1+1S7s7-6$^M~Ba>n?mf^PhKh) z$x&3kNXlrt+z~k-8t_Xca%i37eQmROG&jQ;SR(%4tp@alIE0yM^GO0lb96Mkn(s?H8$zGw=t39CWr$f+nG&Hz5G! 
zRaItkdb1av2Sm+n#dg{J#H~8DjV0Fbwmb6}`f2tO4mQTOrmk%X5BOo158*%;qCVWE zzQhb)xWq~hD>v)6XPKehW^w|Qb@4O1X8XF1PvD!9&e0(fR#F-1e5eljk(Z}G_kIV5l_%UVMe}Pq2hkC=FPXgZg2GVowwUdb3 znkp|09Om9;`&A@+Nm!s@5!Pw{r^;M{^WeT=-h&>Rf@M}QdW5Igwccl@8g-0-!GU3?oK)W+}k(HSXAXE3_~>6a#N zyFJ6G-v+H}lQjgP3&L||1KNcdeb`Y8G^*~OGUH2xkU*rGl&W(SDIx`4MJ0O1StPA7 z_2`Uen(R}viwxIvabzx?gD+l^GnYOm`D?(gW>$G$bM4+tcjppzMpNd}7mr~s_4dyt zM3a-bG^Z~y!LD|jPJ19V{nP0W0&&{uG--4X(hb%&13vGim^O=bKT6JEM4f6*J0Houd=>>G}p)PNUeL{QoACL;{)pUi@TRZAALh?T}3$OTYsL%vf-AAcR6 z1p9a2g0V( z1~mvrjt%q>TLRwskW-A@T~G8J8;Ds|Y@j}HC?X$6Uyc$B=-nq&CuI%q#W&bLg|2le zv4E!Up<5Sk)%#~+0nPBYM4t(zBmW33fqsY&A4kUJEp3n6&o-;Un*~O6oP+n~)6U?L zux`n)2R|zbAS)mROU3)@u^RYWN9Roj4B7N}?|v^{j-B|PlsP)plJ=(q4lF|a>8 z{4kC&V6Ey$-qXyFh1sps_}pzIFU26=16``E=89v10Cp7dTUi{a_1kPdA62$tF5yI% zIEi*v|HVi~;Uszr*FD-nly`M?v~5ru%;%oLaxJ6|(0TIv5t&^l(5)jx=Rn+j_?kGp z;XGK*QtgG1JhvUo9s9phXler!KTJtn!o*q5==;={+Fb@Q#0$?mjGX2_^{t-Z8__g_ zS$^9R`>M`IQRSe4|N@O-FCZ{lR7}>%94G|v>p-jx(5p)tV|II zcu8d?-t2OLLe^@(hu-W#;Te-TzuW%iAl(?=8J_cfDqB5F4_qK4ytk3$dtC{7;4;Ew z|GzTviSq)9gMP!ryM_f5A7J95U}FDiOzeM-FQ=!Sf-zmm&u3rl`|Q6Qe9$m{ev?|G z3sKL!w=Xk);cLFkfu?MbJ#ZL>rNn+|yf0Uv#YM#*@wk46CTIMeo1RqKWq4+ z8c^K!K;yq4T|z#Bi}IeI$*TP__(rAg>P-aW-O=dkS2!cN(TK@i`|kdDeeIUM^;%)o%61QIs?!3+WlQMZ@C>Wdx51CmeMfHp!5p@x*J0dKraTM_M)+Zr82*-9YmIQ z>FS_5=EhVm1Ei2s)#teL`TJ!aCo_g0I1(qr=A3pI0FmE8l50In8dip8Bwo$LVdn-C z2URe!t2CJS6(;`ht2)s&iDI$Etv;w^;O-|P4;ZxlM%FZ8$RN5^0J`L>EQt;_zv4J*5y?lWrlg#gkF6a)wmwRRG~oAkwn>@U%2ck2#0 zU_q@pQ%$vj_s<*nK=|3^^Q;F#@fG#0xp?VSzcYW{ml)%lI zS@(yZ9dm!E{#~d=*8RQex8&w5tH^<+-nu`teoKDNviQIU+_G;<+3|o6y-H-Qo!6Hk zCPslAxfsZuH>#I<%bzzYQ@uszjmlDQ#q&n>R&OQqM&YFkJ3E!@@|=8K%HXQtUa7n^V`L^gERs^GeHLC^=TG5(nAn{ihk>@;mdH83B$48 zNREPpt|;0MaBI$^J#xL^1TQVli|xF0$P03>2MsBu%NpKT{hHB(RN3H(;dIg$fOs0t z!qZSWklCFhj2XC|EMc6-64oCFoEU{u1S={`37Iy@CsNZcB+%nG$V0yf?!W69fR`M! z1P{CJ27;=EgHQS<2Q~8Z?3?swpZYg@*F5%*F%!kV9TD!>iHKknFU8IaWH0g|rA%Ic z?1+F5eqj;>*#z}A2(xfW4s<5Uh)DQ;h6DiPECfMUYrIRimHcEB{X*?bxDUrw%d z8000t!rooU_S6+n;79M9=v(tfJKIXx;&Euj7jtS>)sIu1CIP|K}t- z4!_8`mLWHbr3mT2XL@qb3@Ia8%@{P{*ARIB5V$o31w}qV& z#FaZ}GMpFiYE%Kljh7c7uAHVJl*7!N+g1G~HQ6x-5wWIUUztMn&ehQ9vKBY*s1Ul? 
zXu=&|4#vnwlV`O4&ELgei-8rJ?b^Jhu=4nI%mdrCv#URBGN3|Vj;hrf!@pIOTm3m2 zuMv^}XWn=SpHs8E&BSrD);>fE9MdAQt!y|33La`&#zT$xLd6(2Y@q)>-B{`Zld^ES z?o}f>7Rw_XVOtcMCpUJZ^jGk7`G8Sdf|LCp|0}!8 zjruVSQ}tW}jb4^4IvZQfAkqFHQ44F+WaVZYp+*Rg-VBJ-V$`;@7(=kA#>6mR@$h~l zqv}ZP42k2yD22%@Tl9?he%FysZ1x*%AkfJs9ZLQ-Smxtk^nl$;5Nr^iSh*t% zREwB``+1lO)NFUH`JjHFcyw&YE(XnJ)Da~n$y2d; z6#_jY+I4Uak9OzqTO?>$>u*GiS07@5inH>i7wT-<4ttr*S$IB5FR=)eiMe_Ld&=OB z+OB;A=^g}K=C07n`G(bP$}-Ks0Z35t5>T?p{T~#KZi|vmp#xa(uk=&!3Pr&sih@ah z0SZPPoDncDfP$|A1)mh4;GLvkNWWJiC}_`o<#3mH=QD-GYu|>HfWotxRGJ)Awg3;c zyY!@mCKuy&lHTAK@Ky_+h!&N}-h*m{?T1e*&v^7JKOFg!iu1nv=ApX3Kfdwrt=BFZ zJA6;xo~s|bZ^Td6OgQz=kN#KR^`UEPPkv=?Y0=_)`dwB1?N_E9{_8ul{-e*^zpmc% z>-zeefBoM57r)Zp_}lvM>woB1H08|Jv;J#N^(Alr=TqO>Fz>+hD=xp_(c2qK|FrZ= znNKy{d-bdPPHbts_$U9d=JFes^|@)tkb!egcUB;LG{?b#!p6++*@Js${{bi$;{pQ+zQ_h~e`@SDnPr2do83Si5os-?G z>A{0tuRPRt+vvwS|9Ie)y)(YsnAt0o(YH_UtP@Yj$qt{?zh6eES7vt32`6&u`u}s% zuQZ}xJ@;#o>u;NK!>y6(Y1iFx=cFmqZja!f$c(bDP5j!hGb0yXarNjiqb`odu8drN zcclErDb;6PcjxUHlO~-VK8*(T4FBuC@aGRglF-P#FFYlTzj3*f!c(4~#cbiJ^#%Cn zwZGz@l?9V3!c&?ig})wNmOUwaTXS0$#}5TQkH1N|_~(wuq;OfrX1q-=K^@Uqlfs7= zGie8lt_+v`V+AUzJLw_*d<{RBG1o)L)xccsZU!Scy*ym@`rA&H`N*<_SvuV;D0!H$ ze3HfTn~{5)llv(oG=|IGc*mdl5HlZ^%sY^Ix0AUJnHMATR)6M?nfZ`p22ii>b22w0 za|1H(@@M{pnLn1y0M7%u1(}y1^N0S-J;B^@959ED2jtx20XcF!AeS5m$kFoSfqC>1 zU{*tMxRaDqrNGrxMO06UhNtXzMr$eoe8|PV(Y1^}mLbpKmHg+m@Z1Zq%R|~Gfgf;4 z{dSP$*Td_w_l57pe7JX9?!Lw8=Z>QP&mX+%%IFv@CdWp{VNn571B|%Ifj>9m=i`Vf z_hM*Cys@Z`*GRormzbE97?qtE*RLXRac)Io)WC|wxcrL5#8cILLchLgAA*V!uK@s4 z`rz*}&aJT+o3e~1yiEY#KX=JN0$!H!Ch~s~e|KP<%QDuX$1V6ZT{3UR+f0s9EB^N4 z4|Cw&q9TM2!}GSH+&D7a3kzo{D!CUyj4I3;xG zozc*NXnYfTLeQ48X$4Sx4_tFqgG9ons2YgM(&Y-E!ncuFk%l#O1W5sfD?&RfNPmalBo?Bf zRYD7)gO$X>)cpsep|uJGkU|AV!6k)CV&QFkRHP6s8W&Oy=&lI0RthD6x(En(04Sj%bf7Z6CK>{o6(!&XyUI9gQ_@tcidFAgm!*cU*`7B`ZR9 z)IS1s6+#5)=8~uo0oG9{LWZ!_We5ip04WUNT^~c(4~Bp{RF0D)2oAb&gaiL7M=1Iy zI0C0TnA4}j5sHt*5!U%R!og3QBb-TE=!qjtkNBy<&l2=VXqJG9r&$8zg{FoUmnR$~ zHFV<%TYWs?0C@rg%<=IA;f$#~;o!f>6Ndd0Jb@Gb)8YwbN8$;a{XF5&r^*w~KLSrE z@l%1HC+HE-JOSLtfGtXAl{TOz+Xkfs zX!);1zoIQi>_c!XY3~SnE#yVO>s}y_lIyC{6|OiZpq?mKQXFYog8PJbH@WT3P)}Sd zQLUwPr3h#Rs!upaJ-O`7KqZb{ShW;ZuyVSosit71&{ehi1g;XbK_ROIEGrdIwOnPT zm_H&;wIo$Sly!On6f#iBO(in5xKyH2!-*_PLaHj4xq^{!dO}gjL!?1Rq%8ErU^fZq zQo1YnMCfUGr$n6s(G+ktIIy)bM@J?ZGvKZyr zS>Ct)p1o^)7HyJqipKTA^9)9WHU7Vk_}gEAc(Jx?cmo$y;D- zo)$3@--D?c7Fz>wj0POz0ajJSicd!J)DkN`1O7DQ%Z>PC7;LAGmOHR!jDPh4^vKTX zPgC!Wbl-=y+-Pdb;C51Jr`>!4?WBl>0}oAu9<8H|>K^G@>YkC;J!bIWbMlC5qzd>b`cmz#?B8CLwqv(O?Q+&3|1e^ljm<-mzO zbfqqbAokFeKkARbFA)Wll_@7+048W#)R%n?U~&PI{7uj7sp-liHC=qPrZ4W^^sv;X zGka=!S@scn{iOcbW3OND)2-LfGs)lUd7y%{1UTwQO@H-hO<&%<=?jo#)_(%(nF30h zQii{B{DF$5fQqJ!4}i4?XiW>&LgKC`I{3Bz*ds<>=-mw?eKS)qGQFp!zj>slBS&j` zME9m2B-`;1#v!0JZLb#*ojvvXas9E!UN?}<1*tzTwdq+sHGT1sn*Q?9nl`&P{d00S zf3JJ@)b#5_Sx>!Qpg;E5>*XBSV6O{On)Y}+1l@!(`?MY)5h*hcwp3v!Rra0TtFH5L zvQx<15`AK&uR$!9#7Z~Xjie6Yk~l_;%gP{R>>sfNcAks5i&wvY1bC+e9b06b-p4xq zGF%R{*gAb|u5~&@YP`WZeNx0Y{i5Pns4ZpIB~S zJ|Xl2WX!nn76cmwTr5!*OH9p@$TVtm9*OfEK)RV6`V^+W08s}_57qOCtT z!YHldaViMOY;pNVpGu6;HSd0RTS-#hJ%vWvS!+9ih4{T0)09LebeRGCv*M`5RsZQr?KK<7CP^kRns5F_jWisSxN{LK5V>;7!2+?UW=+!t8+(gLCYE zLm!&0%)ZtDsMiCaUJsav5k?0Fpabdq6&Np{BJOM@lfQuO{1iy@JalkFEd0dM3-Heh{AY3h?VY%mF|s|?u(Uv z5G(!A4B6?}d=DgImF~4l_wcfoP-}o(oc&QOCNV!Ij0Zc8fE|yv5*LA0kA;@F)XG_p zIQ##=-zF2$7x1?eU%|}BmiPEI8(*PmrZ0sT-zel)@_FEx^8$Z5a`S0KjPTW0ui z{cV~Q1D6N|rNhAS80Z=*G2 zNo??PxzK#MIi$`U{FW6Sxg-`Ixg3A%@D4}sW-~l;HC|hAoeEyxz-ybT5~Hv*WUgp6 zJDXTFDr~^trda5`Sm=HFa1D8@wjCF8$$0}F#ZJNP92Dxd{)M}xJp{@0ngX7%cSEl! 
zVEaZafpMm}SWXFZJGzWZ@bV+^U!N$5;i6hdgBb3@qd-7RRmz0;Mq}_wYiP(SFtaUO z*;#OTu2pb#zEyA?vh21BZZ0-zH)H@8jre;LJ#Qujg-33rPJkya`2-INkA(jh_>g9# zfnH7${l&Mwm!J>4N0?%Z3^{oP{;-H124@+$VORuO^4@0q3rG>?F`@zMd^(Q-4G?2I z+Kr#(M1v7*&-0P-*(bdAaa)pW8u2R}_Eo?+>>8atO_5_64j*^*z!3i0)F{N4D76Y3 zt;F5Xr*LdNnrjx~cI&J2%|cw6c5{SRXBO@=3O_VX|E{Z2Hd}cct-Nhk-tO4o_stc% z5$N1p(GH!|SYewuy0rq!`!-_*Ze0DqSh3rzZy90MZy6EG=`icx#p8fkzjcI}v&gJ} z508yz{Wd&ynK>T@>^DLDK+ABfM*$RlmE)0K$Q%>3`q2}_Pq6)TjmJWDvAow}c`HH4 zvAm6-<5(V|z&FS8-WMwFItmqU&OlhDq5vh6j7h>EUkv>3KK-6~_N}8qf1GpB2f4D( zdH5UK1ASvm?P)T@1{T@x!9C@eM$F+aUi~~Cb%Z9uXu`%dj<{3fUnLg5e&V%%r2hSs zrosgJmqqs*URDmi0%3%|%gRI`B_T|gduAuLK@#R8k+3*W9vnF{9b`G zqt5(8a5I_zIH5s>K4+zvT5~oSO^q1bE-U9Bd1k@H$CT+xBQ15h8ZwmJ_Fh?sk` z@xz^g-M0#`cmhwi3a-c{Q@0AP z$tMo1g6ko%Z;n_6w?JqE3xijhg==wLBe4REC~Sl}VtKfXqOVDt0c`K!_K>M&-Y$w% zGw;LnQq^Oh|I6en_yeX|S`kYXB=aJ~V1WvP`A8LnI^sra0A?k^_YAw2`C!* zu;+;KWkJ%23KIVsdlH|SZNx9iMQ@Dw1Q6zBXaqKxJBlq31v*8d070;0pa;?dJ4aHe z-JGIPVX})#1*C*WB8(BR5*`V$MzjzocgTzeTB~vT1R#Ki!H1n%5ESy;5H1N?(Bz)9 zAZYSxfilMxLyfiKO`dSd+i2vyX5`@prsfQ5XdkNpTSx^kb`_w5 z1vpk!Fc#3Shj|31p&d9p)Y8ky+22#cLH?rT26ihFqtFrXtHeYYbzl=qz$O-79Lrf6 z%W00~tj9$JW=^w_>}!-_+h>yz+Nj082OI1&LqDG$w#^x&3xgMl?0qn-DcowypA??H3-2dDL*_eNQ18WaV*`Cj;pMmS4ZK`~ zm$rKNxL#}{Wi?VRf@}B2O?bHkFKw&n{>seoy4whUG?kYN5!aQR`f0lM;dkn`&v&+u zr)nRLXsUg@s5+aGBJFqJMcT)UYM+@2^(ng0XPq+4DrIt=GI)_PpV39=G^-lIi)x5@ z;X=0&K7y+HNC2fi?0WiFy8c!9`p0+n4^P#m#!*C~S+DbrUMxhSH`T!I(X z5c5i*5~mPeR71=x;FUY2@S;jFvy>X`l){TD#ms>b7~RtdIQmgK{2M;}^PTv|Q#HEO zX%sK2(S>Z3_+O6~!9QLUsLU*&+D<9Fs8Y-?f0Rt}r z#sgEKnmLd>}IQKgt!N=SL$7rWfA|J`UO97jjn84$!fS=G}V=pXbs>7OY1=$|P0u!M?|kN!ck<7%z|{Y!+T ziv{Rnf_61@vjE*pz_`XWOac1IyXzgp9FOHueW}t9%c{0()SpbP|CiTQvS8$E&u;J zetYVlw-xt)*Z$fQeE$pAud{a!q&_h4h4E=DjckHs3|rJ{UPWnU|1AsEt;io7r^n@mZbW+d+_v0rAHJFV*d>eL-tu=*7r=24S?hm zZQGCX%K&`g_CNb~tlR&Z|NG{j6F&isVl&vc=g+$cjVjoc>pQ&Q7o@3mRA>xf50_Y{1aGP5p)pNRtsI_K7lY`?4Vd+z}Pat zs`hfQu6M9y;4t71z<@u&>a|Y%AHpLMZxFmHJQ9uoU{=^q0cQl49|!h65guwV>+?`H zR)ILyt*!hRU9s|WrCVmt%vDI^_#P|L0t_(K{-f~+UEhd1+YrE_&{uv>^8{uKv02by zL2Cs~6|hrSad>3X1Oe8M(-Aa1K&suy7!k5chvcAX2vz{vBWQseQ5#A6?ft5{WWm8$%3=dcKiS!bpbu+<*W%Azn_5scqnFyQ^B!V4G zQA99?`$iULQKvl>Hcp4MZu4%TgZM~uIw^EDd9E))Ra6Jo~9IIo{+wcJdinRQli+%!wm({E?_5CKM`B`nZOTpQZAf7wPT66 z`Ao`(XN#M#5;-DpZE+KDsqds>6TT>k3$s-vt92u8DR?zul%z4QZVR8qw?(((T zFc>E;6OL%c8_mug*u$k;7`P*x?2OJW;#Sr{2(e(o+6QF671Pgk9b4_VHAx<_O1h#2 zT+3RpI21p={jm_j`8Q{np%r{^Jt8fjHgO!2`H$M+s3yN-q1V~4*-yQ_db0W0kfI6hHtWzqSvD1N2eB#tWJ2~de)=alDOER zy7$Q0;GBdFlM02P+;k!Dcf%s4i;(xCk}&^|c&DtkL5k$oL2J{BS7n=by>^!=0i@28~W zUGG?`zhMnM$13=Kc8r?kh0rW7g=TqWL^R85i=j1k;O_wbkoP=1-k@>-MUzS;y-%oA z=7Z8uVj|@GF>DR`fE4*F^AnpLI4|S3Z(yDp;RU#IxX$Q2UC1AYF5tm=L$h zIwYHUm}|ren$&p^DTr8MIzT~_IvtmPIq3ieZK?D)Jg*kK6;pn~su)1=n)Eot4FCn* zmpQFjgyaV(;HT?xDn@~VCUqWkLt5D4s$EP%tJo*s%fyP3k=Fs5+SrP|&1K-|VFW6ttz&1vw&>hz_qU zge4_xtOqcl-yI4DM|lfyJm19g?+ za_r;fNfupz;y7J3FfIc}B0Ka?aNL1dq2HLMk zf2G+8Z34>~{Tpp9!MsI)Mw3bjGzk3x8XT8c&>wUHR5&iNz#r;l(v)_I94UmyKa=wC z{km+KZ1UX<;o=KwL`=t|b!3eR&s{eileSIZaz&2H&xUn7W(Zf(IQ6i7cZnBz9|BH} zM8uo0ob>k+oO}{sak>c|Pg)^|=gQLQWejh^5Sl62!wYDQljWY%>lpg4G9Vp5K{FI= z_<8>%x<4`(^jTq(6n0c(jZTG|r7mt9;TOOG;n6})V?*HpxRIWFGSI~h5WZC4(rTOF z7`VZ?3bpFwr8XJ}jT;9QYB3GmcsTfn>LO$WapQH}5k)kjg%{8TR&e9tz>`VD4K!qp z8?A7&Lf}S3kSN~Z+myc4wH@3vySQBhssYK0rfI^xFbx+5E{_$J_nwk^SphXYSY5;xHGHEulV zNZcrW0!N-GyQmKEtdoPA^)7B4^&o%)!lTsyjSZJ>q~}75>pdJ03^#wfmk!*(qI6+I z;Re!I0iki@phAQuapU1&lfn&zDRJX<-4WJw(he`U(ZP*}15aubH+3#<1X$G&a3iu> zEv1xx!qt<5n-&*0z=7x=0UQt>H{C}Epd0DAphAkdbmJ&0IE@g%O+9b}Aqll1)Cwg@ zfY7*cP=WJgzyWaM;b6PM4buUO*YzDbo&Htf1-Ci4@o=E}A{7ER4&RPp-9`{39k_Xy 
zzhqqqmO_Pz%RmP=wu>7_feOq8!lM->jSZJ>r01R<<>JOspaMJ{bmJ&c;AajDSkdIt z_#QvV4Ny2PiTxe-$m{t4Hu={}J*j7K zo|ywd+0B1a+0R{R%C#zX8ens#={kR3G(0m8-VDp^-tEdpo8ayU0{Yqy?}giH_#-+e z%v=Ty1wPknCmG?H&Ai|nIk55D5R1QIKe*qsLR$&7vvY(st!qEk@%xSyzVGZ#f$sx^ zTp(zoi~1E&t%U+)ZI_m1r*FxLo;;DRCyNkT$FZj$Lr=UVxlj93=OP|AnUG8Vd6Mf#PGSm=P7FH8r)JIY5ah%Ab(nXK^5I`@UI&7Am5*5H6FK%oJGM+y4J68I#X}PO?U4~c(&%(&l^csM zMfd%RhtA-wbG723)4g>r1&CVRlG!bFJ6tF;7+MIl9U{fiI{aD?k>Y3_o)(1sK#Ed% zhyz_G>j6?6t;4VU5Gk(K0X>K#A;x`Cz64}h@DeFr=N+v}aYu+4-8VU3|#O{9n^PAEgKInecRMRl~f_JL;A~3S^7!o1P43337y@yC~^o&@@(R+{|NKr}-aiB#v z>j6?6J;QGdAW~dC1DX#qh7^uVQ^5#&>4Js765{L)#-Zl zsL%l^%Jx9Y&~g{)KnjQh1`#MWM2e$e#6q6JL!_YBQV6zKg;3@PQj~T>9OyckP8i($ zo}NRbxC#bT9*%@Yih~1J@$#tMYfmc|x^IqN6^rizQvAw=&fqmnO7Zr(JhKRqqKqM= z3@Ii604X32Xc}P4L<%~u>-01oBE``xVj)M#w&<8kd@KtBSknz2?s9-saV%BO#;&!<1=%Rd&r{1dbEc70rT{B3r-RJsW5x&$pk4O3~ebBQ6SbPoQhUak?ekgc^;+ zGt#s=9e~YR94+{Oo(v^&vVdn4ai9LMdHWV@9pj17ic`pwzqV9efVE{>hb znVG>md0fOCybtFa^!Wr-;5!kxJ(h=c7DO|M8rtE$9`q4M=Qy&zM0)*hoET5ehUXis z;ghgibq6*u5rYW!k8QGH1^dTJn;rtkWYX$ppF#b7Ovq={5Brrgk^DScK@-?F@tQc} zSeu}IjLH$|d%xxyVvqsH(Z4alV29!Uakfl=(vPrvhI^V~;RQ>jUFaK5X;IKcv^vF= zIYp&oI%3YMYJEl&Zf8599E&EXF)TLF5f1{ZojsjHNdDjm@KpbS9o z2uCcwAOmBi6gh%>toJqaXhX4TwcFuBkEV3G*A$_5M3G+SnIYJpbljBi0!&*R=SD0T z5pAs6<>F3#vCuXbL@nBMJN~t^72oGyJ`4YsJzKEp_{Y}JAy&b6VW_=3m&P#Sg{euUkWhS_OBa zo$t%$8!tqUg)dx-zs=Z9#$yY1*U-R3G;k9{!LjVH1;0>fkXhYC1}*XgCq8z0%YDg%d_uNx{U?Ln}Gd4OlIf{Veb23a$y67 za|S!jY6HcY!!c!Zm2LPNa13lM)Poh?M2u-8EX|1A(s4*UrVWrxN7W=~cT3TkV1d0s zRJ|EhzvZvOw?)W#rYSfU`cRk)cOp?W=Y%ZcxG(2Wzkl&XsJEc%E&eLJrI6ke)B2cN z6mczws;S$F*3YWZyw2lj)oMq{nLvS!G=S@=GmbJHpjiLf*Bzg$c#j^2C#ZvH?^?vg zW2_t4L+Eo3KSdb(bw~roGZ4!Ns7EGF9cBy6A&g-FD?R+s#TtTBadgZUe>%U@m9PEi z7!g-40Ju_xJH__rMNAbHwcdea7u{-b{IAs!J6PYvxokNPuyao<4V>D7qu?Zv0PtAw z6CIO+9eWPX=%xv1z#W)wJ@w|q5%6y$pci<%+hFFk8H4N06*kxy0%oA+<_g@l^NzWq%|Ju~12GD$oFzv6b_6GQ z3%B{;vDeI54Df$t#J?f~#XuA32FbPmScxw>(8X}IGzKS@^Li|2bu8y?qiG3|0Z6TI zhVJ)LDaOb?)MEOfWG;Wu798M{FF1I z8)7-lI0ysj1z}JDIn2eof6jW89Ekss4!%zb|HlhgV0gNnub9271YS3cfBF^r`5rC) z=@;Nq48}MOa_f6A24@^O-f5%UcxT6TXi|_5Ld*E#n|7!`rsWJO25O|&aPr=W{gz7p zKkU5;Tus{>KfF7o5{@Ve;Y27x88a6}QA8nxQXx`l5<+Ck7~0aX% z;%3e~cJ}*y_Fh}3(`oSg{ePeL{rum%eU9(m-~Fs-J?mNHUVH78Gd2jM{Y^zBj)f`N zCP*7w(9|&UCZDGHw6P_9CNz8Ka5t=fE58dLUQ6#Z+_WCF2EBY+6Lb zKS&D}n-!Mg6|_rY{uIX(rJ7>9!XjQRDA%0gxMCu%T`*!p^@vX(VH#6hw}^-hiivnD z?Vp)H#lI1yn&QrdMZAMdjMtpv1;V|;U2R)1;u`7^UxM)(Q!MOXTS&yF#YB9NzNX2a z;tfP$V_5WcmiXD=XP3_dj)fVrjZ7}Ykom&B!adNqpdsGkRX`XUme(F0A2h`4ew z5g(vNH@^oSi@UMrMck{fh!2s~g`{|;aIf$bw<{R&2=$0>!+4D;wl5-Li((>9W!t6x zZ09Yr-=gizm&ZPZMSPB|E+oa`hc$sL#P-Kxu2(>m*wNb9BP)u=op>->^F@W?-&jYgkN* z(eALcHLP>(KA5pib*pg64NGY-{XolB@UaX2f@^8ErL5*aF0@bGc^6y!z2cKDwx^tJ zPq}ENNs|urWT&c?mORrc=bw|97%|jNo%1*1-u@YK! 
zr&VmNs{ zcf4h%5iLdtmKe+9I~{Bjh#g=tT5!g9YVMdd^ujuNAME|8YTH#T@>aD?-04=_1y;>S zX}?=(8jTjkV=#>8dombL!g#b5XP)ww=h=nXxh(&H%`;;ZL22ITqX zSN;#?pNczF@=rbQ3g#Q_$2of_vMm_d7KLn^XphewoH5s=npI*kqK$~9O?t1oYGu;G z`JXu=tYmsjn=9kp2zeD|V2&ZrtGHJ6s`+hf2`NaObH?1CR;F8pXKoeVxzQ%Km>A3F zM|5KOcMFl8kK#?lPnmA2dlGx=Q74G1X_f74`$DJ%*)F!vTx_4a*uGFJU1Wt&n#3}# zUI{cg@n^AD-}o4xTw+syA>uS=+lxZP=`OaHTx>79*j`b08bvG=MZp^-aT`Uf!C}s` zO|UHwR@ZM&tBL=v(tmeqx|HAkqGp~WTx|1PKo6$U#HjEoMt+CeK6kc#=8Uz1S|{^A zMXHtG?x^&G?C`lD8fPj&o$*l92dx{PQuFOEiu}QIA9fw1s7DRVm80IrmDj$c?_Qwi=j?w>g<#K|~=;zid z15-Pg)Nqc!VVh`~@d`4vD|X;d1ho!qw`VtJPOGs~?b1Gi@rgiR*J5Ffo^J z`_uTFw3Y*Fbf@LeWD6eSU9He`opG_EnWfYC-7IQnQ<|xn_EFP@rViNAG;(9X_DA^E z5)mGYnr(CRZ+~O~~RJn@3*vx5Rmud0}epnfREt5|E(*c`RI$wS6Tv{3TOe^v- z{tz;xXuox<910Y7n{zIm_H%tUY{m2&z3ITJSk_q6I~w6bL)Y0R*i8ygnq3UttWLRA z!MDm8ZdFq7y$?NMircnhorYTle2i>@?nNI}9S=K#c1O{Fq`&;G%FyIuh517?23=^6 zB$fG%pMM{8@=H@1MxeVjp+QR>4kf@f|MW`Ur>5*XSSr-r@i5s(fj!JE@uxoraxFhB~|4i z7T>;)0)yd?Ax1s1ICZU!@13r`cGdbw`LLII-R$QyKojR!XgUvnikjXiQnE`ty2$IE zg!wtORuvtE0v!cBp`mjq;g0nv-!A5V|7>wf+}5i0-)I6=`fnHP$x5x_8Jdu%uc?21 ztMLCOIe)pBKlneJ96=o==Px8DLv6Gn?HcY@K}n5$4QlM^zdQ8bWBTtU{r9nkxXm=S zO2*dG1-F^LrP}yQqbs#uR~o(KYrgWWFGRaf13MbvIn#iu^ewcEFVURiHKF5U8Ug~9 z1@1he0V3{vFK`E2_I0o6XX+-hF-QHPbiVZy&PO0A^bgL5s7IhH*SX{Q6T<5O^bO`|*sj;DRT^4G zG>gu2bu3-t@3~+yT<9)NqG7z?T)rcAweFY>%yzR%bGEwZs`cIFv#_Z(Y)0ehi2bJ> z@8Nd|zt=8v9b{;yolj=V=wrmW)|TiesCAw%^dYyL@mHK^ocs)3$QM_u?_x)iFAsjU zZzqHg&kM2hC9IPyB+Clg^fI3>?@z-CH0vt-p?IACd;IHxe?9Q82mbZIzaIG4 z1OIy9Ul082fqy-~;e|RLtZ{S>7!?uX5#bxA=oA*<9~hzV3>g^^tZ?%U_8k@wrm*yj z7-famteofYz;H!GNJx;v&o@{xBtQ`v9^el`F)ScBAj~%+K;awguLunb@e2qK&!;O~ zF*GEM#1&%#BZey?h6gC}B6SZ7i3srv32H=d$LZ+n8x|ZGJgl`MFT$XJh;YT&kVu7p zh$1jJJisq9EI-I7R6y8}knn(7lx4m_;UPbxA>EX-LdH-c#E1|M7sy=20E%NE^!bGa2>GlC4j7{l zBPN+p-@q_QC^8&*A!zas80s4t6u}YzgGRwembOjo8aHX)!b)@$N<#S9@Q8p>iioht z07V!|P{^oYDnH>!yI%lVQoxP@5ktd&%^Jal%mOhq6;&#S#l~J}`~w0 zvDS*fp$gw9-@qW>AwdDw3XcHfT!f+<6;LV}EbGL4l@yD94hvAt&4!AwfV}Fh7!?>U zWG{;e`UG{p`4tM>nBjqb!xg?NQB%G8X%iwvd|?5j^U|Oak67k$H!=VPy%38#g+vDV z3spBH6ooV|jf$axK>>{vPSJr8D2vnt2#1=yY8px+FSd%E#bWI7Gy4k0mDhd=#jZdI zYHY#-BCJvV;1P0))Q*KmpmsG<^cH-~s!zB|m1(FzQ{;sW{+gwon~MTJdFJUue-scd6hZVlsLRx{ z{%jD1zAhvfO@F>x@YC1oCA61R(Qady(WR$`^EHH|iUa88lo=@^z9b|vsFrmD& z?z=#-gKZ=Q`-NiZPF;>OR;VYmOBG!BK%>A;Hu|Dx!SD z0;wUff|K&fnJYRWGSwH|s#wOtk@M6`3*At-*yg}Oq0JF9g$fti5H$5d6=!#((4&0A zMxwbB9A{bIiVXaEb<`3Hst_@OU_Tcy5*IHL|40ntd0xxziK zhG8f$Bmj+%RL?Mwpja?qi3;?`XeUBJvSLj{rNHPRIHHll6P8h*OXawhPyk1fiojrK zLJt@iB2_^Lsb&WIV%#w*5}Gim!uW}_qX`daG^`Q)oxc-C{4T*!zCnThh2(Q_$*K?$3Jxs35XaIfTq$`s37QtVU=GH8j&dKXiKj5XPqdaW}*4zT!ym5QK50^ z=7vrx;-`Y9O%&Akp|xw#EMEucA2Y*qXQL3d8nb2Gj zXTUXPrM5BGD10Q>C=@@m!ko~GsI-l(72MB?jk29ruW@4x=Yav6Fovdj1)qSFsAubsiC8XXG1aT|NZ~}LJ!c> zBla`rbN~3kmvSDz$soCtkzsQ9(1dTTJvMf~`_E~o$mv<<%f;W^Fs+G4t@rhnw@KH8@RaIwClS% zhI*~Kcv$PW{J^|x?iq2x6TWIY%<%IJP~@2I-1-Btf9#aYcX{=W|D#FZ%4cg`T4(Ex zJJKZ1_WkCUfhix7YRol$qbuXv*WGJ?HERFe@vHV_`_CF%t;JiL`u#6Y+;;!q`_E%4 zoo#(#bLHPAgDI!eZsO)bNHcyZY4 zoJZfK&V!x$_6qgzx;$po%RXy=tE%U{C@c2c=|P_=1>C5zx7T8QJ)<~-{R;;No}FLI z%h!6R-}%oct~*A2+Ve8T<;TdRj$Vn^ zm({tk*!fCE_GbITE2^E0|2yf)MI*aLyCzMYdFPG1LhHkG;h*m>t@}@3aPU5Na^9T= zwcpN9k(+n9@b+xmlQ(s|$Dim@zgCxigUy~yeQQWdSbpiay4-uiK(EH_<0hY+-p)L+ zW0RIcW~Vt?+L^XDtdd)Q$HGH7G244|A6q?b(3A;37Tzp#r~JD9Yd2jPA2V@b{gZ#2 z+yqefR{-A#pcBmVo75P1mbACR4 zeuWm#`qgS%tTMjR#OtvVTzfO}h<(+yW$IJ}~DE9btizuk>EZhU;qk7H9m-CFV}V`!TX zSvLD~do5|n&+KaYP$&D_%i0(FZh5h8$EwHI%AdY@#&%GtH2q(d|5g9!?pt7XZnVLP zT)U{**Lsg~d);3#c;443D{q=stv-J3xV6*gExNX6&uQ7D*wSS>8cir0Vdl}hcki!% zZQgpv`7?jTyz}wjgQj0y-oNKHbD8 
zqF&$ox@$-0zOR2NHfBgh(5mO!lXX6|%t+f_+3AeKb4BUi^J9&ERsL7KnK9ZG<>ATT zbz4W&zR!1x66(4RW?{huP8lA2D&)LS4(l#kq)%?EGLd$!)W47P%FMImGpV)uQq<#7ZpXbCKYhaXB z#d=KK_Q1cioEn{qc=l(z&HUWtG5?&NcXZvrD~)tcHBUL=u0QFSRV%$0OFvqA&({j5 ze7jbTiH_{)xxhF}qwpqwhE(dZZ{DkhLuw|S3VU6@{*x?z^3?AYm+G(fJy>Jfi-3B^bn~QRWj!x7HLlywdic}^gUdTPH(j{+1UE2K&$~Cw;a|-7 z#?Qa=gbxY0*7~~BqKJ(0AEE+u{hs!`eLeP>L%XV0f0xryPH-JsN&i>JL;vgk@z=b= zalblVq&FD&T`Me}U+vF-RsR2L{`=MOomqn?T~;;^dg0^dXE);QOKGy?eAB;alPM-NoVBgG25Fct1YlI{2Me z-LvbI58T^F>Gi3-;a<}b?Uf_Pe}1#0{EAYcA4VN`)}Z6C>+AZRTx-#?e%Y(zjt{Kh zHuFiNfUbTAK2*JRaL;U|j>8INpS7D#X-!`j+2+lc@$Wk5osWuWF*|;6_r`NZHgdMm z)vuuZ^GIgeEJxd@Nhg#09Xru(_RSdW3TIyrTia;CIa}_JOM}fqPE_BWJN@wE9_6mD zU;FI%{9ZfTF5j_b#P3hjcYjZ8=yzmMC!Oqb`(MBL6nwV#k+A#mr!1CMUs~nuwe`D4 zr&lTcee1~{hL#&U-qw2HBEJ#VWK8MSL8U*u*)jM-JEc?zh*l@3jYj|L#uyr-R z@6`=>K6=O>sq?}vFYCQ&=#5tgTnGE#m|S*1*6acDx62%C#%AwcoaKLq&uaR+vVHT5 zXD$4W`^We%=&*d_|W{Z$?xwIO$mLgrS+Cr|pQ=rRKgpBKuS~XVv?cNgh4# zEpb`N{kX1w>}F{hcc|@OWxHezUS)c1P~SB{p2sHkuBKmoY5xgX<`-@psB>UX-*m;a zS~`cul2()eW4o(RBO-L+$!E9QACrdYG7k0dDG@(Y_;KxlTHlM5CXKKvaQde@^ z5C2pB7{~oxb53KAa~W}^tWL~+U*@yXkz>0~EY9ugt!LsnXk)*1mZQqF__XhGTq}=J z6Tg01{czXk236dh=6BxPckS$zS(gr!>Ckf)e{M*bez}(~ggticX3K3pn0UF#n~{Ut zjb5BI^}(nG=1Y@ZFF*I6JF{9qTl4AK_m5VqU~6EzVM@I5osTJ9*F3vc+w=U29bY$P zwYOZH61-J!fmtJ2%>h>ynaq6E|3R*n`|VACC5LSYsM5Mi+~kJdkB&{4aq*uQKTajb zFOLekzNggpRl(WSmQ*}k^X~^It*X7yv$&M{Ic#y}q6O|_qY@T&Z?V7UuHh3q2Tm*B z>|?9M%@fx9yJby@UGVM0*#3b}*EP6l;5oQl_o-IK3!3g3*>hdb4jDs(J?#5T^lCSy z*`_V)u1){eviGaEJ8sIiED9eqqREOC##I+qY@2+%L1&Y$^$fm!Tz>D>ix%0(=Jj6Q zd1sHRZ@;YaA9Am4gMf$J?vX zD<5{Ad4IrA;}f4c?D8mA{m7Fc@q11`JXg`A{j4$xCkE|L)mm8f<(PZQ4{lYT8{XBc z>DI4m>cFet%PsKe+VowQ#dhmQdsSY#+CirH;iSjys=V)G#>N^e+xMB*OSg5DN&Bdn zH%C@g%9-b+Wprdo&5p{>an~#!ub48vVYy13|FqjbvhrhnQn!A{`W`*|J>NTY ze&f=gby`04`}pnlJJ%qiZecDbGOk?d7pT+O!#=p%+=%D%Z_2D!=sjD|@ApxO0WP`@ zuOA+{_SSY-Qne22hCTZ2i{4c0KTdCop)I&zq_g|ec%FyX+a_*REm{? 
zz0vH1@uxfNtDMl!>U`@mD+X8Cu_$P3fY$Akx*OUouUC77NkF2(g)0mBoZc;WceQG> zXlJOq*_}}poD?rz7uqi>AT+HvElX-jjOj=7%h*`RC3lXf$5SKl$o z{kFGBy-nYfDh}`Vq(zSw{G-~XhPFLvRo)>ddP;?Km&f0AnlzqYYsL-to_{Z&TIc%t z2d65n__M*5PFdp}jygtLo0WCc`!Kbqtm61m7f0+2nB-McVc)aK8%JZ8JE0Z-S^V{T z#}S>ve01-m?-^-YzDI(7M5UBZul(8`GE!7AzEo@ag75q8UufUa-?eMq@1N^g)qYXt zRJ+z+KDm9hnfX5UFN+(mcD^{0c(b8>y6*fAdwN!gs(E8?mA;(eJ_Q zW-SumFW-8$$%lHU9-6*Bb)?x5+cSr)4a@4C*gpQean!Pug$oXa`F-&{-J!YhkHvEa zGBI{&uwokk~_I=`m@`Z*9p^hpKzFEc4wUbhdO&b;;WYT zj?JoOZsBS;f86sb*IO;9{HcM*ydCm)rT?hg;=`jw_eUN3D}Bv!Yv&EeKE#>&IW;l9 zn=RnkyLjS*X>LP%kMlWaYCr37*(;FmG|unAhKB9RwX!b% z*Vl(jy&q)!wkK>u;L=;XGP_RWbNrZ!8#n8gcgguMc5<#w?B>KT(+vG*@AkC0Q!$|2 z%u)f9HavREW%;~|NIGJfVg70CkU#XxB;Sla>=gDi(%-b^^22?m?rXF6>8BGjqc2V` zzwXeqN{?PEd%wB2Vq>Z88y+mbzwBvgz0n^}^gaEg+#bWKBX{mAXE68D-o3+)^m)`{ z&&uw7``nl{)*;t@@PbY{k$$uP{@iU&>2p@MCrtEBei|9>(eXyhVC5?3aYlFk-c|iY zOjL&*SKnk^d02kZ&|5vbnoX&GqT7QWrN&3i4REi2H|!>QkZ%i4%<8huzh&H?fzMu) zHmq+P>d?9IahtnawR-FxINCmB+K8Q|H3yqtzc6}#r&A3>&EELzOD|*iTaO+y8%8v+ zTW8m#z4hIh^DlQ;)1iD8zjoo$U2DVDyH_r{=Yq{i-G@yI_;%3f2O68G3E6hbBRh9n zbKr=>l7-_M9r)Cs^SArME4x}Rf`&TrRZyX-ltHCKkiS z*qhwz^sM&wmFAU#B zUmo|e%$pC6A<<0(fX&G?Hw=j&&vhaa`#-TQ=R%g)#K zUO408tDpw+TX}{pF}_}^0^e`-?c2_J^mKy=cqRUhsBHZ)k5aeRl-9r1s;TpYl~x{` zUWIoyX=c`}=7SG!b*`4)_%zDk=7GhoL0z8AO^I8%Xv@qO!x}EGvVU6pvCFqUzOeAg z{G|ovkAC%eLEZj;umAt;fn5Fv{=JVc%nRGL3Z)m{C3);t^4n@yd`kY5bj@biiHg(S zPwh6w|F7-cIvv?lcYQcxY z6E}IZJrUW=z}g{j!}WGod)$i{bl1D$uga^(SKRfl&Lxr!Q@i9pK7m)Oxd#Gw9loZE6*ft z)s3G1$aLcFHpBKhOiMpdwOW^HmEOr;9XR#=l}~u-`U@-g4mz+}@8j%I8~mz!JSf%p z%=v{IU7NXeoizCTm}$Q+p0T;v4DVj{$}XMum*Ec&FrHAQ`?uwaQQGax)?U?N`iEOu zx>F6(54WfrmpL`0OZ^sa8oqe;P`)~#hmQZI3u|sV_qZwlU$+ncYyWh0b@BOP5boN4 zMT8f!3e+_U*z{<1jy^!UWO$F=L}^k?Z`mCsyLOF24l z+4qo!5d-$N-?BXQ%(B0Ye7ejT{lt9j>3Z#F%)HgBf7br63nT4jIr!R7!Nw%{kDgmU zE|+f&XneZz)EW(4J5PUPd+d(|PrMJtbbq&hf!V`)%}sxdyc68ga$0r0C#QRRR>~im z<9_vhs(Sd!Wj%5eX6*I;$VV+3_QNFlo`X?RN4S0W{qKQYhB~jJ*wlZK=T_F>$nY!T;S7eqk&d6?KVM}rAf!YCi!hHe-P)2_qlu~&>c7v*y=+r{{`6X zV=ixu8TO!0xx5cB{7Wvs9XJ+$Q9KPe?^`ba0hstBm$x_MxPw41V71&_J`tG3Locu+ zRwyVi@`=}0@;)>I(oyn>z&^k<;6dOA;8|cniyqpbKyxFbJ3iw9`}aIz}94R$9s10KWlafM*Sq{6V0+jFQg+HU$0vE(NwV=D7Pn zPhez3CHPK!5&~3FckO@m<)8NujI3V5e*R@U;0;UjPO8%CI}B44AjA!qgG9oyd5wA z=m{JL3<5p`t_ONFgFJ9cb0u$zp1fZR*b6jl3BLhv0+WCh?H~tC1nO6V-K~_oBXBY> z6!@^Ul3xLw)CT?p{sznhc5AET&CEFNE6@wLtsUfmecCJeTfoU3l)OcCj@xbzJ;0+5 z$QNLSBkaew&?lW>53o*W_z(CDm`2z|$s5;1I-QlgJ@70r2-wg?$u9+Vb%lR`58dD& z1;_2|0sjCe^h7#ca$&)71A~U#Ja*klX*4NTsS@(T3XiSmTMhu3+RlAj7JhY?~1Fkuh!!j%V032`{egHl^3qLpFINS5k1Ke~G_kj^tP_BTD zZXkSPjtc=g0@vPF@(I9-ca{7-VEs%bZ)t;a_)y750AD_apMce#!A@I_^9EW2|9F9P z0v%ownJWy8-9{3D+4_LlD&wl_`2bOICe**1*&w%d0 znua_d1oW!F^9OLLz-*v>3G@dx1jYdEfr-HG!0kY9 zU@C9~@E$N4_yL#zENjPctAG~3CdRM>cm@~)yio~u0A-b72e2713pfP$1vmm&wiU-M z04jjXfX#rLfUdw7z!+c?6W9T?2c`m(f!V-6fn00!2S8KcJD?4)rzv*j1I4t2V4zI2g<7Q{7axAP^S&Y)c~3St%1#e9e}PtH=sXo05Ap^2uuWy0d5D* z2c`m70`CEvRfE6VqJ0211Ku{{`B0!l00)2odfiruL%3i$OT51bzTo)`x#O!A_tv&=KeZ>;;Sl z`T!Gv!NASH7+?x8ssZ8!hFinWjz~{q#0S)G3cG+Nz*Jxz;5}e#U=Gj~Sk?*U3a9`E z0_}hkfv&&=;Bep?U<~k2;Ci52bEK;?@(*YOTm3Uw|`#hAyc8Kuh3Rpgr(C&vvA{LJrN9HgWZ*?$8n8?U zq^BGD37{#^1!x0Y3mgu-0gM3}+M_%GgMsUT4}b@Okq&5IfS-YxK%ff#OaNvAll@R%yMrH!_<)APk-tEj5h&ll8^8!)!bsQw%mAhXLxT_=m;pd7g)9MBYK6AHV4$-pF_`)IWPK*KQPM^ESj1_5ot;dfvLFcs(? 
[GIT binary patch payload omitted]
z960mSkKAfw4ed0m!hX&nq579jkOu4l^Q^;#OO<}vKAG#Jew|vX#BDpH?`YQ(#67Rl zjnbgf>h>%2kxq{uz?yTQdOvJ8FA1A~#@B4Pgdb+JAw>FwOyW*+IRYVw!-}Uquc*F^w*XiS1r6wDqPWTHPf}*U;od%w#!TO z<^CU``)|;14Cwst#`xOm_$Px;S$>~z`WI@M{*I!*w#UGjWn|rHfBpVINEmm~@%48e zw8dk$L?N%T{`U~fzoDT0^>-7r)$d!W{7R?oA5(w+O$NQz-(}Etmkeul?8*<=yM^Ac zPPM=OE`+wPgHXpW&;N_)zXo*rI}_S21!L~dIxF}83Slgl^3R^n#dEsn&^`B6UjMu3 zrrUox`f1zao6ZZjvtjP1dv5)c*+jYx}Neivp}xMraR}`s?@i zwbk$6Yq#?JA4Pv%Km8u5wmtm<3>Yj|Mrfa)mL=5w`u}XSZR;0SKwlpJnNt7b9~4g7 zp0ybr(Wu@OeSzNa5OBNfcN&g-O!$=P^;+B4s4qX8t^M_PS&sga{r{8tvh8Vq{Xas- z(Em7=kNuxtI>nz-_T~2IA>sPj3tcoHS5bD?LH)1(f%Mn!WhuSNt=Inx DwXn7& literal 0 HcmV?d00001 diff --git a/ctm.c b/ctm.c new file mode 100644 index 0000000..7e13f45 --- /dev/null +++ b/ctm.c @@ -0,0 +1,404 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "blst/blst.h" +#include "debugprint.h" +#include "fstoken.h" + +char* token_path; + +void print_help(char* name){ + printf("FemtoStar Credit Token Manager (ctm)\n"); + printf("This tool can be used to generate and process Credit Tokens for use with the FemtoStar Protocol.\n\n"); + + printf("Warning: This tool lets you do insecure or broken things! Be careful with it.\n"); + printf("ctm is still under development! Do not assume it is secure or complete yet.\n"); + printf("In particular, note that keys and tokens are currently stored unencrypted on-disk.\n\n"); + + printf("%s help - display this help\n", name); + printf("%s path - display your token path\n", name); + printf("%s list - list targets you have keys for - use \"%s list verbose\" to also display paths to the keys\n", name, name); + printf("%s keygen [targ] - create a new target keypair [targ] using the default token format (128/256) and the system true randomness source\n", name); + printf("%s keygen [targ] [tfs] - create a new target keypair [targ] using token format [tfs] and the system true randomness source\n", name); + printf("%s keygen [targ] [tfs] [ikm] - create a new target keypair [targ] using token format specifier [tfs] and 32-byte hexadecimal seed [ikm]\n", name); + printf("%s keydump [targ] - dump Public and, if available, Secret Keys for target [targ]\n", name); + printf("%s keyrepair [targ] - regenerate a missing Public Key for a target [targ] for which a Secret Key is available\n", name); + printf("%s req [targ] - generate a token request for target [targ]\n", name); +} + +int get_key_paths(char* target, char** sk_path, char** pk_path, char** tfs_path){ + int key_path_len; + + key_path_len = strlen(token_path) + strlen(target) + 13; + + *sk_path = malloc(key_path_len); + if(*sk_path == NULL) return 1; + + *pk_path = malloc(key_path_len); + if(*pk_path == NULL) return 1; + + *tfs_path = malloc(key_path_len + 1); + if(*tfs_path == NULL) return 1; + + strcpy(*sk_path, token_path); + strcat(*sk_path, "/targets/"); + strcat(*sk_path, target); + strcpy(*pk_path, *sk_path); + strcpy(*tfs_path, *sk_path); + strcat(*sk_path, ".sk"); + strcat(*pk_path, ".pk"); + strcat(*tfs_path, ".tfs"); + + return 0; +} + +int get_keys(char* target, byte* sk, byte* pk, int* idbits, int* hashbits){ // pk/sk/idbits/hasbits pointers can be NULL if you don't want to read those + FILE *targ_file; + char *sk_path; + char *pk_path; + char *tfs_path; + bool sk_available, pk_available, tfs_available; + int idbits_buf, hashbits_buf; + + get_key_paths(target, &sk_path, &pk_path, &tfs_path); + + sk_available = (access(sk_path, R_OK) == 0); + pk_available = (access(pk_path, R_OK) == 0); + tfs_available = (access(tfs_path, R_OK) 
+	if(sk_available && sk != NULL){
+		targ_file = fopen(sk_path, "r");
+		if(!targ_file){
+			printf("Could not open Secret Key file. Exiting.\n");
+			return 1;
+		}
+		fread(sk, 32, 1, targ_file);
+		fclose(targ_file);
+	}
+
+	if(pk_available && pk != NULL){
+		targ_file = fopen(pk_path, "r");
+		if(!targ_file){
+			printf("Could not open Public Key file. Exiting.\n");
+			return 1;
+		}
+		fread(pk, 96, 1, targ_file);
+		fclose(targ_file);
+	}
+
+	if(idbits != NULL || hashbits != NULL){
+		if(tfs_available){
+			targ_file = fopen(tfs_path, "r");
+			if(!targ_file){
+				printf("Could not open Token Format Specifier file. Exiting.\n");
+				return 1;
+			}
+			fscanf(targ_file, "%i/%i", &idbits_buf, &hashbits_buf);
+			fclose(targ_file);
+
+			if(idbits != NULL) *idbits = idbits_buf;
+			if(hashbits != NULL) *hashbits = hashbits_buf;
+		}else{
+			printf("WARNING: Token Format Specifier not set, this is a broken state. Using default (128/256) - please add a .tfs file for the target\n");
+			if(idbits != NULL) *idbits = IDBITS_DEFAULT;
+			if(hashbits != NULL) *hashbits = HASHBITS_DEFAULT;
+		}
+	}
+
+	// 0 = no keys (bad target), 1 = PK only, 2 = SK only (broken state), 3 = PK+SK (can sign)
+	return (2 * sk_available) + pk_available;
+}
+
+int keydump(char* target){
+	byte sk[32];
+	byte pk[96];
+	int key_status;
+	int idbits, hashbits;
+
+	key_status = get_keys(target, sk, pk, &idbits, &hashbits);
+
+	switch(key_status){
+		case 0:
+			printf("No keys found - target unknown.\n");
+			break;
+		case 1:
+			printf("Public Key available - can verify and request for this target\n");
+			print_bytes("Public Key: ", pk, 96);
+			break;
+		case 2:
+			printf("Secret Key ONLY available - this is a broken state, please keyrepair this keypair (see help)\n");
+			print_bytes("Secret Key: ", sk, 32);
+			break;
+		case 3:
+			printf("Secret Key and Public Key available - can verify, request, and sign for this target.\n");
+			print_bytes("Secret Key: ", sk, 32);
+			print_bytes("Public Key: ", pk, 96);
+			break;
+	}
+
+	printf("Token Format Specifier: %i/%i (%i ID bits, %i hash bits)\n", idbits, hashbits, idbits, hashbits);
+
+	return 0;
+}
+
+int keyrepair(char* target){
+	FILE *key_file;
+	byte sk[32];
+	byte pk[96];
+	char* sk_path;
+	char* pk_path;
+	char* tfs_path;
+	int key_status;
+
+	key_status = get_keys(target, sk, NULL, NULL, NULL);
+
+	if(key_status != 2){
+		printf("This target does not refer to a keypair with only a Secret Key available. Exiting.\n");
+		return 1;
+	}
+
+	printf("Regenerating Public Key from Private Key for broken keypair %s\n", target);
+
+	fstoken_get_pk_from_sk(sk, pk);
+	debug_print_bytes("Regenerated Public Key: ", pk, 96);
+
+	get_key_paths(target, &sk_path, &pk_path, &tfs_path);
+
+	key_file = fopen(pk_path, "w");
+	if(!key_file){
+		printf("Could not open Public Key file. Exiting.\n");
+		return 1;
+	}
+	fwrite(pk, 96, 1, key_file);
+	fclose(key_file);
+
+	printf("Saved to %s\n", pk_path);
+
+	return 0;
+}
+
+int keygen(char* target, byte* ikm, int idbits, int hashbits){
+	char *sk_path;
+	char *pk_path;
+	char* tfs_path;
+	FILE *targ_file;
+	byte sk_byte[32];
+	byte pk_byte[96];
+
+	debug_print_bytes("IKM: ", ikm, 32);
+
+	fstoken_keygen(ikm, sk_byte, pk_byte);
+
+	debug_print_bytes("Secret Key: ", sk_byte, 32);
+	debug_print_bytes("Public Key: ", pk_byte, 96);
+
+	if(get_key_paths(target, &sk_path, &pk_path, &tfs_path)) return 1;
+
+	printf("Writing Secret Key to %s\n", sk_path);
+
+	targ_file = fopen(sk_path, "w");
+	if(!targ_file){
+		printf("Could not open Secret Key file. Exiting.\n");
+		return 1;
+	}
+	fwrite(sk_byte, 32, 1, targ_file);
+	fclose(targ_file);
+
+	printf("Writing Public Key to %s\n", pk_path);
+
+	targ_file = fopen(pk_path, "w");
+	if(!targ_file){
+		printf("Could not open Public Key file. Exiting.\n");
+		return 1;
+	}
+	fwrite(pk_byte, 96, 1, targ_file);
+	fclose(targ_file);
+
+	printf("Writing Token Format Specifier to %s\n", tfs_path);
+
+	targ_file = fopen(tfs_path, "w");
+	if(!targ_file){
+		printf("Could not open Token Format Specifier file. Exiting.\n");
+		return 1;
+	}
+	fprintf(targ_file, "%i/%i", idbits, hashbits);
+	fclose(targ_file);
+
+	return 0;
+}
+
+void print_path(){
+	printf("Token Path (from FEMTOSTAR_TOKEN_PATH environment variable): %s\n", token_path);
+}
+
+bool string_endswith(const char *str, const char *suffix){
+	if (!str || !suffix)
+		return 0;
+	size_t lenstr = strlen(str);
+	size_t lensuffix = strlen(suffix);
+	if (lensuffix > lenstr)
+		return 0;
+	return strncmp(str + lenstr - lensuffix, suffix, lensuffix) == 0;
+}
+
+// This function is awful because strings in C. It should probably be improved.
+int list_targets(bool verbose){
+	printf("Listing all targets - you have secret keys for, and can issue tokens for, targets marked with (*)\n\n");
+	int n, keyname_len;
+	struct dirent **files;
+	char *keydir_path, *key_path, *key_name;
+	bool sk_available;
+
+	keydir_path = malloc(strlen(token_path) + 9);
+	if(keydir_path == NULL) return 1;
+
+	strcpy(keydir_path, token_path);
+	strcat(keydir_path, "/targets");
+
+	#ifndef __INTELLISENSE__ // VSCodium doesn't know where alphasort is and highlights an error
+	n = scandir(keydir_path, &files, NULL, alphasort);
+	#endif
+
+	if(n == -1){
+		fprintf(stderr, "Could not list directory at token path.\n");
+		exit(1);
+	}
+
+	for(int i=0;i<n;i++){
+		if(string_endswith(files[i]->d_name, ".pk")){
+			keyname_len = strlen(files[i]->d_name);
+
+			key_name = malloc(keyname_len + 1);
+			if(key_name == NULL) return 1;
+
+			strcpy(key_name, files[i]->d_name);
+			key_name[keyname_len - 3] = '\0';
+
+			printf("%s", key_name);
+
+			key_path = malloc(strlen(token_path) + 9 + strlen(files[i]->d_name));
+			if(key_path == NULL) return 1;
+
+			strcpy(key_path, token_path);
+			strcat(key_path, "/targets/");
+			strcat(key_path, files[i]->d_name);
+
+			if(verbose) printf(" (PK: %s", key_path);
+
+			key_path[strlen(key_path) - 2] = 's';
+
+			if(access(key_path, R_OK) == 0){
+				sk_available = true;
+
+				if(verbose) printf(", SK: %s", key_path);
+			}else{
+				sk_available = false;
+			}
+
+			if(verbose) printf(")");
+			if(sk_available) printf(" (*)");
+
+			printf("\n");
+			free(key_path);
+			free(key_name);
+		}
+	}
+	free(keydir_path);
+
+	return 0;
+}
+
+void bendian_from_hex_string(byte* bendian, char* string, int length){
+	char byte[2];
+	for(int i=0; i 2 && strcmp(argv[2], "verbose") == 0); // i don't know if this is cursed or genius
+	}else if(strcmp(argv[1], "keygen") == 0){
+		byte ikm[32];
+		int idbits, hashbits;
+
+		if(argc > 5){
+			printf("Too many arguments. Exiting.\n");
+			return 1;
+		}
+
+		// Make sure there's a target name
+		if(argc < 3){
+			fprintf(stderr, "A target name must be provided, e.g. %s keygen [targ]\n", argv[0]);
+			return(1);
+		}
+
+		// Default behaviour if only the target name is provided: default TFS, random IKM. Otherwise, validate and use what was provided.
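+		// A TFS argument has the form "idbits/hashbits", e.g. the default 128/256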
+		if(argc < 4){
+			idbits = IDBITS_DEFAULT;
+			hashbits = HASHBITS_DEFAULT;
+		}else{
+			sscanf(argv[3], "%i/%i", &idbits, &hashbits);
+			if(idbits < 1 || idbits > IDBITS_MAX){
+				printf("Invalid Token Format Specifier: number of ID bits must be between 1 and 256 inclusive\n");
+				return 1;
+			}
+			if(hashbits < 1 || hashbits > HASHBITS_MAX){
+				printf("Invalid Token Format Specifier: number of hash bits must be between 1 and 256 inclusive\n");
+				return 1;
+			}
+		}
+
+		// If no IKM is provided, use the system true randomness source
+		if(argc < 5){
+			getrandom(ikm, 32, GRND_RANDOM);
+		}else{
+			if(strlen(argv[4]) != 64){
+				fprintf(stderr, "If providing IKM, it must be 32 bytes (64 hexadecimal digits)\n");
+				return 1;
+			}
+
+			bendian_from_hex_string(ikm, argv[4], 64);
+		}
+
+		return keygen(argv[2], ikm, idbits, hashbits);
+	}else if(strcmp(argv[1], "keydump") == 0){
+		// Make sure there's a target name
+		if(argc < 3){
+			fprintf(stderr, "A target name must be provided, e.g. %s keydump [targ]\n", argv[0]);
+			return(1);
+		}
+
+		return keydump(argv[2]);
+	}else if(strcmp(argv[1], "keyrepair") == 0){
+		// Make sure there's a target name
+		if(argc < 3){
+			fprintf(stderr, "A target name must be provided, e.g. %s keyrepair [targ]\n", argv[0]);
+			return(1);
+		}
+
+		return keyrepair(argv[2]);
+	}
+}
\ No newline at end of file
diff --git a/debugprint.c b/debugprint.c
new file mode 100644
index 0000000..4a72ca0
--- /dev/null
+++ b/debugprint.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include "debugprint.h"
+#include "blst/blst.h"
+
+void print_bytes(const char* label, byte *toprint, int length){
+	printf("%s", label);
+	for(int i=0;i<length;i++){
zi=4i*E7*(Uom=U+TOIF$MNXe|>ts4ho$77rVN_Yfl5WR6;COZK8F9LDC&o998SixU zmpbl?PS;*MPI|91VO!?e@y_G9?j$O9?LX0p9ZGt2?~wY=l-|49NwTkV1phYicYNb} zQW9)9C z*U9{91P^1K_!(W@n^@BAoxN?wgq830&i>Afu`A!EGpepFLs5g-*Qg>A|f@cVCJa8#pnUIWm6bDJgfa;K%8crN%-4 z-pyEyt;_c%y~T-<`i|eX%#$p-SD3lEJMPT~d2-?IpEe>^}cIr)}Rh%slD7hE;If9jR=*zT=^I z`;{oXmvb6C-I?<6;5c-Un(~;_@mRe5e7s7zso!jkf2VUz-6?A_{jvJg7>Glr%u zpK#pn`s`=QliT+uvsaZRtM@wY{$%{D7oCoSEt>@LT96DVVB&bE`r-6?nPYWtdSLKB zlG*c09q*i2(*0Da)Aiigq?dYESt|Cl(|1NWfx%xc1%^MP!toY=IOe!ts&qO!yP_ODN zCwn%KFdK8b07&Q`>v*a0Wlrp{(^sRBP$Nii+-8G>MV06vnLW|TG?pI`N_N_MxULsn z(p`Ijp;#xg4?GL1{R61hiCs{U$}U^P34fu!x)W=>=dKdpq=RN2Z_y^$+NqGunL3oifOS-Q+?n_SgEFk4WTL?0KFY2uuSJ|q#=%OgH;-42*^fuV_%*M#oJBq=~Zm{ln+ zF&_9E=Xj@_kjnle8z(otG4eY8_mWFVg;jF*o5?02fRdSgRS-n@Z?Gx}VU&KUG>neR?G(ruRd@Hv+=OpmzhS_im0Fz^l;mn&ZCgcwe*Xr?MYl#gq%udC{Mt zcP~*D^eV3*f(!z-R5{+gougIp?j0WGwO}})I>#koN%sjo%{g0u>`9n7<@Vu;=@%R? z-RO8<3Bmng^bfcPQtoqVn%-TqzWK2GRg#$=0De~|8Ww=xuZAiF9#x`q2&x1GYKexn zeXC9+om`!`&|0u{J?q(Bb2^J^a-sF7&X*}y>WnNS@E*4wSX3D!S zQ2|v}@AiY$Qf^D4F~!vazl_M-Q{L@~M#vPiPI#dOi+5*Y8-`IWh3Su^+(Q~+?TJn_ zGQGYk99Ea8aNIt>!fb2Uy&QSEh^@LrnW>fXZcOyswVZ5Aq9Wy9oN!V;{C6g*tdaJz zN|_DEaP+O?xOE9Ipn>GtL=8}zbboH31uzFW$n0Q{8xyTjI7(TiGN`gHL7UL;gK(gO zIf)iqUVr;lgz59xO0j$~3)w>4@k5cqS3iBx|*2atiJQp4%f`MR^0`oUmD=_~>g#jRJ zjrvtl5K6DY3ghMu{vz0PwNGW#c&l z8DAL!j=T!Si2%ShMF=rybF8q9)*`UdeA{8*p8SrjYVMnOe<0jGf`ycKtti%Tr3k68 z5^a%ir2;*z&#PouA`8nZhqooznbFiTMlyCn$6$uhN-1E5vG|zHFxyZmXok_wDQJdq zR!S!1n_)B!)I01H3z=aRwnZO93mJ{HvGUEZz$k^Y zrmz{7XUx4dW?182%rIf0K(R&n6q3s*kI1W{B0A_SA_jvOQAWooMh`_sq3B$s)n|cM zjrx@l3VM?fj1fM5iBfma4|1*U6SMc@knFVJQbz$-Q2 z5iK69QN$qUWQ`V!OqQ?yVun$%1lWs^7g1#BHE#inY);t5&_e;rGu=fNkgaM8lYd|b zL?|zK5n0($UKsk0P~E5)FmUBz;4fyF@8J9Mnqm1CR)kbi-n9wZBBPpN&fUf*C7!6? zI-^-$%Hs<6o#X!8QQ;|fyYqOzb7-40`6{WFCKxW?Qj?)MD?(fc`@cp*#@6! zjdIRUg0EEkq&YHS9Xu?L55m)t*&U-(wt4T!h;kYFVI#Py;V|5hbe~Li?H^lN zPj%eGj(7c9bPlO{IHDsIBi>`b>_8qq_>NQX;1FO z0X4b=dyLCh9OVH7V>ATuw-AB}J_OA+1oS*N88F~EW&F(YKc~F;Cu~b8FJiF7;wPoY z(DxmQiVK zTb%m?0vd3#f5T-Jw|7gT(s=dg)~t~p2gD?XN8GqG?@W|QKRL(!m6*hf=~6@m<+`N5 zvmcUXM;L|kz&YKe(@FIx_0b9N(?^IBM!?jwC;67Bt(K$C;8a-s%Deoe7a4Gju zAxYM$kU9^WK>-b1b;f(=p!Z>jQq#k809RV3ygL(}cww6DwkNUHY5~I^@6FiMM_18y6f=PiYu#SodFVfdm~V4< z8Ue|0W!Hm(;aiqTT+B5EO$C#AyEXn$?rMym0uPiXy$h;Tr*`E+Fg*G^VYoU`g8^hE z^qxqO-`%=@zwoa@GptNuoiR<&hN5FLroyf!;cOio)Lvtwc?n_yUcwDt0kLbqyR-h81GwHred^9H?Fj9h1F<(lU z&XV4&DnzL1omVk42?-!9VwePB2=fZT!7&M+D#`l{nVYIsE!|e+id93FW)gbv+pr|m zi4iqwrm9PsxIEXLMP)*RcO9YG=jrkE0OMT6XiTT031cI2}R}f;=T@UDxuo7v3 z0wX~x31sUm73;&hP^)mH&~9l2r3Ci!nn_HrXk?eUFO6?1vWpPy3;1uaF)y+^swy8# zRlfest8xq(QRRM^pUW-*SKT|z16i}AHwVP2gRMc~xV>2@R%`?Sr33W119d#!PPf%qxFpw%JB!1=xgXL(uD`zqE4 zkw(8dWo-rR6$oBmoyg8gxD7q|x`PUL8)!aY#3M-~((tMouf(ed#c(wh1QZ~|s1IV9 zX)3l6LVJNg6hUK4NNCm#K&OHOtV^kkh@U0BAk0fs-t0!mKZ*+48BWT54B)~~8Ip03 zA^sFno{1Eqh4cz>f=#P2VGR7Rz|hJ_M)nUeBk|;r=mYLbBhoNZ6krn20SxwH>@&pc zmH5Qu0D+caj9&}t34tkVy%uUbzh%AgXB5JEf_>k@7*fz@ac^qyZ7Kwrl0 z0FjdGW?f!nX^p~-8i=UU7Nu5p!P+WutR>vKEhO`#tN!|F47=gACR-TE7*&qbK}1G5 zf*RSyQS~FZ6}F90Z~x=ODdLmQ(BTMP;7N_5G#^c1cJu4lLj)q z@=1TA1BqyxFyJsNOQPQUB81b;l5oL_Bv`RDe`~C?dhY?5G|W+YzLV}lrXFNTiuC!Y zh#L+}AGUcrpVr)NTYhjH>Tk_!isC#&L`Nhq!e zYX9Mg^xasE;IY^ksb4l#62k`$o@ecN|+K3vj%S9@!P~{UU zBqR`yGPgx-K>$hKpKT*&dK|U1 z$`FU8Az6U#9?Qq4S_wd#KFW#G6$a=t&mDHVkU8G!(+i}{j+W*(F8xBi`#ijOF&LoO_Hk1nXuy_4`otEbQ4`0UE#S>L z-LPeyY^-6?a_ZFl?mezy&yJeB=t{6EO~71=^+#Ao)! 
z&h?#210_tVc04$PFkWnPut$+5zVxY&J)i7)evGb|jBP29a;S!g?sd%2fl!d0 ziAp!8(crVEkP8x#cjW#JM3MCFOpqv3-fjFNNoqs6RYJszm%QP0Js*I*#RA((0R^Q6 z8&L1wR&--k6q@rTk9TkXXs>;kkB#m?vYPaIi;>>vLZk;8DMrH}->T!HV|0{Hys~cv zyHHla)j7&=?*8Kfy#Uylz2=)O2J$V%&~SbT4PR*-odd-=FC34&-_f#tP11s*MiFer z%N%ibN8If~YklMf%4l`u<=EZr(%N>np9AS;bdhyVB;@dqxZB?n2m*KeR&Y!J!5d+PW#)|2T$TL~Z2rLcj;&IMQfPS_ga(D#Q-!D~*vtTh~Ni zb&?NMc6{^|)r-z554{p*6DL_)pU|>@fYy_d_HFyNvNe=99f+Y_hd&$adEl2& z8VhC~#(;fe8J8?DPP9O|V6lLE*sCzA1QTMx=mTK=PB!&KqjUPiZO(*8bhpr}{z$ zD*#MTR;D~|MzF8={;wEp_`RhAm7`Ludnik2d+q%71E#PVtgVxU%k0D_oYPY~N&-X{ zf=l5YNXHx}KCUqi+X%GK7zia}0cdFfPa)ymlRNkn;PoTXfp<@AfnfV)vzmf8y6}HA zus(9_lT)y7sZ9~MO?eN9vBiMxH%Rkp-Y1tS6|xlv4srO9B(@hMwoORPg^SD_;=xYP z@xIjwlC!7#0X)NUd)BgIxRhmq5S5LC<-f*(i#c>q`TnpJj89=4kPPm8+c9_!oNRB@*vV8I>?JHkUFPB|Ty(*7ggHn06+*cp`Bx5`FUKM<4>RMlXQbIhnx zvP7Y(HEgT-Hs2qIDhft5(PKju>%Uo4tuo|KRGIjN&$8fBy>0TT9?PTyV741A%%6z8 zke79E({g!G_v)9z(83^c0$1-QWFS7>T#92jwBV zqZxY1@2+xF6~Txrqc4QwFoiwteh-@WCvhq1(vuzJxZS27ipuG^V!ApIW9^tfrqHu zys43(DJ;&e6?{o-0;MUp7rSrCe*wuI7(5Y2x9YuH**4`}(fA<#mL#)d4#P+Hik!R7 zjL6wN#0qe@wg!v|*wukkqh03Hb5qBFpWqD+HBQt?xic`GJ~`8ggMz-GXN(KgOCKbDo7Z0z{TtK7HUyB9(I5Oyr~K(Ks3C>GFjr>Q*84qO%)W zUu761al|~S@@bNp-o{il<{R&*0g@TfMp&Ekla)Y|8rT__AHd@ZUx<~ikHPk+OU!m+ zI2C;s)*CMWc*l3So3(8Xk(^@e92nfG74L6f15f501L8t}Pu0ux*g{eKYd2t4Fk);;CH@So zOE2s=oE{BJs;#1Akq_0e^{RJroj*M+7-m1PnH_keU{P)$+THzwzb$}N_z0SZ?rPAwDT2@LMrs^Vw0l`1C4QQ!0G$Oy4RH!NQ7Cfz_+YoVU$5!C6I@RVvx z%>9+v?>4%_c(Ufuw(lvym!XY~op@B_LFHMHJeoCm8~{%KWaNRZx*`v8=K9oCj65DB z+&+b@RkQHjyPTt?JcUjwMeG!!lXbmp9wwEE={>yMg7%mqmC2h>2P4XfWOGs%-dpe{ zdk!e)Up3`C`c{l81iC{JXn2@PL!fWjFmJpdd=MU{i<%{OLi$SK8b_u! zB0>t6x`7J!hf3l0V`i4ZZ6w-7;Wi$J!d+iT;Wk3yii!B96z=+?DqQv*R=8**3U?zU zD-`Z}OX13FF5D#WD23apX9~q;DO{EoQMhcI@_x!1g>){dGoy6wS*ZD@bZ+CHROhPb z=dr6@BnW&J@}ZyVKdN?#jAUCck)bv3$NxCZdjmPlaJTP^hIppQV>ItGuS7I&+djrr zv6o4CKM+`G3I8jBJGNa6Q}@b>hQXmwjkjYMTEcAZR2jpd+c5%pL%vgl& z2?J#*sO-u{W=WPv4Vdh(#AuGfI(;|MfRN4h%*1OIWy?-lS+yyw9s}=Ew~paoL}>k}3lo6W3lP$ys8#s0)`x*M3j3!+K9jmyCOpgvdrSjW)xL}qN4;e9w@J_B$f+hAdu7Q{^$INEf z7uI|CB~-)1Ni1~*U_BNFCApBBxE{WdEC>wDKbUfG+~|#k!}USuyx|H+)ydmdmJ$Qq z{g{EuOci_qjS2~{AS+C{Bq&HR_bN;XuaRzPEeb-_G9|$~cza)>)$dbsc+|q1&af!f zTA9*NJU`5i6=cVLbQxyH@{pNi*pWehY&*-H)1H=t4{&F-^>E3GEvF+h;T=OMu%22= z?pBU%MX;7v^MVjr$YXCN3n_VQ$Gnh4z^Dt^Diy?lC*dD`A#rui?UEg*)*!NwczbnX zwt0ojptoWb31fH_3k%hPtH_lqwu(yeVM9wf>foRXyh6}s&6YFLc;Ue3TB_h~80?SW zBg0`0m`Y0IIqoyuXf$KrZ<9c(x<8Hdj2+2CJLo1&PTrC1dO=-)Wv&6z`4gfzRIHrN zd(kIACCE}o*k4o(R89U9lr2_2Y6-Kj+0Hr%v03BSKYiANrTMtoLj2jn~QdU8^BOm#3 z(r^WiDdYwYat|+N3sOv}^^gqGFMrw-{YV_5?xIXuk3=Gw@KCP$HyDgc5B?a#XTBK; z=a27UyDw40<2af!Yyl`e!QdFm?fW%5yQ@`(#F-!TiI=$@C{}BcMXEe9pjt6C?O+xT zhOLB?S5m4KrtkLl<9JE$>zt92B|3BX@ODo4X3@c-eN{Fv7xI~x)x3xpanXqX5yq9Z zqV0)RzJ(H}%Gb4~OOl*#f)O|UsO11{jX|v%aZXxc#~TwhI26m;ou(G3ykNkEUOol0Doav!$o2R06u7xytq zcSw7e1Z!)m*UIXtG)yL7EQCCo5DhC+9AK!^Duo&_h#L-yO`;DhKoDv%X+@8c2>?EU z49q0OD8%NW+@i-R%mMGAT;&EZEcAjPQlrn9ya2m2bpqPjO-{+zo5%?QYCWK zg#?Uoup4JoVk^&e+PWz^aA96!?SuFM+5`aPHxB&!$9x!W{t167V`V7!FL2n>?Qb=& zuYM!F1+tklDmGhnYaZ;~Ln51TK0+u;0vdg}hn}R!T7hloxnk!JqbC{CY%hm$-$xam zKy`FXn1?iK>dmh7X1RP0XdsQ@qJxy$lU^bHsziG~aXUDlS7}Isp%R#0=Ez73C4qeq zgZpFZ;RD48C!I*q+zQHFnt5bGASs=pLI} zsA|QlECmNu3K#m=QH6>J8CU}<3~T#%A#0zogKH!`a8$$Xx~S3Gf9WD%FIo$@^7qNm z*Fw3fOA$3dnufG(OTU`6FNOk0%A_VT&nl5MsVrO^#uFg20!GWB z+^J94Qw$WM0uk8sNkI<*fTOi4@@oMR(OM8-L0|cIA*u9`wtbH95mSl@>>5}};h3Ot z#c=k%NaT*$LHexpnQ#nK03Gidj>sK-m8>u|DQuLH+}}O~QEA{EvFo~gjGA7(_ zvMiI(BaKST7ZbitO+#H8O42dJ4V;OkDgllx8wmlm!cCe4bONQ2Rjz%jkcFWWw+p0hW2;B$AKoXz~f@24hP= zLw2qyUSiTm(OF*K(b)oMu~Wj;3%{(DlcJQ5pjs{j9Xhl~Xan!>qh?U{@K=8b3;;)( 
zkmvn@YNE%e2^=8k1W?J$md5((J?TvjET3}3_#rNEE(0Sj<$^-s$((-#GGFMOJP6%OI$I`b=E=B4|0~2__6`}DhA^b2?TGKAcqWLR4GmXd#K*KSN?hI02+V%-)O2QhF54cMaqEng7vYUD`kWmM8P}G-l z^o(JzeqUlXdY9V934qi(s;%5bfQqRG-na7X3*2gN*%$t4-r)F#Z(ei@kiL1ry$7KL zr}B>+*$`E?D|bBu3=rz($c8Q^L!sC|vZ0HcRIz_#Ll=Km75hgvbnz!x>|zWm=e8cg z6d*LnP;1*qqYQ=@TyNSG;cke5Hh8rAa%UaFI;re>vZk7d+DRzxmUI-l#ZQr%cP3gv zn&QP0i|xxjoXd4=mvnKz2x%q74aB3l2E3AuCf{P+`*|***sytVsFaU$)-%a@z_X7O=HaT z(m;Wd?-Loy{TnxB;)OL;FXo1eIRFv;T4Bvhpy3Xh(l2iVmhu3Bc?;Kt*}sYHq<0(~ z8Cxdl;9_pQkm(ZIda)C3y)^ih>;%#WuwxDZ_}q8tPM{WTu*3S4uO$6ry4D`|E9PPz zF2t89B>8EF3=Af^xIwDL`>l z`9_S)P9J$SMwIGLEGz7;%%%%9X;g{9hwYW|cBs;3wYx%Ptv^+3fBeRB4(+6qlj1FmVLN7N;cI%$Of=aG<)1CTN3A)zC(0DaT~a-i%_Ku3B4dGPuZV`cD{Jy#ZCp@JgVQWY{X zd@(vl*>6uhr3l5ylmSUkgM~7gpMaPt6~Ho})k?K0@Wo=4ZALQ|6hqEWc%L4ry&n}y zl8-+xc0@88%5_Bwj5aUyK}a40wa@$EivYKgA;n@E%H41`4azr8SUF3ZcC`;cJi^Gh zqnSw$F1FIUa%FT9n;S#+=jKbB-V<~See+j+_5S5s4cy)5!?H7hR^DQQZP)4>c$C0R zhz}g|N0^xCEZlI>37^&ohtjRZx&e)cbCn&8N;p_;I!+qvu+;P ziyuyQ)&}Gh)g-%qKNi1lK&C7R0=>sOh`_<=$h?Gi6fR_FzVucS%W{}PFtuUt&f-@= z>Z^xZ-k!{kL*S(mws!iFL8Q^HzLn9P?D=pu9LcR0kJDj5elrt~VikJQKB;4G5WJuUAJP|~tPBXRH-m~bZ{_Ajo>f*fLfm?&^@ z{?=R?jxonu@YF>nKPL3$n6`bq{UNGhuIU7{M#)rw3XVg`Y(w?qP%^_{iqVve$t8*? znd=E*K|RCqsb`uAhabp!Q+kFf>nM7Ld~h6krivDlhg~-QTEP8LdM1?rHL9K=t^q6s zEsU%{&~o^{=oyOSw?@xIFH|`$JrnF#Ej<%D9xlN)eViKm_o3W5_gJci`(j_!Fi}&h zn(<22kej)Yevh;fRU2LXxGkb>z6T#FbjTm4ZDtqHHm8TRjWq{L+k9~p zZ8N@*wz;(SL9X)gXq(UHYn$>AR$3l23g@AS!U50`g#+C@n!-6O6o@o6orE%np~-Fd zI$YziX_-p7eve+F^;|LE2|-6cOi#o-)P7B+r-V&V7_i2K#OBwRfRZ92LKyo3-{Hw^ zPU=l0J5?~wyA7}vQ-eH7uuF_m?mbF?4_^an_X-;OWw5(_E8gFFNtn+s)*CU%b)ZsQ zdhCEGsuE-)p7|FP&J2y@16jI?t66UdSBB}IvIS~MZpL{wg&WRObd>xe& zYp^&r3_d5a{jqUkYhdQ(b0YgM&WXg8&rXI)kA)M}-CKrjU`&D)BFxwiu>1|FYAc7&sd=ec;6sb-&0At0-9oz}0 z3zVP{pwLq`y~(m^%>-#uNoSs>=s;f}h0B0fp7gG-z(VA0#3=J;6A(LZ&=(U=r?T_) z4#X&d+e5Ni-hzlE-6jAx%b*_48s-rMhvV1oxbIUpe^VZZ1#M)D1En_%$W>R|5sB-n zf;(YHc#a_g;*!KhEougpFX+K@fh0Aq5Ih-uibvJP3SMa`L}WshGh(0>oPM<#nuEau z7%Pvep_sM`q)5>tekqe51Nh3cziSRdA3LR8@DfGf4(M02w4{V)Sz4+`&QIFIMW~iu zv5sI!s}fZU9Rc5dkznyC1u(dL&NN7(05C)8@d4T~6qR$9RYvkyP#UvM=CUZfw6c6w zQGl0HtCXQ&o?x|Hgc4k+@aKyKxR7x!SzFM5qgq(x%7RI42F9f=aQ13<;3S+(_b-dX z-N8u^4w-`X)rQ-#Goyc*ywKLefFVMc+?9y|Hk2FYsWWv|91XNq)Q8uP)wK_hiO0X- z*!kH9q}4_m(V$up8!9kBnXwk~WYhRRs=x@uKrw+qL{eao;mx3L zMqsQ*d*K_ACI7bs#;}jkzX%LjfHy5LqKPcU1;#s-z#vnQjV}GJ5*QR95Ex86fWTnH zKnsk6N?@q^zs*fWeSO&_1O`is2n>}zgPU%c00%W$MoP7_b1m(x_sC?D4d~GvZh6lC zgn`l8Dw;>ZDzRiEwuPG@7{dD;L>g$ap-`zH)vbTciO-v6xKM0LZk+}*n3o0?i6a;V z{8ZBOh=Kx=Z@Y+o-rFPJdy#!Kx54J43?>+NXQCe+6h(}r)S&__jrpNdKXer^aX_gl z17+}034^iw60K-$Forp+feps|)C1E4kdRj<*#rgwzVf031J3>0_1*$(@Fe>Q`m8Jr zn(((&R)Cq|>_yY-xBMV?0ISYB;CU@J``GYx$!LN|a(0w=Uz>YzG_ z4JzL<6y(VUO6M{`(Wr~glVnhveR*B z49M++m^{raF*u8HlD4+UNi+fE;t0A54BZ*JP}Z8E3t&YxG<4xZi~|3%4;0$uW-ozV zE{3m<=OnaKr;s4EaRi(BA&du8W<-GL)f(Q=F>)8Ux6s4DF=1K#em4Nvbm0T6 zrHs##N#<3`y(FeHBR0%XcgiQlCA%9ur)rsvgwO&EEQqZT->9v(j9*8uTVL+MpHf}$ zkdSf!YJl`Ldyod;0WEj*<^;z)LX~6yoTd);BIYaQvOYmikSpTS+Nn(CERrL^*l6by zoTJOIzY_5LjS*i^1l#avAkYOm4!SV1jV>-7%{N7XxG_h?k|&R&uR2-%WWl3iJgot_ z)hyCdK}G(c_{jxbfddV^{9RP5#-{^zrm-qyr-j;sZ`j(sY;dBh}N^+*GcB1RTM`GG~IIOkBba-ghJs?}ml=tb6;(UlR z1^2e~W1y~BU3_EhNNPeS@(ZxBshig5RKJq`P0Bk%-{F$k+~DAIcT-+b4fLTx(#1nhY#;rH~qxje?qSmq*jiVb=J^7pBkc3Y?cEmhG>sS{?1(n3m?g2<$eZH!Pp=iSn$&8FL#R@>1TZ@&)xYEJ?0)0glRdbwfh{2iCywT$D>CZPvm4*y0rH8PU!zeu&gTH>W7wDuK{ zzEx+)ZP+kj*e`*R1j*)C}k$22@4Lr|8a~Y;4J!0Z|RW&PH z0Zr^YP)%{&(c+fG83>VjnCtEvq&Mq%IWATr@E=BxqUrCWDqRI{hUw`Zp+m z-?Th}KZnk2DTB4?WQ~%?*6Ie}%~EvXbVeL@Ad&H~1I}Guw_Y5jUNiorT#sK3sK@8Y zKFM6d_UnbAJERKb?AP&}%HG9Fl2xbD>{9NJZbb>c{+V(+cwN-s3?#}u2E91JPo1b^ 
zqF(>7<94v|;CDe-WRdsa3$M(;!;MuX?*w+MpKn1<#CVgNi8DY?;#?`{vwQHDsjLi> z%E}l?uVy@MJvtFRu0fAZwnJv%t-{fu3g@0*{zCmrxt-1-kTGu(J^|mgc-@I@t9N^o zQ+jK?Psfz;fQ&`&LKyQm5%&5i!!2`?UNfHWT8rnT_qiCg#q^~ZgkQDodw>c%cN?Ew zWKs(@$fCWt)=cidcf1aDZ(;}0f@@p`#scBB@r^SQu@Pt9i1YsYIg;vUUS2<6ML#L` zeojZa8=w=GSV+1(PWHN1lw(xFM!425!Q=Y{B|dC%)hDg%?EQox_Yde(EDLm?fD!_} z4A*uc0o0)1slJIz-7<+wkoAbyWb7Br*J&N>xR=w`%H^N0W4pmvdI;4BiQ|3dHK$LY zn2BneSl7wCGNxq)C`>@j%7jo|h^hMLD}Ivlru{nEwR-}}NE68z?it6YIouA;aNsJa zO{+(&uWHyG&)i5r{Ki61sn{@WTb!jpTTPFkmu5T-&MAWVfyEd>Um`Vg>9@oH_tMRrVnJA2(luoF{d*hZk1?<8^5$D&MS zW!z9fhi;$EyI0g0HPA3bx}{-4=4~aC5alc#&Wip*o8@v}sJU92AOCcJC&Y_xazksm zaB#~&ztWx3DCY6gMhPfQ@xgw|;s@xnCt2T%Hc%RDlC0j9-Uf>x^W2Hv7h=gNPqZX? zeLl}QLKJ{e7GWJAsr47i)s@(N4lXyEyz>E)Xl|FTp~$`UKhd?jd!QAsAUam#V?V~c z2Zo?5a2x96KGpJ;P|LH>a^U-D#<5?!M+wZK+#3r=Ml$`UQasCW&8iVi5C$v7B;1pm zfW~6ih~HqwE&}nJd)Yv5?YI6b<{?5i1>|K0$E0sC;AQrls=9u601u2x359}ZfBYXt z`n#}Dr2oY)p)=ghfTgmRF%VT1@3IyV`^wzq&!L5-SAhrUReJiBUiHSD1bT9&ba|i0 zBK!i2;C&`ogjX&bWf4T7egRnGGO0zl3IgpI7C~B8gj&u-OMekwy$8oQfnj_O2>@jJ zXZkvFPyr;1K+#C26fP@3<6sef_KaEt(Pj#7XHq&^i!jz+gqcdEuhnzKOt=UlT6g7s za6{fA>^XqWaLgB|HjAJ?Rs!4X38k-HxMhyy?nlieE{TF|_WPE1>oxqDr(ls!t_`w> z;G4TRGM~F1@AXIHu;#mR2xE=h1Kv^uwJY~cK7#*PJMG+;dIsv>G z$9JT;@81AkE~e9OR}M%AsOkgdj{C6eo#^t-%6pj3s*3vZ9Xy3)FwE2rU_JWQr>p(Q zShj@~4&~m5cihM@KTUeaD9}8z*}&M?VHllrH&6whfFnk_Ezc#<)Q<0PJ0ka~JSwn8 zX3DH08Sw|#aHhC53jO6zR^gD7!xXB71yxaKUWEK%^h1anv6Q$%#l%#^E&iC@kmPM) zJj>Ioevpeh*$@!F`o#KoVa&GHi?!Jyedh z)qYn+H>?F{wN(w!!A;76aD;J<=?27LNFY6LNUV}yCSC2x-RM$|$?h@T?*IG}erkQZ zu1C0o-&iD8Xn1E1C(S!|(S64Rz6W6zP#ZY#Ic2% zYs$a#HwxvJdySWJ1P&%0aWCRqs}jbsy#z_Gp;0;3*nfc#dETdO5Fn%wqxBSc`(bm7u;UuNpI{3cA!ddrArE`mJ!ORZ;OO zfNvL1s^vub^SIP1FvVkVsj=Fzxzva=Q44u4H8t;IE;W*lg^lBOsUdh_d|*+38kf&{ zcT|5GCu{0M8fz|tf*vNRNxmlI!2b{U(S%(<@NJmn*H6+=@jy(m9gvWU4)Is>ukdHenE-9S9p+R;wgeZjWFacN1saH zWv^2}2?3{HK}GFPV;XApO_|s%xHk2D6?jdw4 zN%Vw*hwmkh;;;aKw_)na5)^8%qlvkSHKq9@2u+FOR$H+_DS(o95 zA>YV-Ts(D48hJ^wcO4~Do|}#yF6;=zNdLDlVp8t!3mYAaV z+f+cu**eOLqJA)bfhkZ_{)Z&m4~1IF3swn5a%E)#l&J5?%a z#IiJX!5HS+Hf{$Ryaz;(!vl6B+Q0)wC3-Xu*wyEQ?nd>1J)>(V=Sv(r|6_2zU@umZ z`(CK!j%LgG5_rH~n-lhcEme)goaE+QIbUu-<6!Ns*hU~i9{)2~Z-?Rtp(0QXQ0)QyV zT?i}%9eA6aTGLf`>g2>@&l*nk2^JYblrUW9L=aj*#c9#M{O3N`aQU^lE#uoUxvt-||1 ztp|+k2O^=0P>c!J(XQN;(02SVB^h^5-~r>qjTdl{Wjp-W7xJfs)XVS_vWac?N=&7P z9TO#V>9y^+qQ&Fq_wyKUCv4w_E+)FCd3t>5an3vNY>_aOwY#@<~< z;4+q0ZM>r||0yPG%$)Z(mCsW>#-4iMMyNtL!BkaM8e~ly_?< z9#h%XowcIvd@+uVhoA~?QClW0`cxy6#rl>*=vwvOPoQy+Ac37$O(A8Adm$j<^&5pu z`H}*n7s^9|3yPp#VMTN|!pz2pw6=)+ZR_jAn5pXY)_BL~(03}kWGlln+0`59-pQsn zfx9p-U~yOOnJ#ZC3cUGS zdBX*Y_BIA@INs_G{LOMTL)X{vvLo*sooHCVi}9p<<&=s3Px33LDSQMIQNP9lHKYEH z3B&RBQ$T-V6sOlCKGlV^ErVtU>JcyvECepI=Z-->!k2^%^u4wR$5*s`0)9*H`c)Jy z-p|xU((=W^nQve|-d7}70$lLkACHXmok>kayF}eH+nv|c`&xN!92#a$t{B`NHdt2C zB@CTT%Z|ZeJHMd~xE!fmx^m zB-!{$RUc;N?BnvS3u5wd`PKzpp)h>-p$l}QZwRzpGar_Tz9Bpe!mrP=)p>?~YE?2&hB+FYs+jg0^Z)Z9(j&C^B%uGDJ<1 zh3r>O^3#Igpt8Hf)l6^UJ_HO??ioxYj5T86n=FF$<&$Q*g4)L1SpUe-OGi) zB*}UAfnagp-TQf2-2UA6q^ye`{p?MIoRpYdU6#A|YKVC*K^;7Kh?={z)~CD~C*saD z%n`R6cL@>paUn1*eulRzjh))8HT4|_> z;u+OlPW&ur61l34d_v13;xU_5Hqa>?^E7VRDFdu)awom29dLt}=qw&}vt1f^*^|4> z!8q@qbt|Th7F%;0_>4>cycxfCy7r89PF#&%&fkhCOs@AUSUP1O@S?r-$ygt(f=w;u zPW2@lTCP=DL+9fTr6-tG^{v4hXep4C013>5HQeW|h7*YwNWV{{shH)5azDpzK>Q}v z&!qRIrYz}1@5U^`<<%`sDI{1ae6%vj4nMiO&U%6*w;pTtlGrfrthH#0yY6~dd_CpD zOv(O;l?UK9Lkr!9vW!QMwf&4go3e` z(~Oemj8fj*Mre4>5gML=Ml^iAui+mQtR7Xv6KrUBnS|2trxV_YhL-`AE3xChIi0si z!&m&dHT+jjz`10g*5yx8c9fqjQmg1@DCQ&@Y7~uE@2J-6&8h2 zsx|zbh#M8c=$q5Q=!fOWSkQ@JTudO;=mcy7b;~Khx(?eC;b1&;Bc+^%+d|g>1*+g 
z?`eJY)Rh;vKNer}L2NyRG!AaAX_=9EZYJ*B9N}H*>L_cka*~ZoQV+*FhG=uyq|=S& zAe#e1km{`Jqt28^ov!EXv#Oh&%$~#WO3kR^W(sFj&(c1*OmBnG8NTGn2A!a@RiLw1 z6`jqQ*qE&D{L|3c!*rV4_;uqy;Igy9kB-H9E>Qz6Oi$9R$08 z6fs}(8yJQe<5S(?5Y;u1>b{08hfj4gqg2<{+leMemt`P4$qcm*N*^cT;gIxJ9#{FC zIbj4QWFr}CZ(oGsoU7#8rljHl}LHt_*eXvW;<20%{+{K zM$qiq8JrMX{wn2N!{0JFr>*xvmQ$h@ru20a#c~6&KmDgVn4M3Qw+m51x*jLN!y7&U zSE)&6=D0LYXXkhQ8R=SoqwV1C!r9Hoo@v;QiQt+7;xvQC9 zA3OA<`Y894NrYa~tCP632@x&M3i;Z;T9a?o>~PcsIZBem6QUIVU^j#dF;Dmjt(PZzVHF#-;ZTmJ3>vS$=0Ubmktqi<2@TUBuN1 zCiiA#*jx$W(QD6ChhcOdESG_UndivVFphr-iQqaS=16oFYyihP9PLO3(K+Qtu}EaW zw_JRi0FG_cHQmnwYchCXybPPQ(+wj5sfQgTp3D9jgl5bJ!LJ_&^Ync`sL1&^9Zz0$ zh7Nh?&jsb^UFEfi9l7|LCN4NlMl7?w6W#WMK9wFGxCfk_#2GNTf5AR+Z!<+OT^wp_ zRF{ZKIyr(v;<;ybvD+srNz@VXsulwgr~8~`D&a=^`?R;8Z@%H%0k-z0|K8-bto z)x9PpVKatwqatSmEh1h>JwXQ~I3b2p+ zEhLXguM3e`A<%2nSf_RQ0yG4tUn;Xx3B5r(MI=6>4155RD+!u)9~NT+pAmSxpV5RH zoXNxKSU3EqNSv#yc|+n;_&mzvK+}QDvoVV2lVIrZ@Q@u{ZQGZFSq0Gthbsdf-v8>v zQaIkfw1PpDN)Pc{5KZ39Em(S%<8`PO`jfERos(&0z^q41FNx>|h2=mjIr$L^OMH?s zTekz_jd`4VEktvx(l2NshII=GF}#5gBY2h&tM_i2|j?pN{wM>eP| zThT1(9tgym4!?=74QmIi-EMp-y3b#j$j7Klxr%{{a&nb{vpbqvSXe4ll+x)gEP$^u&) zc0(NUAh#BWHKB77K4UA6)zTj@tj9w6B0&p7Xox1-V{|a2#8b=)BvT}yl)8u^G!+-qwH*YScWR) z67&Q)2-RR_V$He$TtJqjCIOEn1H<5aYHL!qe_k2}+8hc&OP1o|uh2zwLb{fkGV789 zVwnd(v0v{Vims33LJAH{#VQec0MqCz9Q?z=nKRjVp-^N7G*8^lK^mGV%dk{ubXj9b z4ecu<(Vws5T@HE9vDFlTHK4<)99C`$Na%(@xoJJ>2gGrGP=yeh=~R|NXK$#FB?ki{ z-hpKjN*!1Uxf!w+OR?9Jqmj70W4r$~wR*39mFb{P4eGE>KRQT;_Xao~BRJH!Sl1){@yMG1ARDWRIP%ukL!4T2K7{WZ(6ObW z&H#fR{UZ)aKb_&w>Qz?~@Tm0hDJN_%iYO<%p?3&?SwQ11(nhVBl&MX%MAL?e7NV9K ziK63COI5yFs*q2b$rJS4C=#Os64L#Eq1BVe6r6R1d8IQbyH7rO8xE^9HUk~wtI%YPf>Pxc1 zdZ+?*$SK2ULp@}9wDR){oTk&d}At6C%JFI zELWghiT?KkOeDHVIOJ+>n)e{7lOCH`-vgakMP>hehVRMr$t-SABg_vu@j84u9{F|` zt6fRcFlOYPk8Y892QtacfqcYXdyz#=?4oq=UW=&O_Lb9g%J(}Kwr=uVkMRz5(~WF{ z;el+$N*v7t8E==nyYjq{kO(l4Iq6_?K}SZZc!hM*v(BRR?j?(fA&-RFw9dIzht`ds zlFZ1EGH3)JaxCJfR9k!^!NOBA*w)ikDO}CHIX6EW*sCQ5bV3!3nvkoq5vDHLPrQ{; z(V^NX9jZ60MO>8_kSH}yNi+eGX*aiOqT=Uf=VLsqm8ONh+DvOH<;GMuI|gpdF2<&8 zHzwhwjl{Xi)_oK=reua4b?eTn?%;~yy<1g9x9(yMVSdUQBC?f=QsxqN-=r*(u$e7) zh@^wSn!qb-dw9FLS+b}*Q}H-D00&@?Xgk!jk%Qe~upiV;G=G0*>@)ZQ{IZm68WqGx z_hpi@EZ{jgEA@PP7R{wbWAutNwTrD{24LB(6+)J!QV0@M0_O^6=OM$jSqR6LXn^uypeMrY@E zehb=mc3wvV1DeNncK*Mo#J};^MU?oXcyUxEK3iHxmH0Z{^ca-*cPJP{iT?w5VH73) zx#CJ(M^8r6@7QxI2^{$K{z?k5hsxJ^pE1j~lS_RCyL0 z9#ZA8{X$9ks(e#Ol{*yIIEDySRVKGdrzkRU6Ekc#T!?zJs1n{C_SIUP)Ee>qI7rCq z8aNSG|BPSxx8$@^+ewR4rS6*s1yOZljO+KEjKe){`WdpUKf6c*?W>n_nY*-`YF$+zs>Vd zxNoaM2CAL&C72lnN1~fBX^+I*aW&E>FcOS(BHA_=+$RQa39Upa&YH=U1TQ;YySi!1 z+_8eL1cye+t!`u$x>Vsp6CH#(_2stD!boEAibj)olW_v5d&Tfp>~LI36^iiP(kYlm zC=XB?Pv}stHQZs)SA`gWeqS(aegC-jlE^ndxB;ke+((6wlj>Ap5=S@)Bh<(4dVWPi z`!NWaa>Z+xHG=y!;uMraUoedH@I3V-CXK8k;0B3cHv>l;LmKPxEd84szWgv5=Z=Pe z$8Sh`r4chWSDA5X-G{|0+aoaRHZb3VHmVyiiQO};>PQ{lo9R~xCB13dfND}UD9+|9 zrTB`>cXu!}`v{LLCB7Ro9~T;8uz)|dIM7Iq#s{9n=L=LJY6XMMAq#O+7a5Y}Y2le|zN54#sP@`pz!D7cqL?t%4)5CPhFGj9KpLDpD z349=C94-^W0hSDtV2>6yS|zR>;yXCQ6U`t$vDnZH$RC%7NYKI1x)CA@cCrY?DJ)EX z4zj<%Yw_rYsK(YCpnFR>6}BpU&y%NJ>ifdR2%Mcc_rwBj}a(+IPa*bSCbHL_kDp;7M$ znfj#y7|UxkhsZ%$xFRklIqu3m#B86oZXrLe3bYi9|67uP#QuMPN0ZumNW{eme31|* zZi;~F+d&Fn59|b_Y8aMl+;6DYUtqOwQwzA;al=Bmh8-bq3Q`nsp%O%!ujjlvJ@lcfri=i)IUn0f6h>7ePl(11~PU5x{^AQIlY@T+q!Dar(A!A<+ zk%ocXhe8d*I@uWsEp#N15gEU&hj=fxCaYwYNUHjS_st`I-eMWL0 zrHKfE&-P^f{=4elW`*p9yd;4|~-0Kj!U-#en zj0*~|9?<@~a=2DBbiXbx@EN%$9=yK{&#briOpe=AimN#Mq~B8XBL@CUI+{7D+d(8# z?zd4XqE33Rhy%zeF8VF#giBf@nEx$_av_dJhxm_sL(5Yk^6L@ygFC^XPIY?Zfg3uBu4Y4f8 zVC?q58EQ}kGxVMi=Nx$pm~mMJM$iizH_%{S0*#RDE3c`Zet@2Y>!kFZy8myYR4zc) 
zJ>NFNM}hF(f8tr$ZiMDHm$MI9xv{~x*JA_+ma0OGG`j;8snQik-y%;2`0Mdxz8A(B zO!tT28eGM?FrKiuN{)4B-zykAxVa+Uo(AE|wHJ%TjCsxB!Dw!>oGfyKvHHT9{_4{X zQM@{o`vK2vdFMgCi{1GSfk}S;ZxF3)qV!7rBl<&!y0g>mq~nQcqy#@AcYWaEw8{7~ z*8Z8a#mLlRa|5hVv$XipjhV;|!xT9M(DT8~Wi4OEMeUhaV=WUiub$BIaRFoS8C>FV zp-`$^;!%AL!M{(YBUcfN#U!BSzIZqCY|F?hp(dxMqD_=b7#PTXkWHQms8c&(dF4nH ziE)hwhS;eqXW^7 z()vsKpWV6^Us#|-U$}e1m-u@fes}zM;l2EQ1AZ@mY3U69rceIkFQ0o0f3L^yuYLCI zpW*MD@%!3EWB-%CZ^iH1ew8_uzi-Fy_x-Z}q9j&`{TupL7FHy`qqbLl0il&Y_M`TR z{4s*GP^q+ zFEwp8Z!YjYgDWIBFdZatjsO<8AmZ(Z(Lga3Oo^Z3HE!phG;Yl#BpP<&R9{aT0kZzO zH8+ppyWaLO|BXMb_lMSyfM$0VOsHxzgTnav~Qrw zn)YctH$mJzaKq=G#@|HKop-+%e=EOz!ReRc?}r|K;5%31Z`C)Sc>Y@aop$D>k9Fej z^u4bfxd(q|JoUqWD(CGOH8tpn4ts}^ZVeyo2_utT+&QWS2!b{Ek5&vUXlVFDxy>hm zAF)p~xGI<8_;7UqH;xM32a?RY%CrN8UUF$Tk7e->6>Kdso8$ zhc8wA8{yA%*XYYCY)8#rS#oS(W;)1monoD}MAUl%0^QFZcHH*2$!OJ2r=P|RAT{)F zX#J7BOI=c-kjwjub-*s~tAubj=A(?0y;jP_Cm;sX^*{D+pu-*#kx~XWtN#mU zE9Y@2Hp<<&0W^R+QvR#<0hkUY1A`pj4XVElSNjc4%WN%kXJIC0E_0)x zCEor{)Hpb+q$b`jD+9BPLyn}&YA;9u;o@VL6X@LOeeJkYUL5(SPp;MF1eCT z*``E*Hwkr~8|%dA`T7kyXn$JnZiAa1cOy+q4|ol+%*gn7JGGsA$(T9>{~)pesU@-~ z>;$nEMFA`2xwCGEftR4#7h)O2-8{#G51`WE;}c@u0}sMrM8?%1Rw83yKQ{qDQ3df? z1w0BtAIUv*8wP(~Ufw0M_O9d9vX_)(epk`*A;SiDmlAuBZa1+*Z#9S>PJko1bOWPs zFAa{!K&*Zi0e5>C0?Ap_XnYN|afoGw-LJ0$L%VvLmUEkR>PYfJ)Rtr5@`*o{}4e zlt7_s&;kX^%@;=#DD=KY1PUa1y1q6$CSkGdK&iVv z5GbOv4&}1{0{PHHal-n!PkkFmhdB8z^z%d!C|7cP|CG{9M56{j4sjx=0o|*i-)@GE zcU*${5GwoF-KX64eM+coflzsa-_78CYY9}TC!6vE@U~buR(s3q}~| zBcNfRC*Hmev#UKgCIPwfEiS2UUONIKPDz$#pgjf2vWV!%GKdQTl7$XZ`3RMkEC=LL zYm=ajukTbpsT2_W8Kc}a{*I2mH?kW8V^7`ui z1vM5OrWm0WqlTxe35PCpy*ngYX45v!tjpIywycL?Wyp3A2eQ?Q9_x}|y^V1D z;z3LlNa;k8FW=hXrV{yfImAm6oJ|>)gxyaeR+~K~K`HhT!+5?F+xbJa4K-ry$6tUt zxHvI!0C92IpK3w&6V$8COiWY^#PEXC5LxyL2IB)zP%^L&4`Is@<`JpSy{m*kLF;Zh zBX!<~&r6>DVdtJt&HDV=vp)S{IPf~E@D?AKWQZT%imbt(+SHs6&jGCL}o`foY^4*U$*J)?Kb2l0P(AB-Wa&{2kNd&is@!-qD@@e@uVdYJEg;|K=3r35I@ zKKj(i5v;}ijZ&gL>Z)}sTFdI_3)4n&ONL;fS71EY0_^QF^cL=a4*O@AXix5?BkXu} zt9-dDFdl9xB{bfwwYcO>0P`-{7BZ@F>J+hk!lSoHs#;s$Oj(JdmUG?P)~2do#O)Dq zvW#F-n4w9@R{IB)fR^0kzed2{d}9XN3zG$hd+AQbO@=4WT95y3ReK9Yxp{vCEYXh5 zarbwE0(@KGFsFDES07G3jpEO|E-0XeO#mCc9rWs#2-FLoeYFq`C$9 zclOf#7ru%_8@0VxXb#rUUH^x@_W+9`dH#ooU9t)&sF+R+XBG(t@C=|Ri=v{K(}IAA zA|Nb^8N~pnnA3TtGo0bnGoM*7>v@Xtj3;Ikb3PMqRd>zq?68CQ{r$fG=Y5~|KhFa@ z^O@@E>gww5>Nsd`Vt?}C?+8!Xz;b8`Gfd1gjrTEKqrmiG(6DrHfqc#_2CExkrz854 zr*#zq(p!UPKhFm<3L1x0*oBw*2+n+kCDXZ=0*K|%;kV4l2S8edIP8cOo`=)lbV6x+>vCzSM2?gzJ01Frs)!-rB&vIH}lxa`7n9@MUB!Qs8Pj4T>HEOgqR^IHV91AgO_Aq5o zV}8u(Q3s>Bs)3z`EIpbcwy+r%H2MLXQSpQZO{J5#!`3FNmvk&tJcs;qV0vT-TT?TA z2Bg7+II?aM4j0Kz45ANd;IIYMX}l7E&NbgfK|UPn7<%Gt=tW;s00o=2kkG(U>zoYq zCys(}UTj(%_=_VW;4<;$x*Wju!_@1+SeQd3$z_lls&uErki|!(Sqx`!=(D)eaUA5f zxvbAm!+54*X6g4aPs|!#S_7T$^ll+XdD+E7OrF(GoWHjRrdnZy_vJ5QeGZZbt%~{w(6AQ$+-{Mlm*&0 z#!)!A@z|xD=CIBlkIo1-6_;MU5ND=_PBSh<@hgcUesnPwaT67}bMCyzUBj9o=i*jT z+4xj?hxF<0V967+kesxc5-8_4{Op9j`iSc%eR^3!`efTo@`TfmQ~12b#f1_)=a~2; z(IzbKd&SOR;Q>}6349Qf*P_f$jcXVfglUUk8Nqp}XE~Q3v6N4(AZwro0PChIpiKy` z176}zZdn}w-F8cxtW*f2ud(vUbcs!rKgcQmZ6dY&-xG-@N>d)!U?Yyu)jh9C`%5$K zUmRc_R-YBJ`B;3w7fdW?TU;$cG`^l>?So#n+ZTYF1 zEmPIhe14!3f;Hc1jvYk5hUqad-MoHfry|gn33-~Iu$jRLGqV@0{`NkW2sr^KUh&L> z&>%{N&Rc`~lUc1*6*{q+W9C)aEAWlU#>*b1Jfsq=`AivZy+3g^qki&0oW%-oAqVQ4X?(h6#ZM{S@cqX&2c&-p=e@m zU|I&-vY%rXSP2eQ&S?&vkZvKGamO$MG9hUzBC8w^X{K19nC&x9# zCU5s}4jO1`8IJ+1?#s^u7KA}4HaLdm#U0paU`E1+bq#Ok=r|iVz?*CDBj}n5hTrF4 znoj#7ZMMb9Z}?<{a!o65wxP7b2$ONs4HejQj0--K0`ftn3u@7XDY-FF)iJ49m1YaJ zV0ii@eUgeQdKkD32OF(v84!_)R$VSF|LlcnKcgwz%>}j09%N?1NMzLvx*AO8Kg2ew zKvDEI15IdIUb-RkQH;atuHn=w|lj)NeI4dQcn 
z$;NcCL39C5*dWGHgV+{1jXQi9Xv8-09;ja5CNjmFZWF(?p}}U64_g{FHqbIAV2tAq zV_ejJ?C~WTSgn?GQTk-lTKaUMk@QJdUi$RIS?SaGQuI`yr=&4uB+M+Rv1P6w)Ktfe zJCxvIwf<6vg@{2yi zl+texs&yr%Sguj)idM?<-HSN{_dfhCWu&=~Sl@3-gn27i)itpm*bFRifFq%8veMLP z2WjzQIas`CKjBvh356|$hxAL!#!lp+DP4rL%t~k9W;!I+#s!bQxJp4yb-IB^Jb zd5739_f}w4LT)*5l^K*i7k_xEhqMOW(@s4M&jSuJnKpb?CYaRBg_2`ZCRZ2^VIODH;Fs3Y zyRZcu>wN%(k$G4b#AbD#z%8p^YKV0(Yj89y`EN*Ks0-6zfQ_pGKl_H`6*qGpV1=6| zE~)JVoY^SG&79nGCYP_u$LLH~shy!`1ruEPE7eOToIWVm`VYoQ1P2Q`%X%jJGawU; z>pu9T8|)Ykz>_|48BP`)y3rhK_VnQ)`3UpXAkei=?BO>h*t}KR*$(_9FIZA>Fu)9E zf16wiNZXsmmsGHW3t^7EZF>CxSWg8@D&W)ywv_cKUr|BlT0QbjA86*A^1(`1ENzgv zAuaHkV!ldYHe01oJLR5>t{U{FX=V==hNpwJ4PVf^*eow^A0$=C-H9k)o^SNvV+Mw3=?sI##z&*`$pY|=ALJZco|l()2xj2n3qjeimeR!M0^tZq;|6J(n>6{3c5lpZ*r0Lq5Jl>6wcJZ%rr84t=H0upQ=a$uT^_1C3K%3u&E z_Y52#@wiQz-xHiG!Fl}>a01uvJ-AvRcdI^SyfY$6S=dTSSu9H#-$r7b-*D`q$;7oY zN}TLqGO+#!IkS^Yr*C>kN+0i#;GnZKea!dzYWk{5(uXA#MkD!ZYZ|0_H3cF^-xy!g zo04b>IakCttYoBO)0XxgPDh`65I&Ir1BPi%Rbjk}HGOOgGEARWUB;-`wEF{On76I! zPJrmgL_7C_+Uuh^#c!aBK@>O0>Y^-PF-~+l`Q*vpV2^CN6!U&(7Sq{I9CWhxyPSLG z|BR5|UH*!Y9#SD466^hxA^G9tW(f9V)fEKLgbTdABQ9la{tBg=B zm~oo)1UIN+#j~pTB39ZZ$X7rh6_p_tAVoYzh8aQ5NYN8!;tn=?x?D~oe9lRur{Xr! zcWUuTKI;e#$>`%S!w9F%>=dj)UhjkOLAQgNboS9hb7&`EvhgsUfngtz^5D5-SXXB3 z8u%_8`;Z(qmHfjm9mOUGj0WmU-sIru9Dc8DTqaftf!0Axpf`(zW9tCwMI|(2Ac|C;`H0}ti|DiF;$UTA^ z{9u+3jsx8Z{NP*aM*{3y9tljZ?-P)D-#ozVX+U~kaMsy6v(b(?(eM8rMR0)`e@K^; z4_num9*AGGghLD1Dyo*~;{Z#y1DFK;dWdM;U#x@ARzh%?r#Oupg}Vo&<>i)z6EIOE zzDV_HNT4_DmbRae2@p<+rQ}LZ6Jw>7Z61b`xR_KJhjGzQW82?g9TPu9lumSmIiD%8 zI$r}gKOY;WKjD;>9M;jnmHOa#Auj_2Edq4HXoUvMmEx9j!&_nktC?Ih-ZkteLU$V0 zR}6rDp%2&4Lm1y;-zd}IR^K2z#ictO3bz3x!M!sz;iI>A%md9{;d;TEFb9|o-#TN5 zRr*=6Johr%k*e3DJax~h27~oSAThW@Vglg^8F~ek8@%-rlL#q?iQkcs7)F4L!C2DU zrbiDJG@19ntGW?b6OJyhpD+%D-6=rt5oZ~E==)OvX-AkLfsv9a>1?m;{rGMlJ2BmV712{D1D-dpe;Hx!+F>lUhzA0slk22Vj@{BW>kc@ zPa+nfN(C074B(?LLh;ffG#6h%DOy%gDFWNnmm*gEKq=an3#M2=DYD?DXc^h-Cg{?e zDn+p2!l)E=_{LIH0ZI{y#7fb)r$OvG0g=6rY3!l)PdA4JR@vrA!bWLwoUj9wa@bkqVJYQN7JILphl8Al zg@MP@$;2Zk;oO(JbPfM_E=m#{i1iaJF0JYukX~LDkaj4*`*H&K8X18#vjWq*!;+}S z*1o|(zMXQ8ZD34XWXpC?eb}hxKOP7U5@g>S`8((H*)Mo3k0fGTX(65t1d~cvc_tKm zHV~@~^Z8=RX9FQAKO4vr78GG+4QctxSwOef^F*>C5?m|BJ=gcKxU^1o7SPZaxU37m zs`sCq1(YO>3D->Un5IEYU_Yo2NsSC2Z^>Pivm=XVUNTff-MF8kO43DOds*cm@1%ivFdatSx55WEtkg6eM!(L}Xe?eIPZ-pu`c<2i*g@Eu zl$Wp`)`X<_qB6}=IYwL);IbQ*C(_PK?{~UrD4N#xJBpUlz!C-+t6|%_%!{(`_N9Et zOU%aIXZppcBo@UB4HQ>pA!eXb4nrwZV#K)QUW1M&tVn?&5{>s4*v0^TRe4hM9W$`3 z^tP}aX*Bp!xrv<$HwOl}3+q-6<>Shuhq44ay4WJoce;(5bYEB`S_=C;v_KR;P4hhW zDw|-IE=vJBPao;RN|!Az>oCK`f$;6a6ZkC++z)1RoK_)7L+K5`R7U5^eTn;$@>qi! z!VjK5*VtsOvC{RG&htR5?22;G)2(YhJSAO54>4JL1QcE@XX$n!?4mtzX*{HdqWvv8h)X%`PZPYx0$fr z3YSNHy};a)q*NSM5EpS%oyZc+tHED_RPEvq1{OU@{7tyG2yG#`~T9gpufMDHIR*fXx3w-CNiS+2Eb#D=|!tU4{=} znqv8Fb+3UJtD22;s4vY0$H-iQDV;6Np%d&frKqLG1{P*H!QslVF+SzJTjJe(3DVcJ zUwBQ65n(L}!%|QOJkXeok9!_doXpdl)UYnuC9Sg>$AWkqFqnAQjaNuYu^FQJx)!|x zV8wCaKhkPmG!RYzfPVOWrO5T)`N2u4uP3!0mPnjvgum2&g_J zuQ8dQl`k9hMBELNtHbbG8ataG2s@W=cb_KZ6&#FN7b_tU0T-L%Z2((o&W_m9W01Ml zqFCV8)ZsNqmCz)m52Dd11k}C6-gUv}l8yKk%oTSdY=_oeJD&HL14CT6FAS6WL5CFfTmjk^|!d528j?0?# z4$wmTRecZQCN{$H$jg{jc#|#O@Vp1)IYWT@G8hSWx&pnA>P~1BT9vDYX4q8g3xXIt zoSo?2f->{h2%ViSW0la&P$gXc200-YN--1R$CO~DjrEF4;xlFfitnIwNt@t;+74(} z*^aP$O^d|+8}~RB(}!m!vi0q7!W|~Ug=rLSYKS*(;*A%(0cE=|@%1h|PJ$XH2+iRm zIp8y48{uL<36BJTin9_JB$z1lq~Y5LaLC647@E2nuuqgTU#>VxeM!HfK89SO6Hs7? 
z5X&*06=Cpm(6__k9TJXhQ!_Y0bbQLn5V1G3pZUAgIduWMDf%sh5w6aDVpukN{Cvj6Fyg|npzOVsNN_=C+ z3o5BV*j;x3zF7?2BiO=r&@7dGu^U3a#O{%isV)ExFpvV1xu|NHMdnvN$S95kQ!(K- z8b@UfY_xrPRacm51`{m?CK?ER)AUw1VGA`J;bDOX;$&-5va>Zcp;HY7BVfNl@8Ni= z4m&%8o#TS16s8Af^AeFhGOvKMPW*}nY2cuf8~iAF$j=}+S1i~Ci4Lp4WmP~*( z2bdiWcy|*^Wry_WrrD@sne74+!t4fa2Cy4YaZtOOkMYoKJavROlUS$1q1V!x^s<#eyoReTD?;J8o>{BTFT80`(|&BP8=MSUA1(Jv1M@gWj#I zo2G#ckYLG2z>;x?8JKfWt}jLa_kpm!Qfk3xxdpp%3r71hvS8rBI{07Eg1dkP+ZkE# zBeY;>{r0;+z??DrzvQu9fQX>+=0pRM&*9P6X8PGB@U9_KpuT|BYQ{UFnNO7S0@$Fx za!BLRvTld2tvUq<^lVkvFSs*PBt-Ooum8_#04GIZygcvoAD{9pG$U8rZffc1uqmPc zj1P^2e{Fy5!J$@(lkT1(rpC4yj|NmMgY=z>I6RJ3PPZ z)Occe$>xLaISqL}1nAX&om=%+#nzR7>od39#Ph$mi0i)O&)sJG?RHE%7nGF}Gx}e( z@8qz~5iZZ2H?I2xvA=2gInV#e%e+54q82|~9?&?)YQ!Fo6!%wa9!F)oUiMwO(=$s| zUbAXjZDCIOe|CMVd^zDi4lmd6gv;NQMt^t zf0o_*)6vWI_puXFuRgOY-gtL9=;!0(Iz1=N+<8ODoOZQ#l^4@9>?--6d~vi%=0)?~ zBMc84dr!~n-(z{*bCW;sc`~N9-OZjuKjh{0GfVuKB4{3soL;=)!yc8JlLORXRrQT+fDS)2Oo{+!#r${WYO%a&@lCe-TJ zzHJWw+I~f1ojpGn&#crV;i&D~kL&k2Ub$m;B(E_ z*5=DH#}gCM9LH`*`e=Krc&F(zo7pXHVR3)mVJL6yg3shd2A0m7A6()4B9n6yULGiE zGr7}{@h`6)s=MUm?nRBgHQk@{@Cxh{j-J?-hD zo%Zke#TT8+Rv5W_#PUhgW}n-#<&bJ@vQ3c|j-!j}$_IDp(&gX3*Q~p$d6##lQtSOc zM^8GtuxH0}l~k3M$7x>FT9Fa8wOE_qyXMtAd}_0*(X{pJ)~&O+-L=c`0an8GPd6Hs z{IRUryrMBaVK2hgT9&^*Zq4|>*Tu|orzhK2Yt^B2?^luae%Tkd^80PoJ#Tw`$w-fU ze)MJ5`4g@SdiI)qqD%SkNZs?>8E>n1p0Tb*Oy5y>5Z{0Gn*SRtV##Y;wzkDRaWwAQ=%=EZN#UAt5Vd4~GA2%44)g`L-X zOo?f8`r(+^qrKK<{NbkF{72{SK7QKl6MAb~k+C-W$Einmfpu_ssfjo9io}HHz4ARd zu*LdwIh)_T-*L_3ud=$W6}#8f?0R$c;ZAqm?(E>PrTX>FhWvW#-^|;xgjKg2mdl5H zepTzQyqL(~Uk28PUDC_w8Qmu~oK60_?GCkSxTj{@5nOQ>u!5;2uSF9UYJSJujl904xYzs4-M%7^hIqqu3J*kK`44{ z{_x=y=G6PRxqOW8vldqK&cE(zaqzZ}@W&6{dB0FTtrlg;_lL{@9T6$JY5wyE9=#mo7GC z4*ljk-PWS?<;}5Uk6)M=ez`~1#P`Fu#YCM`C41<~PAl`R>CLvDnA>sZn030@3qoFf z?p)sWXsHKB>y16W`pc4vA2(Jt>pjUYCv5-wEj?e2={aodHf!5=Q&aZUcD!4r&ajmA zQGc7Yay_Vfc&X`{y!1`O9vqsscSWBwuI2|DWE=>x9{aGSw^iyXK4(g0zHZphZ-4O?og85`LF0Q9Rqy|? zFKTVJ@2$_BYfDlem|414>)5Dck>C7|mF~9Sk}9(Equ~#?xs*7!>GHp@P;>384}Yw@ z@xcFw(_d~@`!O;oZ$z0j3nOa;Zt0$7-EIGOKhEf1wf0!E-5K`Zs?VtIkHKL%d|~+K zUgw^^T2Ub~FV^vh=bEwmigy0X)2UjIIs+!u*4nqyc+Q%8KIblb1XBL*DeO z_wr&R&NV*YYPK%R{`JrZ%dq<$ubfYQ=-aex&3}rSe;FOvzohlI`a}Qi_VHWw^R51( zAfQWNc1|gtZ*9-NmHz*2`ukS@u6*r#{)-z#=Z3TkbNEsF%y~?&#i`4Ne7I4?@zas( zLz=}$^)B{@(Cu}n7c+<6{cgb&+mE{%bm`D!U}*6x*XAauwXs(tdfzz`sD0VJ+R+tX zZU!|QYSp#Ms_UKun|&EH^4+ry_KPgyUJu^!uy%{c^DBB}F0b0CR?)K~_V+2?F7=*k zM4PZ3ughNExn;^1bKgZ@x-MUR&}`C*#2=o$ANjJm)rq0HhEvkC?dwe)W1x^-O@bg*!HK--o_m5vM2sV+QF&|Dl91T;@ry3Lr#^k z`M55#gF}s9TU>!LKf4R@9>Z)JN87x9wn3X)eDA{NHM*tRuDb3Km=IMmenrKPTP-6V z4e9sC;c4-wf9kTj|Ai+z0=3~6#ue>#cS&!bht z*H!(vWJ!dsitPZ;^yikkz-Fo>&tj|2>mJYAxNXe*`se(L#t!+>=&JUZVys8JejspYf>J?7hRc-+5A{=dsYvt%_FZ zy=1&g%wNH^hJ9y0ZG7Iy5&^594J>Y(=~w-LLwSeJ?gJZJq`%msdRy)Jl2?zH1$VqY zFJQ6o>Ady6i#94@SChYs`rp+qaX#0#+b_|b_l@aN&bq>ao}=$pI(cEo_dB+9JLNL5 zvia^`HoY3_a4;n;VZo(wi-#Y+cEhnjK#I5PQg@q5y*KTvGk1x4cKEyHYlrRb5*coN z^yVG&&#tt+F+EqwM*k}G|y6W`J z^U+0YSGb>=T`g+-ug)V!JE&`2@UwC&);yraDs7j8KPA1_X-)-?{qkeT{OMCdp7%K( zP}{=8E8A)1#nE$rtriy7rIqi&D;v^n8^7&cYgUUZj-$V;7IVjQOoMdmt`kNrw>a}W zX!nB(urcPJic{+aAInOysCi(@t0M0l_w4)qz}(N>dRvw5-1pZWD{2fb((vuJ(<$D; zgU9^)cImC(-_{T;1Nc_l=(X=Y!l& z2REfH92$Loi^az!F*)VtmDpYJpPQLA%jH^C{p;|%__^7$X9f))I%8J*hTA*-K447i zsEPLV-+0eiGkRrsySw9)XMT7+yl2$?6}2zgcGebaKcS}6OwTQYIhk2phKqJ=&@*#Q`uQ1nrzx%yLIU{)onk#S$O?PZo`~? 
z)4D8dy|F{t7w?yZ_q*PtcEl}V+@Dux9_+WTpL4Z3FVar#JYmsc{jRn4rRJE``7nGy z&!*u67vE}~dZSl=rvq=B{~lbd!k&Bm(zYDBb*x0`W;xYbmp(MCfDj3UZPa+si&%- zH+duO{BGqnVE?_O8Ht_Ooci%v$-27^?{ciL{p{qy4z2#(^m*8_e!G@AUVUb-Tl=fu zkxR{{c5Rp0%yG($k%yXZD>b7>%@d7_EYcR=Fgto(gxQr$%T+%ttX^ec>4-VDC(q2v zd)}qd<~B8dn7uJBsQlHz#ap>N4xHuI>dEOBjEaodfP&02&9 zwyE~V+p8bN9@-Sl8#2<@DxTIu;*V@xs#|8H2mA=9=%Ci5qM)6}~i@rNPG)ZUOBwuJ?=+#*KsInbzuF-6HY^a%9u5C zSA5v}&_m4|IDMKswO74>$+e;_Yo9DWIqE>K>7`5MWWRp?_w`>(ly?+7~S;)vc*S*)-*ZW;;UG_WLOxbs>x7Wn9v~~Tj z9BXoSc$3Ce{(3$8*8Gwis`PL7-LpoMMvgCb?rGgE#Wzft{L_(7tu8C-eT3! z0r#q9L=E}$>-Nj--JdP*_29XxW5&$Ry@m_5M#udWbF*rfkiWkxIl;=e`n$LqmD&eX za$C8t-o2lHYZ`cG&hTz;s)g)(Rx9VrV)L|4K{b+YT#d*rYxcYu`KhZn;WT10AN1cvR-R_smjnYX?u;VE5AIk7^BH-=2MA z@V>uK{j#u*X4SseDb8W7Je;m&jCprzj;-rbO;SrpLl+x z%EJz)yGP}<91(VNRo$k=yzAKi{qL;>y>DjC-x9wnYQg2aFFD`WJC--B#II{C?E{{_ z9zO1~Tk@JY?0jlsav+EMpmV_!~6YVU1Zb6q}{FJ z?x6ATY`y>XKx)#Tlk8XQnppDo(=T0~U0?L8#rjn@7vA{kzKzw8HwU^M zx>szAL)k$aw-vKZ|7+{k$UR+eci6JHeYdU`ejM)mIY>LRrFml5lz-l}ooaKe=9ST7 zLO0z{ObBjqp;642C7KbASO59FLhh)c%{QEVcK6IJ`?38mcWhIBe1!vTZ+5U4IW#>Y zsMfXki_n66n0eqw|KGwJrCf@7m}}!u%PG#cb-n#=*VdVJ*xYA`Pwd2j8=Wg^E1f?% zWP8hlb>qrE3)yz6h{OC29a8J+YJ08l@@Q7)TI%%E&3|cbe>ZRWtOdU>PmnjSLQ=<> zYnHW-92@arr{gE6Y=(}`@miSJy6rDJ_V~`5HNthr+uE%^+!#^jzKRLg##o<{^U%Ad}vV2a5zpr`Stz7a%Qs-)W*XJ9* zy`EPEct)pI5N5cg3vu>`YIJ{6x@zPwpVHS`KCH5Saix-ui^?_c5|p#Uc7MGgQ>{<{ z>@lcTnh9nrTdhB6bv7<*)EMn?$3=%1E=+k`N7fm^E$@1grcI(z{ zIG>nV`*LQvkP;YPdUfvt!)zA)z@#gCn=4Wkwy+73U;*Pn2 z(f;?+Gg21MUYnX5S$A%k?Gu{~U%2kh$yxWNFUU83^sW5`x%~h4`TyS<5LbUGY2mkZ z%z9yAbMWRbdB1&E_3fs&%g(t)9w;&K)r7Xg!v9|1w&jl1<=QU1Ki2zEakH{=K3f*Q zhpoJiDzCKl8gkR)cudT%+nf5#DHpRlVb1E{CI=Gh+t%@oT6MnZ*$&rreXsQ{@vZdo z_`df1t>Z}!wsTa=r@wqtcHXwh)jq!2ntZQTv+c9j{PJ*Pld_9{nzLX~!!YNtKO$rQ z-W$|%*35AhmT#*pNz7Qg>QKmEdlw&Bw$3tX(rxE4n}3Mh>O1k&j{mH#M`F!zj zxxGlnooLt+01f+?Wx@jtEFU5i1n}4@LAp5hqvsOMszR_ zUw!hIi<%A>?f%>K!+%>pZ2|*A-rFVw_FUAkZ^aLzdh`slT^Bv^{sV15v6zS3pY2)x z{j=wZYtLKVyP7;^+UFFZ&!+I3vp!{PxE|_M+qU{T=ViHfe;Bxa!=%P5nx1IiJ7D;e zlTB6zbuazkWx0RCemPjX$bVZuukyhBh3e~Uh5Da71n7|_+YTD>aJv8BRd?UtKj!@A zdcP$d_#xrMv@!prH0{{xlFhf$Xa7?9%aEv_KE~G7_1fBO?ZU%He)`8T#DD6LdzF?S zs@`mJ>g7&7?{1GjImqir-%y|Nu+h=@>bUOBLc4Vl^$wMq@Lk=&){}0#@B3rsz1}-V zwST#NX8Bvy8#sR&bTy_?jfoYk?j7pVnJoa3a`@KyRB2eeFI6=7^Nh(`d%wvW`cvem z(n*=W{NQwKZikZo53*kUHE`o|pV!waor*dAB=KXBCBDPVN;FP&epM^FQ(m0v2d}$} z(q?tZTDvqo;qSCsnSLvJy*gQF=$&Icj^{nA6cGsAl)|9W#0e8=6loZ#QtcbntelSH3(i9TKc!?xvyP~Nfy3#ZYh?ky>>Nx?Ofm$khHL~7K9o=o<(8rq9f;P=P*g-Yv-Jz z_P5JXt1p|`IRoU2v>JH*iD>`9UtlsFq?@;}t1%VhnPll}=apjRXXm3DYis8P z2uJ`F`e=QN2q?1#EJeDJs1WThTt zV=5zfo@NoR5ryQ{SAiF%y}TO!!NqhcPkWdl?JWO^c02|ES_PYpH`c$vY-hrM7#8zA zgTlsQyvq6B!#G}^@66t)hv$pGAa{Yf&n~9^-dtJtfy8Sr+TvGvnW=}HndXK1TtK{7 zouRDH-p1(?gsKqs0)_n@?j?1ApPh^9SFr{_-F+gu|D?=d_$IV^=s(dtmX}c!J_PO+ z%S(s+zU<0c^FFylV{IW>hTJBhP(oa*Ly63Uz2N@rTV(c zv|Om01;nch+$-wF&saBZK$=D2u2MK@UFvU_p;Dc+bAj3wngLd&3eG@h&PFK&p$>+D zHz}%t*hH|l(S_bIM+)l2Ab5lFc@Z9oVIZw0>bf{pS7Z6mHsdH2z^iDYV_)k4Iiz`$D9IC6{)zTKsZTHbhhb4h2W&Us`dXcjv`vUM1oKFm?qoA{ zH;J!-v;&BC3EX2INYk#GXGpvgyq-g}Xq(s;2ZPO_PSz8?4RKTX=x_JjOts87BY_md z=1^Gtn5)UKes(j=REzV4^#Ss#wirkdw%|XA*L2%P4A4I^cFS1&1Bl0O#KS~+f%q5Z zi(douD=BR4!juKjeF=rafV`f63$IPeynKLHlOpgBF1AXtV7)xGWsaj%F}_6tA#5`7 za4Jk)o~%;eQ7L($2^wMUjahgyTPheS;5XrqbAo zCEK?M>QzIuhmp1r@&2X|FY=m0yx72T!MyzKW~fwm6?oMEUfINJD9R;GLxxJ#Q<4Ux zEy{LV@%3Kwm`^{1UWHH(G%f zSs1xysMKu>kZTn2+Gq}pzh>vlimPW8;Cmk@UQf|>zs4(Dr8=ijW-yIT4#-3h`V_`q zp+DxU;9p|96GFVidLduE4;E^!+GMicUqoT!3Z-icb>{+fokP4T6owbrlbHf9A4pSu zn$?*K_o!>r?V-P2mP)0|SM5U(ANWfm9vw*@B_5m9YHu?WOfaAJ5|2-X$tSR%Y=wM6 
zUe+|z7F8HtGtAZP6nJ5tcP3st;9glj&=-?A3N?Uy28G2l`*?lbvY2JN>FR0b=@wHh zQ>@hPDj1>SZQWj?jV+YSR_fpaWVUuhVd|Eck)mt+x)JqG9_tZDU`<`gq6ehh;AGj*Si)rwqQTChIFaKH^KzD~MOjo|hYXTBe6mv|j346ke_^=}HiFl}z7;U8Qn zUrAe5HT579Nn?Jsj(9XfMzZoT)KEOwSk^M49e}h3{}soww4w*@2Ct42T`3FA zo2yzHk9(kAK2eyag|UxpTXiJ``#@eAXN*`7VhY1+nYp@?0a*y0-H7qV5pC~6#@n)hc=vuoyg5W0ggPzQ#=&tlQw>pQSJ1z!UmpI! z1)EwKFB3j}hM6iP-((ktjia!CQM+p*>@t-qEMHjE-FgbU`|G-cDo>@lsE`J1`ri{T zU$_^vb`GI1#)fe?R6sxwJPXs7FSArFRfq$5^(J10>c3=I7TAATL%f^|BUhGM^-}?I zy(V6R3)3EEsMQ@6+Cv=M@c9n4B?w)wuqzO~?wukA=}`vRbv@E)^ztXTaYAm~dXE4zmNW1%WLj zAPiEOo^|H+SO&$5yY9YPt~0FF?+O^(DCz<{;JWp7oj|>9qdrmqukJ34l`y9;`hc;Z zBn2D8aaYwzVqI4jwI)^rsWa z0)PzSVd<{|H*h7RKE?D)`$>biFMzb8i1rQ!5NRd4QvJ@|9IVII5N$Q2g*hQ?uMp25 zoDFdT9soGV#0>fFD zP7VBd(L3+2+aUTA>Iw?w8{?lqyqG;gC5VYYd=cS6a4)D@GaQ@kZG`tmoXMqXWoyU% zK}`DvdcT$4>(>CmyhX4q?ZWp+B=BKy(jUg{tbn^I;2Of&kPPr?Px%v~K%b<5&rrbE zDBu}{V}6Nqg(&}J6ZEK?=Y-EedYN6Mij^9?LfZ1o*aTs=3EIgLTC4*#L~Bp>kgpDe z8SbQfUL9|vCs0^dj3=DiumjvKBHROUxDeYUr6uezVb(|l7t&oOtU5zD_E<)TdJ>30 zT7K#P>`b^H(irpvsc3K4-PbN8T9SUG*ATtGoIcsCy=37#0&T?PCT;aZFf_^4np8QPH(Wz3?~uJ;^t)-rWAP@#xlNu-iKj0{ytEA zkMZrK_i{P16!07cTwKT~FSpJ$G2;C7f|~-aQNTkK@Fc?J<#vV%dK7&P;qy@_gFUkD zHeZIvmNl|+dx2=@nV|hqG(vcX!$s+#h)w^yFt}q8uWLC%-Dcz~iVmhPgom6)${R;RZ6FA!JHo~ha*(|8O z#OVBu4a)qS!dk*TUWl33Ww-jZ&4@V@hVtQsE=3)}*pHu`GYrDQGT|0DJLlEH&ZiGn zl1L|>vqw5WI#{U&3xHcNI$W~L=tdDg=B{vlQo0Tz-AbZkZmg8f&&~@Lf1%6A_?;@G ze6NY_Yx0$61)KARyp*RyFP|UL6(k?af1;D-KW4`S!iQ3RO8aCu6hy1_*mxiE*=B-I z4Zu$kz6{pmyR zuHrjzUGw=pP(z!k(()N2+VLm~o2?ucrjV!S3yV6~NnzPg4wf^J2=Oe!+3+UdT>wWP zCWr7<7?(7?%gV&tttkT~2L5)If! z#}1i+oM^;|&mdg7-V*UOCg@Rz8HC5dy-EGEWocS@*PF%l$plY~%ZVB(xomY5a6bjS zw*sy+ful`MCHw=Xh2@d)+_x%+b)A|0?j+jJO0>D0Rs-+v5^YO0JbZ2J7uJ|nFJP|I zjT(hch2aHjnHnnaasW|A5ifg816(Zs36ts`Y1a@=Q<`ku?q0%5U|?Q9vAPiHFVj0I z@5hA6d7FD813{4Tb|%ae@A~wvI1k3^UJYRl5My;jIPUUrVY@}#0+<`1gI94xH;n4K z0>*CGYR=#6-h&g83_C(np=Ufp6^2VZTM!82g?g}naYEEbjtQAR6+&iK|JBDt@wO86 z49Q}8#{BZ3ck=SuT>+0%z*80QWeWIC!rAaA)F0TWI)OY{3V06Ttmy;#0Yopb3=;%- zU&@(qDSsE>??(7ixaawd8+(S^vJ9sEJe`R~R!8CvJyl5kM376bPsuC`@ktT%8`FAF~4Kjm#Q?;_Q#y)K{ z;jIv7bp&W^4DTVJTvHX|S*Z}uUcy6>A1c2$kLLp6!HA3Tz*wdObP-wlrSCBu3|65J zS`-^pxWe%tYky#;M2Z^sCiD{BCk$Vo=;ZkssDOtn;G<07^@0B!!hPUgp-p#-m9^+7 z*C`4o&yTw%aE#{@;cVQ~l<$T`TVos%)}{5?z=(1GNJCf?c#jv-^(L$@LpT;k*boUe zfv|ZJYyn~8CD?kx(3bGRI5K1y>fr)ma(jJDxKuxZKyTg{{=xN|3jTe~KgcjsM-?y^ zhWt7czk`M02V=v@1@OZ(t|We}-G*mp_}fVsE6U8q`PeklU!Zq3aE}*aIfUUoUZ|%} zQoc}5S)<0@=KqJ)ow@KLFaH|>KA(wijW1hxR7r3)js;jL!o_DAsu3_xkbWxs;?pg^;;UkLEwZJ>nCGg%U0?np~pqK?aEVjn^}x&o08BL->#Ig>Z2{jEEP5^r1YgfdX2+ z{s~X0${C1yt48>3!bLu=gtz#f)3d#kaNCIR#2OrD`yk;~L%1W2C$c?_fOjOk2y+r( z-2)FM;7JQ8X9(d}3D**RV^hd@_yEF7 zV}lQuXg?DHM>)kSgKlOJ{dl4ep>%H~{B{FQxP<6;5S~mvB^3kV2;mC|-$wK|0mpb< zSR-txH|`RBAo*q+iT(@Gi|YhP-GUnG1Jv6GqTfRFHiWMy-{v>Mixb|Bd;qqW4&J#C zJ{}tsxJ3LW+#~-DKQNf@af91>K#%f^>n=rk8WNsN3J~#T#D5<-Mr@xO@aaSN+2)-8 z3;rV{5`NH!1L1^ECA^w12M!bdGvSX22jSSYRm5BH=b=P@jPQ__91zRVKZGBX;jamI z_2cwx4;S1zT5$ce_2)R-iv_pU2)|7YAlp|3cq77pXPp8;V0)whZ$tQ)K+cElcY@nK zg!{GUINQ4fx6yzz`=NR}guiBc$yvToeInvBi9SSzuP6S4f;a=Vp9yZa6Mn8E$JyQ_ zxIIpIa#xN&mw*zi35q@bH$Jss&xV=mG{E-|N?fetr#YSvL7t1@G-pc5ilMb&$Jc#g&(#G`NL|n!vROBzCPXHYA zH#LRxVS6s%b~4esfPLW=!u~^_TExk}67hM2XV`K=5&wno5E;IP@MIaEErbg)`U8Y7 zqfUs({}kXTe`Feum+kX_+dG8!Cj%c1^$uPy3BNOgKeILQaBBe$1M&%-!*RCP0&blN z-!_-yZ2tt@RwumGe2%mA>2T{!c+sCY&eod4ZCk>pFXVVL{v&7!|BLVegbyRU!Ot8B zCcKIhiDP5C+p4DUUi&rw$+lvSffpp`Q!T!S+QwYz7`T#GH{s`e2#Mg!RTmc;Q zb7U!}XZsT1HivM}l^hrK`H}F=t2xg06Tqz<j}T9I9N}!;H@urj_=*dhP;94uA>8p2$Ju&pxZOs0?Q0wt``KB9FSyQewoV#u z9})iKAC9y2&Twl^?PK}792ey*Nq8T^*}7#QsYm!uGEBDq81MkXTjX#)Y<)1?b|kz@ 
zF2~tgU%2f{ctdK~*}7f8`xAch8K-A!aN%|Y;jhVYBeznRKzQAkoIZ~K2(75SUQ9S! zD+})y5dE~*oRF=9h1>OnJH6#NTk8t9zV_TsF1_P8Tek|gJAfW)MGi{-?;$?zK5~$) zONHALfHQxo630dRyiD|iK5?9_O@-T6gzth-c(Jvl><-6&Ea}ehV+a10gW*ws=c0f& zRlwUT;ONhp;y)Dd;vi3UIUc2$uJH==(-rVV3iw(De6Iq2QUSlBfZtQVUnt-z@Pke1 z!BGLPsDRf|z#A*z{t9>(1-zdEu2aA#Dd2Mf$NWt$!}He%`qg+XSD@dhfd4^!T!@bg z<01%$6zDH0;CG3SAmj5=f!@;2bbdK0;MElHMhbWv1-yp>K0pBFA0)9sUf31KQgZdNY@uB<@>xZ&{n<_`u6>vNUqXg82 zlP$R5$!eaCCWKqj!1NtTZ)d_E5nhe(P{JK-IUf+4UBd|fj_?G+(+HnQcn`uCi~I?1 zNceWbU5ar2Y`r?XJ4$$uq8tw-`kREWB0Pxj7lgkdyf@)CpfuD&$6}n%R>I2?Zf<9s zuDXPmkm0_ByU6f%gu4>%Nc_VH_m$BPCA@SCiJf$NWRN~=r1~OeYz4wb;8xfIqpGt zbHW=CF4DIrJdAK~``Hyj_#nco6F!viSu*SV zF5zN1dP%sCjNSqWkbip_UYYRTGTfc;fik=W;Yoyx<)|y+Gi3BpgfAhSy6eIi!W)$0 zdYDK0nML@Ygp1{9DdAU3b9&MKcMxvv%yAFmbDHqRgu4@dgYZFwi{<4R;R%G-B6ar{`U#LLb#~s4}?D;9OrTIDhA`5m@j=Q8tcCr;e!a@#mR*x zgjaRp^pu@KH^Q4$;yAb~?20D59pP6AA4hog%A7u%@Ogw=RpB^IKMU&#uS$4XqCZG@ zGs0I9ex2|;GW;#!sbn~!e$-Ggq8{cE4%yAFl7O4?2Wk=hZ!$h!gzq9;Y&Y8yUh{jN zE-~I-gg27mIG>I3w36ZD2oILw^9c`?;j0OcmEn5{A5HiNO7986hgIWpvh}_2?h@g; z>Ktclc>#Y&_#d@6&er7u{)uoKSB{JJP#OGwOqVm^q8@w+uSU42|8T;8ARL-Yb|n$s zPR4%;;k{-2_Ypo&#y^|zu`>Rz37<(gxHs%72Zo38FDJYK;hu#5O*mU$3-8(z-rkMt zLF5xpxR!8{&yR%9AzZY*<%F*!T-47-z(pDoh5hU%`du>mqYCuri2k&U{*D6uYofm+ zqgTVYk12Ud0FHWlFQc!dK<`HM#oW2xXnICyLUo^miH&{ZOK3Yj@F}3IC@)my@l-1-uvGE)6*@${9&`0~ww`cpDi$ zfpD!1pG&w-hOZ<%jd0OUHW9u?hVLc(7~!HloFM!);bOjDAiQiNo-UF9EyBGCZ$tU@ zhVT@^TM=#p4H@eHIN>7y3WPtE@pmV@m=}*Xl=!qJJcjUK!g~>(DZ}Fkck3BHV}KJx_Si#>V>0CcL%`e?oX0!g~;(kA$xv zT=Z8RVLS`cm;y>_s`x4%g@L0l!5Z;UMY{KUet|NSC6V69m|C3Jmd&0pC z*!3&nPE9$zh#w%_jd0N(&J*61aM9i#5S~alTPq7Om;nK%*TskPXKP*oFGu)oDxhLJ z+ko)rgp2h|Tf(cgL^oxzv z4bVo!gvEwO2yszyaS<_*5%Jn#Q8D4M!?X!eBO-*DsDAyTBLpN*(Dsjy9jqM?krbK` z78NDL_tWa)BO)LMh*KxVmlziws*4bx`bWh;teSSdEw%hMs6#;e&apAA+_hmb5s|{+ z&^T@XIIS*fa72Q(=LjJoDsphB&@Vz4D#XPm=s?Vv2yMUN+Mpy4p?|ba8y^uC6&D{H z78(t~1oos&7!t2TCZ2<1!|Mx%2qHsu10Xd#k;D2$=@Nvf7?2`LH{8`-k*IQ`VP(jM7sgMtE!@=0SMW zP*lZ;5yFTOuAc5fTtbupInpm8CMq#Oo8;>5C3%D-HX?aF4C>|T_7FnD!$D&WAzMMc z@Su;(LramdH{t!Fg}8V|p^eevlPH5Q!qwed7@UaN-~o9f&6ju~KFPyXniHY@LG2OR zma&6@TS9`iMJtJbJ#7BijgV7>? z(F{Dqx6;TBFPRY<9*s@LUE%?BrjJ_PoYz%r24L&63z)| z*c5p@)CIiwh;q5}nxIW+SS)^?O3&0?Ym_>vQN$UTdnkkkk;FLl96gLViUN2T*^(YD zSZWWgUQcMaP?VB}yP^8Qe)TN0Q3=|JA&F3FiIBSiVl@qzSY!;Ey}%v^N5w$tb9eQi zXJgP^N@V=XjlV)h265a*phRE-3A#|I)u6~}W2l&+n7qI>WaXC?NO;TYNi5L3gmMyz z5rLJ!OP-gKL_tXmgYuXNHfJJ(6SR`|#+eT#DnCCDDSdu^9tH4o*M^&9Xc`4leIkq^ z9cC!4EX8^%cd2MVMYT4s#A6}vp#JXZ>ggu5h>M422p@T#`nq~}3q6zI$4l_lzmsHQ z&z?P9J$Nk(mMdZC0Sw(6>KQ`5F(dJHb#K5Jy4Tk;q^eVkyxcuOxX7q5{vb8VF3?0% z-Pu#0FsQ4Iw2`2NyoYw)pa>6+L`Ek_0zngND6vE_Oj@Dk=YsjetxGHvVw=JfHbTNa zDn{$?Vbr4W2GL#KN@6qT?q)>E8$@pzhJ;@F7`V9@9`mQ#T}<^*0~I(kl2j9gB-J8f z&>b%}#;JxnlsPK$R6`X3sYbhrjs}k-IyN>A3TBXRK&OEA{#aEF1|LkXx&&=#SXd%x z8U3UY5%ICElB^b!!`cxsh(aQ+5S!wtIEWFk~ zutN*qKy8QCt%F;2)^_%75!gyA5jIFbmr4t+ILHnjGbm+nY>bXQpsI`>Vz8u&kf$&- z0@N819TBF}dN-6lx!14HnmmzfLSjFo7@#TllmlLR0Q10;b$AklA&Ifri5RSPhe}-% zdHp151WDL5i*ms$Nqk8xlr%y;qK`32h)x&~)nBKVK$4e91`!%^^>})DBuNw)FF^7| zqr(J4Ck|Y?@Gve9J{j1>gJtMbkq{S(hKdxdqh?TbWQ?0OP6r(peNGq4(de~?zON6JFrz8<*8HPka#VL6J)`r|oLP~6XDCgO;Sf)f! z%n@klqoFyE5^anr_@W50&Q1pOB)ZoF22%{tKOT^?}y@>1mQxPCSeT}UbB|k!>qMx9r%m*au zeF;5HxS>tZW9VAw#ef`$jTgIyQdFKWDWO;$i}cj96d(1DHUkm{VbvEm2@uK=))2x!G(sfq*w zLqI7Vp)zvvIK^b~@fJ+3h=?`6_>2M=)F6pX0($TP6s}F`QpwCw+MzO{Gs-o64_Rh? 
zlJ1fin35+qJ`bq1$)zBT>3Pbm4}>N^R!^AE9Ko&4XO67Tp$~!;g4`U9dL?>!rG&=j zs2}JMX^l-lv@@0uqU}k{QM6F9S|ig#R};+E$VB6~qpeSZNpsX|%~%!)Yh;cvQNcFXpWRDi8&IYH%EF#0l@MMPonV~KWV|( zSAy$@L?jqBixR{z{spO$;Kmwc(E=136Ct6|KS|tndW}w+7Fq)SDCr90uvh{|C85|b zCBZx{&`f-6q8L6jDn2SEk`1WFij@js%>CmwNC=MAh9`(M7lEOAx6imfjHCDH1jJ~+ zMin+-5K+mhRgn!RFfdaf%NGw72a%0|0e4l6t96kozcey7O47*5c>l^siGw5JBPp-N z=-Bu(`10s^8>}=RDw4;C4m0SJ>_E&67zxKQ8pg$-3rUr249Y3PrM0`Bl%qr&77N3! zTv#y=DAgw3P#72j2e*|B2OhxS6JR7OK7u)8;)CcNi8^4#Rv(B}H57`J1j4c&kQhK4 zkr);&2eF1QqVh0B7B8cUH$fDKon5`}0Ru;d2Y4g*ui(8L0xGaS7ze>Fv~=tT=QH%h zPl<|h+W2)s7&QK21B?r>5eWqYaS9Q)5|v3NCgV2ZAQM^Nh1Hr23S!GT(!Ah^$Zmu*AU96r7`u&oAn3Rfj2@+rq zD>R;y^LyT<=r6~#%yA{cL9!=QAj}P*YfFQ|?A|ze;LKp3N~DwA=c7XM0ovJPLQKAqvnA~cNKHveJ4&Xfq$MSKWo;>iP}Z6>1XK?C+%=hN@*)DW(l9jGZ@5nM z%{VvCrJv#({YU{D{Y9e^UXsq=hANIcqB5X@tw; zoiCyP6Vfc|d8Y)I>fx*em(o9x;8J?rzmJz(&iV>?Hw8RV0bihi?^VEWD&Q(x({g^V zfH#!jQa$vL;3t8lsL#;~_-qAyqXa)C;d4R(zpH@fDd1&cKQ&&MFH-)V3V5Ib9-)Ab zQNVvvz_%#ie=6X274SR-ye#Zr#!H^w1`0Ut`!+=%rGQURz!xjvI~DNr3iwk6+^U#q zIV&sRO%?F23iuEOe5wMzS^+<(fZtHS-z(sbw9rnTk98Dqe+9gs0zOg!pQC{Pu7Dp` zz;VB@DZBcjfR~|-1M>8GDBx`s@Nfluv;sa)0pF~EpH#r_DBzzI@Y1xvGGBTX@U{wg ze+7K30=_^2->iTiA-oo3=5hEF>-ozH^zRjLXW9@UPp_8(-bVo+uYj*rz#V8KiCq5g z2uFGTl%%V<1V1UkyGw8>y-tEl>E|lodlc}06!1?9ctzM3i5JQ*jkl!&-d_PvQ^40N z;QJ)FR1et_Tq=(_ZMc)mSylmWAi+;T+C+N`Qosi(;FA>aUlj1&3OMes#7l0sj}>qW zT3|1ycTvE-74S|9_#g#*k^=sV0=`!Pzo>veSHO$FenV6Gsj7fCSHOEJ;E4+OGzEOE z0)AKlzoCG?SHK-%U!ke=x+>rS3V65zK2`x=sDN)(z)vgS4;63~?Bg?)u5t=^Lj}Bp z0zOaypQwN@Q^0pA;O7+ZTm@YG7Q4KhR)GD0ru5*YfOk~D2P)tx3OMd(G!^fz|A)PI z0gtM>`u{ToF+n7Wpaw%VkyL{y5=D(lHNiyA=o!Eppi(Y^h>AcF5tRxGD)WvgT5Y}6 z^0wM)rNy?kYKt0>OSp&oRX{E}+;1Xx`G42m`((beGaIn=_df6c_x$F0CUfR9>#V)@ z+PAg$+2_Q;7dZGf2d{_w_)_UV$H7Y++&BCJDC2mbyF-4sga5|CKXdR64xVZ5Ke5ZP z8F3z`o-{mIKbffAlJ*SA&jAa@og>8O5+82vZ;^DF&okWi&t{V6@$nGxd8EG^?t7A# zt*5`?k}l`#83zykpSQ?!x|1C8Ul3>cZ;5ldXW@P&dD;4}CeHG=5@-Dn5NG))ahCrL zahCstILpiZR)L)?zlAu<*D>pAK|N_>#=mnd?wkJf62omj1plvFIG67LOHWr5KfO$P zSpHMuZ09^%kBPrGS^Pqy{~L#%)AN;kR5^0GZHaTbw-_GyXPBACK1A}Y=WXJw=VJ$7 z;^4a+{G@_3^|Jkc7I7~3mc-efwhlhZ@C%R%_j}KiJnR3ngI78Da^mc_&BR&%A>!4r zDd?|GJvZS8=4TUU{TC9SPkMeroYQSjob`8i@FB!G-Dioj|6eD*fb97TaZYzCan`ex zILn{eEa9Jpr2of;*8!W1cRv$1v?BRMB!3NY*5ear`4NWO^=B-}&mjG886McP>JflcqZkNl8B)^pG`G`2@cY}i;bnw&9OW3)L^j}7t)4hc_ z=eM_mKTVwN98a9n{e(E{|AIK{*+ZP|%s4-FzVaNrrGp3m=UU2<)9vn%e~38CKSP}J z`>KQgojChp8gb6=a^kFitAp1qO8AHEJlnx9CeHdxiL;$=8E)6-2_(<<&vo$N|5%Iv zS^faYbGr4LC-TMedBoZNiyZtK;;i2%z8pyf{oYTBbGrSBbGpMF{0-uq?q7*>c}*g| zg6vsJoYUPxob?Idq4n1EIXFu#F&VD|rg|<`l zv;Ai}coA{dUqPJ9_b%d`?nA^`&luut&zlbZ331jlmpG^UC2`jOjf2;_FySArC;7x# z&y~cvp4>>B<+~DRI|mSF`6nIxHR3B_a?rm1>foOfXZ>FgXZ!a$_}GiIf24Y?qIBE0 z)LiNn$5&kp58|RW4YXVjlK+DAJYcx!uOdE(&&vVEx zb;zI63Mb=b+tb8wv7hzh4qNpK%6FI?;N_D%*Q+}$`BkR8hLQYCoFDk-R}LPx^t_R- zl1&T#ce8E(sm1ztv7hY;{&%y@Z*<5HG2GVw7|C<_PIkyocgTk>(RSMU>lrR~eua17 zA8Zp>FI)a{l4pA!amYX6ke}v|pGERtlRaNM|y<59r7@<#&_eL49ay*3p9h z@hkaSLh|*lz!$uPv;O{u+x842`B^w8sBdG5&m{hu;ekEF%zDtW)3-J6NCo+S=FUz0rRIZB-MOubT>Q_G%(hTHb+B+h!$uhM!%z8deqpSccx zDRH*5f;j8BgE(I=xW{nuC&x1bNPY|1vys3qQ%kX=U-|HN_ z8*$b@kT~mkia7UsFB)#=_cfAd|4cPJsJHTbQhCiLJ?w|2q-QJSf_&{JzK!@%OaCyV zzkVA@rS^4-d#L%y#=ez-&a8HfDuNPc@-BHalj&+Ty~ z$#eU?sSGFMCFRcL)zxs@4-Y%|XyV*Ik0;Llnd0C}iSu}RLtABLEjw>BTuYN9ehZ6YCT&F7r(I| z@W?jxvg^Yk2XA}5mbdkEBhKaWbK;!tuN-{5gHLhr_u8el|6{|&PPS(bakl3x;w)d+ zOXQcw&8Eb;{$EUdB~A_MXIlrqg*c~s4{^40C~=O{o+ZwD?hmK-&oINqKRe0J7l`j7 z{+5GJcJMEVvz_}KJog4|r)^IQ2QPE*I~;rzan9GP4&L!bZO;Yxs3JeyWw>3hMmqSL z#JL>*?%)d?e5Zrg^Amn!{do@FB$7H`Eew}@&7%DJ4t@{unI!+XgTLK z47clBH; zUzOSqT}`?d8E*Tb%y2v1n@OJQ`Ku23w@9Ar^QR6zk2vRR3-M~o?_uIBUviVqujFeB 
z$zN-@ov#SV?ViN}TJ-AmUsfo^tS49DIUvT%(pT-VeWsgO7LcDhJ;{obBK1;PviEo!>m-oZrhFyc2QG@4XH_%)y^`@ZS>W zeEr?Qzct*hw{`DKNft=;`2N zh_n5Fb?`L~evmlZc}kyz{cKM&;%xur#92=}2k+wG_d56};+)@?iF3NYckq7^=X@Su z*wdT%eBzG~XFc-_x6}QC~cu=Q%0Y122I?Hgehy7DRob9>Y!5?t&rycw);+*c^ ziL-yEIru8#Z08Q*tmovx+7FUn_Ge?m#XsC$nvy*G`BLKS=RcAj_VXts&+YvS;@sZ1 z5MK_Pg7Qs!C}BVAx!rKPeD5K7)<2T?V$%OB2mdp1?ti8bXFV&3vz=Rrv!1#`68U01 zHxlQ5>0aVoj!zQj{_2;8+x~fr${iL*UlIru>bZ}f2Ld=(StblVbV|J>r>_Y!A2 zhY@EzFA-;d{oA12OzJN2=I{hY6S;;jEt2M-fxJMSaT_5TS6 zA2v+eW4DXv4Y$YN_lVELsX=?0>fm*Lo?8FuhTHnDCeHeAbMPU=c|3iFILA5f5@-F( z9efvY*5CN?L^-nl3y5?4;1lO`?{V;F9Q+mHtp7dYZ0AG=pX1=G9eg)&w*REz2|sYU zxek62arWCa#5rF#5oi6q9sE(^Z0GC5IWGFEgAW*yy52r&xLt2wG`tQ@n~ZmxT>Qc7 zwGry0R^0wK;+)?p#MwV{4Zk2QISyRzkpIf?z@A~};6%K3jZh!8>^bR))ckD213ib& z)N;)o^1UtjiAKI3$!~#*px+p6xa5oD(qEE1`{x7VT)tBb5A2+1@;lohzmYiW|CTuW zzy6b{{d11tLArsT&v(d|5NG`zh_gL+6X*Os;^414_@4}~gKSL3dzdMgj~w#z9sFwt zKSG?#x8cZy-Ur(H`gLEa%{u$)p9~*Ahw<#pQlbJjdZn4Y&Qgk>uGwp{I2Eb@9RW=NKO3cbLg<6NmgI#99A!#M%Fy ziF3O57#^hC)TBGWAwPmR>wl9t+w)iAobD_K-$I=Ghs>uFeq(+Paqee28Xn~9aHc;0 zrzFqzJY>ngVdS48`I%4=)Z5oA`PoK(yhFZ{IQI|B9ef*cu5anjB>cekoI#x9hx3WE ze|iyTd+sybuFpeBemD8y_YOXZINQ0(!LvuF&Q}WuzlJ#Xlii4O`9AI7zjN?Uh_nCa zI(VG;*W{lA4xSTD$l7CN#fiO#|)Q#iR;M%lCPqCEho-; z);aWC97}D_mBhax{p}1Fdu9^v?cmQ4=lCp2o$+Mj^h_fG-5?_wAgMMr)an@5hMy#)G|BZ%= z{j7fgarW~k4n0#zp7m^Z$bajQPkT<8Slto7vlT7QcDEMG>P?d)iHVCSln^!bA0IKX>REOPtf4L7dZFX6c_O z2IF;vz>N;}ZGe_V{Ds z+^#PrJ|CwB^}!>~dR{f$PWLU6XZ=%&&rD0`Z}i{zf|so)&u}Rh)>H0~_Z{+`9P+(M zp6wavkbjKi*$;0x)+^-zxXiT37rr@oI3;3tX}{C<+vVHPa4BE5vpI1t_qN1YzMF#&a`2dgze}9c{U>qu!z>5?k~rJB zn>g#K_m=jL_<{ZTBL{EwPHKN%VR#+- zyV_1W-H#2w06$lWo#y}3O(A*q|6GUsDw5}PzjDZbpX6u4grJ}4_b+Xao$fHhr9N~2JklZm z3zFw{^aqFhXAb#k4*6vy&-Ro|OzqE(4&L2xyPn+RkblJRpq>mf_2hAf{6f;t^`yZk z2|L;US33Aa8MN@S%jI+8tMGeJjb$BG$Z~E>1k=W=wUnC6X$w(7jc&F zXLya@3bY*$lRVdxj|{i-TS@X8$*8+xZ3qcD{Z=@@&tqiL?LTBR%Z@DhH1{c!r#emt9^N z(x~vV^VOI*=c|d~qDPVn$|c_+KbAP>_f68n`I_XApYM=g=8%t*Jlna$A%BGAxt~9E zws@zufAS0$Kd}5?9rBeVFG&S{TTh(*5R%{nFT1?XBF=hR5a<5w8scnc2M6!r;Dd>? 
zoxgDKw}^APmBd-kQU{M4egS;J_2&@DbAEp$lQ+D?Z|t9IiF3MlIQS6atY;K)j(=Vw z&hqaOXFDenXaB4u&US7k&hm9cDPDFxX+)gm&m+$5;vNyg%l7ln4Hx|!Xa3p2zi{w$ zaTZ>-o->GZ9Nyf)CmC+rIgjLdUbB-p+n=)_wLKRSXFWe5&hwFu#M%Cy4nD}iM-yi| z|KQ*s5$AN55@$W%I(Yqssr``e;H?d}{ojt{*+2IXXMc_*&UXIA!DkX@J3>q1;fbUbS)X+laHCUc{wq56X8iahCrt;%w*Z#Mz%85@&x_5oh^T#94kDahA_q znc5HMI(TadzVWjCf34wmJMH7(udUX4M1C*cf!|(Oqq(jBcf>jVf6s8y!|}r;2d{GI zxq5Btbi;;=e(AabI|mqU+d0f3Khh!p3zBF1f9sH+;E?}^Lw*X$OP3JXzsMoK&LKbS zOPq|Co!{pT7yn3?8R-8jarRragKr|vdJYrk@$uAkVvX*{IA8e=UhLo%#M#d74nB}L zr~52%*7JJ@|Il#TpEF6G^ShNe`zL38!VjG8#SVTWan^G?ap^LHavVUM-+{FXAje z-@!LI_?_F88*0T5eGIqz)nN|)GI6%^BjVhS782+7vdO_u-l6TZ`{8qmOBRFiv9*I= zN1XfHPQ=;Hdma2y2Os0$zah?ce&FC;#R$CY_Wpq3;s>suj}vGAKkwkbC(e34B+h!O z9DE&d*0YB=>!~Y+jhEQZ{4C)AGW(K72`>>%(m)C-loyKJY^?!$r@R#2+SknL-8fPm;V$ z9ReQBNZ7;aUQs`h?kbXB<=|f$Zs+%rQxbYOzIw)Rk)K8SUnkD>WIS<}|BSdS;RN|w zNSx)rCeHfz6Q4(V8Z=1whxPp9)I`46KO-9^IQJ(XIQY-96Y^ZXLkt)H$YGGLJ~;__ z*8j4D|LU}aJlpw(;dXhIH%cuZHeBSneLZ=4LZ188zd87jGZXTh-^UEM^V>f+A+eZ<+$&(2A$|91tc`G|8<^D~+yIO}gkobCB_ zVM3nU-ER#SKlFh}P_I@xFfE+WyIg zi++}0L!9m0N1WvwHcy>yp@X+^@IM)T0qkKve?szX&obg{&n^egydbrmqYbyyEo_mH z=lRQfhTHO|T$ozEh&a#hN{O@nI~;tVgO7Ib!-m^-Hn=EZC);yAaki(_!Miy43d8Mm z-)p7Q70&(D`-V$Bd=er-J+E}gPbK+LB)`}pzk=kSCi%?{`5h!bn&gi<Q!D-azu4?oGs5PbY_-M@gRbJW28|lKs^V`Nak8$(tPVok)H@$@g){-%s+g#U{}Im_vR9$@6^SHHZ9fNPZ^i|JWh_DarG=|CK|2 z3(3zQ{U=?K+W!rRe?#&)hD*7yoh?Y7%dr*7Zy-JQJLCru=X4(>J)G|AB+u!NBY94D zxkG*}aZYz5>EU#bl02tdw?z9>{KM&XGW-H?mj5YnPWNuo!|6Uv@~kID@|^BbhkU&& z6Xnk7o?^Jz&*@%7@|^BvB+uy%cgT+-d0r2E$szwL$@BW;yAJt3lRRH<{nR1_znSE@T_17qA6=F3GyCma!^O`WPquN$mytZjr4KpeN02>J>?a93xu5rl zb3Y$3TccA~FBF#HmypaSdc)elqQ!dhc zk;Qu$o?*%@(0^4+EnjHyK8Clk_+y3-w)i^3->~?t7i;}-i~q5e=J}?+1oqrfths0L zoJ%wxXz?Y6zhUu4mumTC7Ju3B#+mx}Al)-fhf!+rCk^js@pYH!^Ix&}h|4v9-{P+r zzSZLX6p4I%q zGm`bJAFIE2v3SpyG=Ib5AH1x2+~O6#(!BMV`ZKVz!K?cFP>VnRn&xj=e2(F>Eq>YS zT0Si|S^sFmeTz^1jh1`G;+MXq`4o%)Eq>ZxHTUxLXJF5x|IptD zTRiQbnpax9>!+G;wfMS8nwOY3E6{(*6rW}aTYm=nUswo5?kOCttezKXE zjkow_!>3q0yS~=5%Hn4m9>n!QzFsgqzcAUJ&4#zJc)r<3GTP#|H_+*hv-sPFe`N8! 
zhA*@DO{Z!-be3ivtF#H>f|HknA^ODQu(k$&C-{R8@A8ql)a#0JfaTZU{(fk98UuJlf#d{gP z%HsDMzSH6_8eaGOWcyz;ys5?CG2FNKR>Ox`{L<63{V!R(%r7>{~p70Eq-$&ZBJ{9#|-x^{;J`gGkloEhZ_Ep#m5>x-r{c>zRcp6ouU0u z_k!en%`&{y;w#S7@?9*Rk*j$>i(h5<2#eou_)8Xl(C~LG{%gZ0TKsLp7g_vohSzP8 zoUen1x3+k@Cfa`A;#qcY4~`Hzi0Rqi+^l*T5)o^&CUMBT#H{} zcq@xn817m87Q=g3yr$II)Eq<}#X;&xPb8Wy| zKKRH3gL~eU*7Cu@gY;+L;Aqg@{d%?>eD~17X)SyA9NaUlGd+_TsH5D+bQ_wC+iP|pYM)<*mXfAev!%vl zs{FgWtTlGY*OF5HnRo~1=W49%vD22HD#vL{GtB(80odnw+vkf;yX=9|s%Gi)-?bcQ z=Re1B{`-H|Rt*$o^s>+I`QU?hKX6ySM_LZJIA30^)L;EaAM3xDw{ERq>q{@Y@)C8_ zqyF#;|N1(igvLM{cUnsxgXY&qK{{8Khf8TWH zdGp3k)&KDPl7hUJhab2w(&*HZTW)=@{@wqo zf8UUnGbTUr)P;jy{O|Z@zrC@~S?>&t-8lJoH}#v|B>v+3!g{YRdFrx;-wYggU-lW3 z-fEK3?~1eihrRRvKJoq5e{0b6${YT;tbK=>Z``?OK+C?{pL~AsfL?D8Z9KGUR7RcY z&mKJb>DVp(I=;2V`(#~}H8Z16R=?^kb+gVXO~VOi zprdR=pRDgb7xcB9x*)4$)_f?cY&cfF&cN51A~hCLQ$%V{WrtIEYLY zk$p8HIP*KPcr{5gczv+!XNKBRKD z-pH**-k8R}^rCGFz38ol*P)CW$a{c=0W~Ey!)e^ zOVXAi$7}Izh!S3dqsOI`HsQS+Z?u8og@t9&Y543{m>q?{@WF-o#ZWSQSYfM*=+rG~ z-pK8hWs~MK#JOLVM|W04XXEHl*`&Fd6~$94iWlLxlgf&r;i{~)bt?*{fXeZ~y5;Fj zp?bO!nR^OO+FDk;rXsW+N8jM2dgTQ$c+jGBoLmvwQjD~}n+~;g%0nx_mz-P{I#d>2 zjB+@1up+ctn=oi^!;0vhiYU5=9bbtJWyOmtif4-ra?U9g#jEkl#zzMG;=LI3pNlAMar!LsO_ ziqI0A@ts&w5#3!DMb_tL%I|f`isw`mi%s(5NfpIQ$_n76fk#K+hw^m4Fry-hl6Y`U z2BdHj%!DJpgls*@G#r6bDnclgf%`j_7q8Jt&w``&m4yzJMZZ8!#I(7Rv^FTZt4s<) z5)>zVQKk!`qIj-0s9+Ufd3t$a0}(2Z&Qk)C_kl+<%8Qp)grr(rE+!!NIOiyAn_Cfz zR}@Q~F$Gas5!#P$JIkU|D?(C9S{G^)2CS=yg06y&{n~=5Wx5zDbXEAWZdt*UGF=r8 z)kPU(S45{P5nUB3iszL}0VG!ioLnB-TrTzRyLsY;vd}_R7eWUs#0vxWA1n(k(I%iO zluIepRD}xh!Ycfzt3p|^l-9roIRRt78V<`vJ9Ffx%p}cr{dFV@#G`>*h!QJJd zO%xn<JgT5k8tpZ?h!7Oy5Q;&hU9Z?;4ZDnNS}ZK=KJ*ts%NzOgoFQKpV0c>=o6&D|4@BG$+7x`HQXm0`a%1IHpl1_T5&Dl zKEafL=@ZcV)bt6hMm&C9TGmr@_8mPc^+ERxwR!~|AS8zdIxLVBRc3WGpc~r?9R!&1 zUl0AdZUw_W25vpt2Lrts6u(nm2n{H~_RvC1)R%%>&s7)SZ^#ChMm8oW2 z6{ToDYB)6`sUAc{XJP=w7^ufhJ!G2UQV*5JPKf9cQmdRN4?n3)6k82v4=9! 
zPfqV%H@!nfI%bLKm?frTmY9xNVtV(cvNYmd{ZMxFGPgK*X?q}4b)DV{Isg(q#o z=gHwooAJ>hBRpvfK0Cvt?zvi@*L+{rKkC8q(CG3jj$$7E(9S)dXPqNUOjtikN>38= zqbK{&<3oq9bkLNW53|lG*XPsx`-!bxu#)SI8CBRBYoFea)3C1PMTZsUdr`lzh%%-U ziuK9&WBq|I@ner>c(FUOz1V%ZUTjdl7kjkGkEZ!i4|a5f9bI5nk9HCGHma6{pW9<4Ke(cS$FvW{5 z3`Zw>`}!4@grk*ybXhn$U%r)wLzUs2`F^w=a?+tBJn6u-_$uF5l9gqqb1=dzU@(yl>=BePnnfUEPTrfNy@DJ&in}f zilgAN5^-6n7^d7d`DDp!k|!q!;-nvUC0%i>q>GN1bi3L~x3-f`cO^X&%m2}h zlFZr#x281fBk`T99=vXTbb92UC)M)E$+()5l=Ki+(my#?()q_r`r6t_KP%l17sesD zHD#_Ri#uJpe%t)$%=HxM=8|iFj-B)fSJLf{mGlM2OFCRT>0e3@$GJYqmGo?JnJd>5 z%#Y4o&y|8r&b92BZ&W~>LVi;uN&&e6F!G%R0y4^|puSMS(j!r@!>^H0DH*22}j(+Yp z>y+&`!;l)C;y3G)?=`!=C=!|$$(iTpEcA1}@N>TOawhxHhtZyfqhst|>Y+V(=#?>Y zbn!8;$p9CLl|*6#Ga|9R*^yX}+;FT*emK^*C_JV|tK`M%v8l#aGwgZM#onZY*Lua3 zUh`@hBs07y7JHDVR$lR9FElk=Tpcc+A1+=HE?yWeUKB1~>|@**IX^8tasl28@m_@Y z;xupHMo$LqDmA`r42LGk&>=$61sEJM@OwCfOfA!Fp$zSceeOrsYPLv*dc~&r(YR)d z{pdaU-o9<&&{P5FP~cY-nHU}I4Cl;D6n1vEHgmaF1IfuNQBVNt{FK0hA_|ZCk6emP+LKG)NaY7U)LTSk*3a)#^n2ws5+{XSl zqgyC!pJ})WFU3ElFEys@8D9hx_%s8_`f~-HJyl*Vi?`P|8EPD zbK)w?a1JuKB9b+}>Kgo+FMn3ZpY`%*tNht}O?c9G==!($&G$x%cSeeLMT&PviuXi{ zzljv@4Ts{Xx_L(x@r!r+#k;)FW-qiUsk#K$M=_a1`{>pM-Els;L7`7FG@2xpZq!+SG+MQ7fr)cI=$enq62`drLhJE7x|AHA$ks=P7bP>_I^RbJ)o zTOQ7OTi#o?WWZRNC8<8hn?4zV1g2>-BE=iR)2FI07!EUJcr0ES-d8Eak1mcxSBJAI zCgZ)*Z$35BWNsun-H*-npGh(xE!Xb_3c zQv*V@T8#9K=Hz>ms5iS#==`nmXR5XHL2K35Yt zk>a4zO_-tj*Xh$#<%<>j1=IZ4BZ#L~d$F?YaKX1;?DpJn!9Fi`Z@yQs(<|8H73}qz z{j4UWtoCzO`Z*i@ob8b&+rsm=N22?~^LHR7_2$RJ(G}i&Oz*dN^S6bg-+1%4hbOJQ zHazLeYa^LE!;{wG<3M=Q`fJ0Plf#oX;A3TY(nfr24QK96TE9v552B1<^@wA#_$5*v zsRx-8s;%svWd9_mpDOW4s4|i>Ba*WK6*-c#67@KevjH(`Bxjp1wfN|9YVn#hFV?0o zsU}M`mMV0=Mw(w1S|40P5x zht2QH$?wUZ-{ViP`iD*!V{Lbg#`knYB-`c0a*^7Zlf=KK|+x9>szT11np5m5Q!02`i;;kqhI=s&@iLh{6_t|$Wlft)u&?>AP=hm zdDoYEdEGoO532xq_`S)uezb0(AH&2Io3;Fy%wBJmNJ*x!*tO%wWFCu%DO|AAkI8%% zu|v3Emmj+iAG^Z^d%^|ZgfY+}uxN$F8$VXJP!@1vqAdrtUX-CE5NPCLJx2v!K1Lc9 zf#Q#A15y!oIF3$gCfo z>|@WOAH$3>hGrD&fSTF42w|)rot_w+a#ngdGrXJy-pJ}SzePPi50{Ygu-KJ{4Cb}Z z_wqWy`ej%i!P3yzA#dc`I$q{}SHj8tiyk*{wIbF589{#)>yAYotck_2CKmlXl35kW ztd3+Z3ujh^GpoHZCws-X?DK^eT4{#;gl$-#X>oRH+t#E>z3}!e^+s-leP6@A9exb4 zG%f0iOd8*ROD&OHfwG!Robe+EVCobib zA?1X1x!#4ipq66%=#$kSn+bPpz&^5V=&%OJrMBEYSwptsxV|^$W;w0{q%P5N>y>ElzUkBw+LeH>_=)gUVUoj6eG<3Oh`!eaFq=A^TNb3{~~lO3Fc19i@s z<`mg9s}sV3PDo_2g>EJG2x|5DNiHStCgQJD`RhUXlViyrK6U=y50b@!PIjCmEB>vN zw8g(T&<2RGDwXQsEF9>wL|C1*HaH6h`YaJvXYCBm!ht?Zgyk%3GE_;`56%%$bxwA0 z4i3~gC!15Q&o}4XfCHV7$f}cC1t;M^CnUnkywcz-9O$z|Se?}|I12~*ED=sBfsU>c z2;!qu{`biLaxDJGr%tpgNE8P;(T^li@&7U$DF5R?n<~P})Oc_f4)j?ftj^jWoP`5@ zmI$k}GJ>;kpwALvbyjY077p}TBCO6T3eLiTK1+nvStY?)IM8Q_a8e6YbN5jao~ZaF zg@67b3mXX+y=cC|OP}dOZN`Ye3>+vko*9VHEU;v3U@B3) zvG~S&}WITI%{BX77p}TA}nY9 za)c^XmE3FkSf4AWaGUy-{3vsDRX@BjgR$*mdSxkKq_{FG_hfHOo0GjO_jy_Ez6)oy z`#(gGPObeUJmd3Yh|jRbgbOut$pQNoO69TxRtY^>Y*H(QxcuP7hIG-(O<1>@UR2|i^KQHoEQM0l7);6ld3UBtWP zy8SsKR+T@A@sU5ob1Hu-$VdKEkdKv6733p-i0ouGmxuhtLMn@S$YM;^)sW3RWHW}v zHJQWYA)himkV|8E$Y@L!+i+nlZxG@jVP#0Ngw>%$u4n`d`rqXFd%QWOz9%?^Dor(in(=+l|G(Y8uKfLPS$}oe_rGxd z`ovZbOQmulRB!qkT!_gPrem^+2yF?`{g`2eMe%yF(O}G-SUScvy)hFDN26zq?^633 zAm@$gggL~-LU~M9{7$tjgoSia$YNnREM&|%-k675#XpgA#$4fzxmA|dac=xZS)38l zd0hNjbu8Nf#=NA`(5V**j?O~2EGJ=Q1M3m_@xNljQ2e4S(f2`N90sYMWNrHu`K1wl zk<0&*zZ1RuZ~A{~zfWwHta?!-ii^Q?&0q2+R`e6wFjX|!9ah*eejtHx@0nTlg5skC#w7#H zYHvc`l^><6R{3S7TM~afTRTlMeWFfF>HzKj%lHqme&R9&yr}9|uB)A3*}`vx z>A5Ue$ZCZwRbZXMk77rrEJ2{_$K4UK^ng*VH)iQrZ$KxHkwcb-unHjS5wZ?}^#?z_ zEtK5=Wj8|^#)3w7LD_vu8QkI*;^@}G9-^$TC>tosh9-O!KSg#4$?9fICYED{kBc9W z7NZ)1Gzh5#UklNd;8wYAu#IlbVlGfSQ|S1r!zZd->X0f9j+z 
z9;n)diYB3egaNRiQxV2^KYa++e-8MK9>;b@jP2K#%GeH%%BbF`f5$NPJ2dJ92V5bm zf3otISoOnlg>JvC;^nfwD=oBpwQA1?R9lnvUb(e$c#pU=K0lVU=HbV1l{_Yw`7wsZ zurVjruGCYD5Cb}51w_^%Q0p;-t6JY7Lsk3E+3G3-f)csRfG{No3r_=z&`zX&dpW4) z(4>cAFK3&IF0g!}0t~EcoX9F=jsMW)#ak)63r*3cUNcCl-6nqVV(DIVs<-bvY~qov zBeG4TPlnnj64(1oun)9PHa35njU{TEPoG@uY^ebYAd!!)Ej0jJQTh~xu@@zFU4~Y& z$Rtv87IrnElvH8~-9CO+dtFSLK@yQrmF6Okq=S?qq4}B{3xdrrkCPCK zx}yia2Qvd|WU@X+4_cgBJ%C<1{j@V@9?fX(~E88g+ zV#0*E532ooG{2gyV~c`qO-e?(O4*8*w6$z4CWmSt-~LEwMi{HO;m~~fkoky80ou&Q zGgL}w$3`{vJrbHN2?x9Xpw!?wH5h3H+twnXxf)~l8j_r$+J03ev;du^Y&(gBCWmtt z%8{Dmt z=PZ^ZHI>6U=>U{ZQP)9uP4fM=BTXI?Ddfq^eaj z|4gc1LnYKjbaHE9Uy1JqdeN`rAF4Q1#(en+1Aku$IS0n&|8&Lwi4fKwQ(ib!2ArWM zwyiXI{WmFn^ymxYUtuJQ_Y}n^GKf)g;ltGVQ~IBlkq(gnHr7Vx_$@B=^B&8HNMv~( zBFh^QS>BSbBFj6A5H;?^`vBgMy#gO|Bsf5zDM4j6zDrPGVPT`;yAvFv z9a^5|*nyKZVMZjhC-F0eT{J{<+BXS~(HAYzG={x39HwByQWa{eHU&uM3TyPYV;6Oko~weB@s}; z6qHFE-LW%&1$JNRtgG&`*R%u1Z5^@O}DO3H^1_U;1SJSF%nyj*WQv9O!2pDCGQ~pwP^lVC@liF76kx}{pd9?(Geb2t^?Nw9 zNyOA$0X@-ZgRv%zox3$0jdqK&t7gd2@{+JgCmhAVN02 zF#)$Igm2UkqylgB+cNq}tR48Ky2dxbXq@B&vB!+MN%a7}Q30!}j%r}3@l7xc!v&Hr z&EcEr@J+6E+K#{oOj4X1j5dZTi#T%k^@sW(BOg1Zrt zd@~8Y!61o{CbS7bk}|^hCU8YKDxD>KlkmY;+Bd>sOCsxE8!r2+vL^Hke3S5j&WlnA z-vs@31oJjAL2>wIoqSXCLUbt-Fv&D9@J+nNH$ecE)E2}Z6DS#P)YKc5^F$m~5Z-M{{{aIiG#;R<;MX?_}C>KDnaY?PeN9U>~c$qWZgzKkY$SOR=KnD>Eq5aEz zX|Bwd@TulYm^XdF^Cee!hRrjhFh|qZM=3ti>*qD2smxVzr2%WsvUHuhr!4F79PDP8 z89!-$8(HP7!mQTy)RZrS$@9=RZH8vzL_6Y?AnM>KNSqQx9f>Fi@^Fd{dBg`sr|5xGf~bRopQNY*agdaRTHIH`moiyR zki{v9%m-1I?j142n7jqSRs=V7Bu5-ZK;4tiN8b%rxg&WMD9xCioZiVkwb2WIFNJ#b18XYgWxIHe}eK;$DOVVn~9peFuG zWImCei7`yxg77L5T@R;lj9~;Q&VoT0Ukx{Wk4{8FhL~T!` zXMzipw;-;HM7P2z99$TIM8Z;264$yC%Oc_wy%-`jL!G7!fKyO85NV)W7N;QdMrR_@ z5vK%^MI;nNI@&4P6*Vg~Dv%MU1d#;?IN}r}zEix4PzMF!xc|#D*T;kX1+tF-vu4bd zzt5ij6?#wqdfL_#QF{SLZBdY|RI)YUdu~g(2O7QD2P$cOcK|Nd;?{z1 zWG6!_b!UOxqlg>&z)OR>`UV$zk>+3dv4ThwY)ij4Bisb7w=5gCovM3gnrxQaMZ!(? 
z;Qj`4e*zTH9f4m(axl-r(2Por=x~oC9tmQOWAB$p9p80Uizh9y^9`%veQ>p^KQ3V6 z8AMqB*r+a8Vf|x4Ty_YY(2`cI^%>}|7smJu{aC+JNsKvLQb7{9Zjwmi$0wSEtdB`> zB=vj0={3}X40;^%9SscYFmnAkLp6ZnZ*ld^$1Q|e6RK3Yh&O_>Re-LB*5Gu#oT53J zBigK1ON%ISIXf82F=;{=6T$5ZvQ~k{i?s?9N+FFZWuXq4^jrWcgQy{Co(`2YSG;SM zEpWL@KeED)E`{~gav!KfOM!+O>tUryDzVffJsVjmmS}k-6^yxhl>LWq2Z~p5lVR2TP%BINvfvt6zf3*a3f2c2YOIHqMx{Dab68JSCRra0 zj+lI542%@ZkfVH$`M$9pY3Qz6U+t)|9!VLVNJ^{^h9Wc13x}}&WS*OnH37{Qw{zoJ zFluNM)iu3%pf3{ISYx6Y+Dtk;clM)uc>1jRe|60k-E{P@-=evn_cJWiK9VhqY1nZU z#y(ffYH1T{5+~Q?g4)lMFR}VdsQ!{w zB&I^jHP8DO#BJkmg(XP{O_P;`0IT%ahptvb zIzvkCij#=|Z@QxN%WhS}RgD22Dl7K3#pNHV%SDY%{c+nxEe*l`Yki9yR^Qd;vbk~t zyIj*sVW(Na(RPZFFz`hDQ+BHrp~XkZJ-R6#XdqW$YUMQ1CXV5MFNU}vag{tQ;VN#k zi-o*>U*HB;`0n;>uORLfZ1)Pbc+KubVIx-A70y`^&e`HMsSMALqZ`8mGm!J}{LQ$& zAD+L(!xITSJfpzRoa#;b3J)h(?#VL@GIxhFKZp6RdC`kiL17nt%0jAH|M6qz1=dBP zYo%}^nX@CAiz1n;yy;WL8L-sPs@NEq>KCj{ znG3>M<&}8vLJx?K6}V$SZb#mNJ${IY2cQ=`jvtfbt5o&3MIEt7-B=)aUKk9@F%H{pHc^x5`J}1)j~~eY z-`B3B;;D6i#S>dwVb_g3|MX(n=X<>8pPq{?#Td5+VcdEYdtlOz{k+p@|CE136_1)n zlcM_I(K7UW)1CT3rls->7Su-BH9Tf`etfyc9-h)C&o|Xd%3Vs{Q__tKWoa0FLXxKE z<-wM6n^3Gz-w5`m%;|52WwnVjQa3E~`I z?@IVjVq!FPj$cr}xm{gcJK?kKgy+EclsQ(1McI5ci>Nj_ee^NW)`|O05!}ydrEHL z5}qzrJ97M<`puo=8*3*#)K2(o7@so7?T?Z0CC5p4fx2-<3^OBJQT!-|!yw19%m zb0XMEgXIU=wt|~o_AT==%_C(~pA?qo)XfMNPx6{i3KviIiYJFdRl%Zz%u#dh6p8d$S&BrEhwoO0jRzC&SGBSAHZLo|co8l1+V}36NVX5#KSf){8bWq1Q9%oi(#?e8((!zy6M?GoHS#uA_N) zVw*g3JNY@D;ha@r>}=1$HlrOfMV0BP3GwaHMnImlzVZW>AGw3d&n~;#1reoA`TwXJKqnjm5uW?C8BmRt(S)@mbg zXRj74$0H$?)N~JPf1$OJoK2CO-4S`nEf&U-{D?s;`Dr1+{3v>+zLjaQ?MYoy^1R39 zaj$rbY6bCd@uqO`=5X;At9FSMsy695Z8w5cPOPc+!6O^xmU?;cF>blUL;Y34RbKHd zmGJCv@tkn++;H(c+i8-ps*1`RC*n3qIEBN!*qL}N4|dm&uQ~^R=F6WI=Y+GyZ#hRr z1IiKM;)DxiVj3)rs#`IV9bUZIE8gT`uV83W@+{J-zwOcJyOrQ}L3GYigRaMxAyl{!#s+Xz=7vJX5R+?-gG1lS4nkYQXStL1nlA{W9*gpA;_8 zk04sHB|FK?pniOezJG)|=@a=Y-@>2Ck-Q^UR%2-giyB^ZP9(Y=>#~!vY$MMNm4}6H zi!`tHn{V>5ed1jVb6CDD$Ldb`LYOoMrftWn2p$(2X?|qn=FmeQgeUFDkIt;X?U4AN z9N|gZ^8Mxq{pJTE%@0GuGHn#J>HRrnSeQ$C{AqNt?B&27-KTcSVhg76esgqPQ^U<= zW$AOg7a!}%rm|A=)ZSDNG%dpuO&|Jw?fysIYl#FuE`>>v=;UzoossCQaFhAonDVp` zo-!?7!5cdR@W`Y|Utf!dReJL`dzs&j+!T6ft~bVqlss>}`A#`$peHxJ*+>CrC zhL;R9bxd}Y_Hyk%(VsQp3}^~B$NC{UgLd+aBwF97`Tw9#Y|WK00>kZ@614m{4K=_| zUS3JuYHIF-l}g|@u{aF50^_E6SYX`dg zF;C5oe<(IZ>tfC=>mk#pzYl5ohb2-|X%dTj-yZ-$_>+yOX9N!YX1L#J{&qjKSIXgU zGU~=ROMs@=SY$bGQIu?-aCDIvp49rB)vIEl$j3kdPox+ePVkBKsNdm&)ipit z3HP-;exnPd@jHxXvP!QwRTkv`y+?)rC)rsVE?D!!W=E-$>?}ogs;y`vcxreg=R4`_ z&z8*dc@`Lp+I{g`PyDjAQKuKk$lW!(7((Umpo2GL7W_m#B25dFRcb`s!u z5>RC>L(jMs-8n83mamr(5TD=G`emI2i1_7DtzYoSzV@x~%fVW|tOdnwXOadD5^qqHuIo7>nU*xR`{Fai*8m7EiTqI~$+zNb@SM z`7A$lD7;TS)EYLU^DM{nr^{F2JqPdY;W1@(&`*1l;&tR^#4#5ZVVscO`FS;lEc5VR zoFq=(gdt?V-~5msNRrB+rhofP-FWV>b1SS1EELOX+q5;wmZ@UIU-x`2-op!%k~BQT zAq_LSw07-pOG7tODGZ&Adi}rOkL{nv#!sO~Z%zvh%?{-?I5}f1h92>0Q}f$-#+l{# zE=4J?>W27J(J=ezI%N%WN2a%JnE!0uwhfEMoYb~qtLN&qZCLXB$zH=A9Z#*7{%FGn zr(|@VS3b9VPWkNe&&#JN-9wd@w6u=qQ__3Z95*30&Kwt-&r*DTV2&GM#pF0Pv}-sv zGkrrnO1->chNQLM=y(woMqa`iNC1V`Z_TH$d~+=SJV#!_S{wGJBAUr#SLF4s{zxmu zXII182s^f~${Uv6Ft1{6#hi-S6`xm3t7wSjFO}9fqvIE%1FurNKQOG9faB@y__JDi zMaO=l^=d~9-;%UQoOes)>Xi9ZF|60t8}z7ZQp+>-sj!ul;=I8|#|y{T0Uy=5E0vCq zjE)aaOvmU{Iu08h%TG+lA5-Zll@1oK|D=wOGCHDKmGbi|hV>G5?6N9v=&0kX0=-h# zkEdf`dgqXpLDl|cV8QDy{gGCP&(?>xl-Iwj zL+l-8bOg2ud&RJz-xD_8Fs>(+hMksN7OBRgb`3|Q*IA!D6sWpZSH=;%&OZ?ym7zNO zQ|R!Fj$2MlM|xo>SxG6)2^@ zymUf3YWKZT4_o8voV@NbeiP;y_H*@%POGb7|0V3W_SWfQ8(fOh#u*(aP`_ooS({47 z2BYJS6S1RXUt5LhQ;Kj6R$gD4_Ew7HHioG<=-Byd*AVSyfK{lXV}#LBh2!J+Na=Xh z(jmT|VszY2wk$7$3~KQc^>EyuP^>P5M2Tq*P(Haf=Q_&BzSj&8ONv8@#QG~_i({EU~3kGC53 
+#include 
+#include 
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+void printbytes(byte *toprint, int length){
+  for(int i=0;i

+#include 
+#include 
+#include "blst/blst.h"
+
+const byte dst[] = "MY-DST";
+double time_taken;
+clock_t t;
+
+byte signer_private_key[32];
+byte signer_public_key[96];
+
+void printbytes(byte *toprint, int length){
+  for(int i=0;i
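The two partial hunks above look like the preambles of two of the C programs this patch adds; each one breaks off after its `for(int i=0;i` line, and the angle-bracketed `#include` header names were lost as well. As rough orientation only, the following is a minimal, self-contained sketch (not the patch's code) of how such a preamble is typically completed against the blst C API: the hex-printing helper plus signer key generation. It assumes the min_sig parameterization, where a compressed G2 public key is 96 bytes (matching signer_public_key[96] above; a 96-byte buffer could equally hold an uncompressed G1 key in the min_pk scheme). The IKM seed, the main() body, and the output format are illustrative assumptions.

#include <stdio.h>
#include "blst/blst.h"

/* Illustrative sketch only -- not the code carried by this patch.
 * Assumes the min_sig parameterization: 32-byte secret scalars and
 * 96-byte compressed G2 public keys. */

void printbytes(byte *toprint, int length){
    for(int i = 0; i < length; i++)
        printf("%02x", toprint[i]);   /* hex-dump each byte */
    printf("\n");
}

int main(void){
    /* blst_keygen requires at least 32 bytes of input keying material;
     * this fixed seed is a placeholder for illustration only. */
    byte ikm[32] = "this-is-not-a-real-secret-seed!";

    blst_scalar sk;
    blst_keygen(&sk, ikm, sizeof(ikm), NULL, 0);

    byte signer_private_key[32];
    blst_bendian_from_scalar(signer_private_key, &sk); /* big-endian scalar export */

    blst_p2 pk;
    blst_sk_to_pk_in_g2(&pk, &sk);                     /* pk = sk * G2 generator */

    byte signer_public_key[96];
    blst_p2_compress(signer_public_key, &pk);          /* 96-byte compressed G2 point */

    printbytes(signer_private_key, 32);
    printbytes(signer_public_key, 96);
    return 0;
}

Compiling this against blst/blst.h and linking the blst library prints the hex-encoded key pair; the dst constant seen in the hunks above is a domain-separation tag of the kind blst's hash-to-curve routines take.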
z17?wX1QOwRuvVYr&*|4IF*~;t0zL}}vvKbR*5J(?HHKHAU)KEv7+aL zbe?t041+s}s<>Bq4H0Azu%X@w?&zAVN^r;6B(DX-G1WOf0dxI_^)#1kz%tFj#3{G; z&dxpK1i7ped^G|0d(l7O9`^hv)ii@!WqnIwkE*!&UI2b;7aEp`>Mhd$mbE>sqt9n;Yw?G`YM$Sk$$DHZ@XNT&-3xBYWFb?`I^l&DEWs_AK@6i>3e)fN{5=|B z9hojPGQGYi9M+tvb^QLQ!eVRK?HqZ&h^^*KwW;L=*JOt5T27%gQ|tMsXB;nr|CUU> zHPUugskY%5D}L)Zescy4Xdt;F(+JeM{;v(R0A@D_SsV{?O{T3Fjw)8Ei>s{4&?b!f zAROpmNv6^BH)~9;%B(5w1syEO)I?RtD;z#vo@+AIfG*K3s-K=&YDRW~D>5rFvg_Yx zYP~4yw6RJGfIjtCF_Ma~NB9~wCd$B9H4&1`>K1_z zW3Pz3IF9M82!u)7OoBVpUC{#CGb%!B%>EO;D4EZM0>faSh*Bq=jlpyY2Fxf$OArG* zRk&(?Gq0r&!JmtF=v%3C*w)!==;=D=~i(W|(abijnr}%&>&fNE@rv z42z9Y1Z&EhVI{`gYh#9G|JMu?7K#;HF`q(m8Rb#*YC;hmcUB|@;}^w@j!}#rDjJ2N zbE#IJWnN|VDSUkrGe zHj@BervWe0;_(_48RVR-(PBlD<*WZS!>Cwd>=lt0QDo?~WC15|PT0oKLmA35-4!h$ zTQ!y^|JV*FqP+M;(aKKp!q9gS)lG^416K(K{?`nP0(^g4Gpy9YDk2pxxFSPaWKuKC zxy{6+Bog&UXEMu6MO+cSbNpXBYJ1Ax=sYy!>=|_CKI-`0&Yq2k#bcvsBLc?QqPow? zKbvyq4I*aRsDkrz5G$29X-Fn)Mug>|5kxxjTT=|mF55mnu0n>28^{Mif6IcU&R zQN(zOB~HqSVH7)(z^PThZ7>yuM)bMQ7}h;WyZQ#u62qG{4kCfrU%3!b*=sU`5S0$1 zUDXU~E<+X9-{9O86VR|zcn6nN!rrZ!Iuq4nShG=j92S!p5pfgFyd_gD{e+JHJ28pR zV@MGdROpiN&LK!TZai=!12Ih#@4POv1}|Lf5U}O5@=k3kaJV+B5yFRRI0okyOQU;D zI~84o2j}?@2uZR|t<<^C3<_uvsx#3$2fdF$l$su%1h~@D3vS7D;e~0o)tcqFRGU7^AynQ9Hzb>gz!b_iC6Hn`$H zZc5Z{<8DcZa@PC}x(BtR;GqLp^ZU#iI>9Q5{tC8%5e5Ej2TP{}&GXMAl%rwuE92w3 z{8?+To40Z>6?7%4RtuqE>7qFGZPbryo9TipFmS$VTPeF(xOgNd7 z(7%U0-(|^Z#3aL&T@4CGY*{98I@c656;I~%*7&30B^ciW4^+9qDfOyTyK*@g9(bBC zT#{+T0J0L=CsO2ho9;g<{JX>q>pZM8rU}|mbZo{{*wrLlXhwiK$Y#w;5EJl{Z15sS zsOJhIIIY^5>ULi(!KL*Uu_h}jhDw%!S+H0dkoH~*2_rv>9rGK2N{CL^f1dbQ=H6|j z1b4+EDPcNugGKe&p=NYm?U~7z0JcR8lVBUdyu#*SWhSCZ@;Sd!?(h#ECh)ul{bp;^_p7OJGc#Wq$8=985-K&67Eg-HWK7~rgk>?kb@M*<*4 z+-!$s!Mv2tvYV=)f$2{Anv=1+!lD52_M&J-%AL31(rWjvgRVluP0+VR1dJJt`4O_0`jWLL7H#fju!$Lb6tVemtVCL@z(|lv0@*rC?e*bZqE&LI&~9l2rEKh#G?P8O33qmx`;vWCJGXfy$v{xW_BXuI4m2ew+igX7R?jUGBX2gB25ovg}46h`r2gPs+ z6$BI@#ApO!wP`B05w`YXfhdB;mXOe_Yk*D#2UwR<89RQK^nx(2@`A-#$Ulk-+8K`L zKL~IUr%cE=$dG6XUSM_#(L#EKIDyk@Oc(<{EHJb(l9ByG%*cLnLi7Q5RYlUUs3^uH zpaU4}#n@-quUFy|lLG`=hH?K|P)`Ua8<9ra2QI4QScClam2yTUKW7OOKc>O5387;8xi9n&C1J{y@WnPt*nsd934nR+k^p!Sy>V-!I=b(prMMWzsN58Toeo zdrdva5|8xxfQTCo%ow(Ldk(F+t+xE&IMm;o*A&rfA4m@!8I!sWglV#f{Aibm{T&wo zhSjn{uu&j-In`Ptm^>(vMg^VbQ{lzU3VPuoDYqpMj2iR<+tiX4!fr53BaWq%uWAX_ z3YvuV*a*Vd$N`1csD5Jp5eri(B*hQrFdKH({f@z~lD`*|kZAf45k@wMTd53k+h&%b zh_guUYXl_CNem5K6^mJzb#~G1Sp5oZrAOU~d#Tn`-bcAu3J+EX!SZa2@_u3HW1{c` zv=tZJMa=1!EZ?n`q`jB4ZPR^ZC?L(z^?&OGKd+UsHzG+aeDhv7V8IzkqJ5?n5tmWK zO!!t&NH_rJ1-1?nw_Y00Jz-l13=_5QP#rNp&zOTqi=Km|TgmmR$?ep|uLG!X=H_%e z{;FQW9f{HHuK$!8j1+V##|!Rax3>{-!dY9x2Fduls}26}Z4kMp_#aep1zbvTD|PKd zRB_`R6xYVJ|GZE7&NiU9Ixc>k#qGTul1S@8zTry_5~lH7k$UiM{J&1TgFUPqQ4@7J zO{Ep8d_skM7@a*)CY*zwT>oh&(6iS@F^p`bU)(zZ=s1Rl2da{gKsd_W7PSQdBzb?f zjhydw)Y2+L9F~S;0s7ml7@uk-0IhpJC(2M5pwBoTD-w&9vs1Nt>N4&2L8TnHK+PMDbEEK;ib$DG)TkC6PYif@C#u zEZG3wT+#yzSEy!2ZU>Chl>}US1j&}Vl~%|8s-7;Hqi(x3%J;i*4CP+o!}x!Y^9$la zYQ*Pvq)rIllQ}+E@~#RdRXYKKK^QN!KK4 zj)j7FCMw;WMnlA&LM})|-W2{Ph{6qS$&e_$;70zDB(>4-Vj<%5&VR+}emVwwg9Wyg z0t!lt9nj#8HgscE6q<`9PjJW3WUnKbr$%=mSxtI<6OrD>6Ql}pey$slxz2=(@2J#IPq2c5N8orvHoC75|FC34&-^sFlP11s* zMiFc#%3KueE(*5`t@V=|D5KSpm#K%_rL`SyKMvB(=pyT!NFv~06mEY_Ac({51K^k# zg1>Zk13Md&?{0iT52yHKsg}`dJG&e=o z)1r1_=qRa`9x2mfIq(>AD8m*CzV;r9_P>mwmr@%=yb$w&1dcQsl-2avq_MwympBk(JJ^gC`gJZ0%X6{r;J9?BBhUI%}p4O3VRmg{8UvM~E$=Lm08 zMU2RDa4Eb4>6qi}hcw1v8-W(Gu~0G=fR+}B6cXNT;qFHPZ?qE~NB6`Qh}}1v)fBwN z3;!1b8zI*|IR*Qc+7uhNUU0V|aDI3og}eQ$+PxCcOb>htZP! 
ziMMM)!u}xB&Y{RijKg!tXUf)X9Vkhzq98N$ugLGKMwLD z+;gQ=K5MKsqlbW793O+4!Q)<*3=p}#C0zdN@$r$ZiR@Pj zVwalWf#8$la*z>m45VWqNK?c71pEvtyLea)+Zi($^RG|NX?7n@U>cX4<93dxJ6418 zklnEidMWCzZe2abVuv{Ors7@whyc$`Mf}D2&Gr0S>jX~9X+UVZ^>A)3)N*fZkjF6# z=V69u-de4X-Fbc{gT$1Pg4@1>eUIEzAT$gcIl<@AM%3WXur-aT4MtPTcgbu{T_^Am zb?et<37W#<;wHhD>`kE5^ZVd^Oa2Q;Zr8{(9NlUOZeUw4_)_*B{H<^cm3tARdqK`! z=f~yj9`*`wxV90D3E0(vQ=?tZVdSQn8-9W}1k^ZD$MY9pI{k8{69)w&LC-xdRIid9 zILI7@)8yug5=Vsv8KGpHLpSGHXd^&mspZoLu4JcDPMe8*G(DOi10!9L5Jla3?7ish zM%Gsu1}=`6yDFc?&G%)!2Fy3z*$5|h?)sFFzn1Eiz>mg{f2QkBaj&3 zV1y4j1eXGDR&%vDU@)XiF^V?#cT7}5wX9Q65eMuG0X|hP-)jp+iLYIaS;2_0C6&Z8 zur7VD<8XR3DX9iT$s!-BbKom)$zvCA@>e4dxax{LB$yjf*F@xT z58;j|WTl!#6yD_=E#)b6QYT`k9G$G{WAh}b%+77&I{LtX2ZPljPOBnm~Lu{B0gx}!8%h?xY*KH3fBZO zwGoj}xYP|)xIa+}cL+1H6mFJi7loTW0EN4{oWjjQ;fjg)sub?({VH7com9AJBMNsd zBr6o|YD?kDY))M#@F<1brDqDoW+_~jPM~nv)(d{Y8s&5@sWY#1?lGwOs&sDluc~v^ z?&s9iE)oR32Kg{V^BkI2xP_k(|#=DnI6X1F_wMMFH(f*%Vkw1mGw;HKuqFmIqJ26w7|7hHBc0rp2nqY z=l$Q+-=&UH?zBBo%ELV$;?=U z?Fj?ri=eWLvdogKkQy-AVTsWkg?0Ks_-@p6gKx;%20!E<2N|F6Gf)q^)^LT0iRqF7 zskSAeR4?U;8QYTEJ0N7^o|*kxMcLv>E34MS>T%;;>Na!x7dy1kt_u@@Rtpf)uvW}8 z)tAn|l=2!Ho>*PAw6aMmp`0_GqI_3l+6IIw!dO^WMJ5~`DjJ@RZP=0KvVtZ?)2!;) zlWAi)qYWIpj{S`ONk)?{;*Qn6icF6IJ*D#7d%0ki5>FU0OYu&wQi3J+E3ScRh+9Ig*K=M7gts!ravvXr>d zJ%kyk%v8Y_(5R3A3$oUfOM-$F(^p|ac%yVnYf%uYmdOS0;O(86wy00Z;ZX~3I>Vw^ zYh_9k`}s+BtR*`Rq01yY)+Ed%!;akahuc}>9R8Rbd_Xv>y_ZWi(Q-O66W%f8f%Vi{ z(p#C@ieN1-;RPYGkW+6a3n_VQQ(j0SVA6$blL}(MlkoSykhnVMCdrOdYf!Y1cza1^ zv3Z5epx0s*31e~<%L~=AtH_m_Xcd*>gF{O>>foRXqC(JS%dB9eiNb-;wN$~~Fz}BM zBg0`0m`Y0IIsW5xG?u~dw@JWj7|J0%W0SjQ6T_tL+)ZxxGwK2?a}AKrpAf~N_R1N& z7kvU$f-Ln8`?HF{%J#q6-LqNszuD~`T_~B7z!`R_ahaS>aFFILvtDUI9H311>h>$) zePSC!Hx>=8+=)Y`Ar4gKHT;R;-yUZV;@`+MnA_uYk3{2wzl+9oXFYC@sU;7kfEF8C zCCIRp4r;b~$VCV^ZNQjMGZ@uERKn@@T#iG6)~MN zCX<3ha5vNG@S~&1850345>dq91?Phg5Y(150i|QKg41*p1%R~HydUQ|Bu=Ytor)@( z%^^)>hyyOOm%(Xxh|3_Sw`3W*<;)Zav61M;_!wMSR8h1IJZS|vaL*B*NyXohQy6z< zRtw>v7Sx?)-YAG=?6k<1im{7V7|k*Ql;e>yTlNF1W>qD)$iL?W5+XxQ)v3`V7gc#Pq*$c#kr zCyKD$nQ7#498DRv0F<6!aEyjKe$UQsZIdB!<_CS^Ww;5&YAp&#l}83t8>XfM%)-I2 zm2mP(O0~j_!rsw7o*P`w87Wz!Gl%zX^eTv;pH zk!j;wC~>NMU2DE1$wej@aWjNk4$zj3Yc)!6(%N>sCew&Rv8>%?YT?c(Axb1U$0h?W zm|870ugTP6NiZsNCl}+Cj!?wuWF14p*JLVWp^=*e^qAUS=2XwW&!!*PI9y)b$0Xe$ z?OhVAt*KritEbX1nSi+zJy?b&4Pp`njtyymKIbD(Il3E7jT7YcN$=2K2wgz{o>jB;1p2egwZD6a= z5DJCxq>yq;2X2FV)=ldaE8u7De+->`?cYWnm*(UY}u8JuYRj6EU8y437zydnx$<<`H5AOc%5G)QDg- zKCtdu9I3-kf)4K6;TUx53Dcn%pzGTqji`M_ow3 z7zbN%MkRIO2~K+tMF%d-%QoGEAD~SDKz`%EPdymHaNW=OQyD9x;eQ}t%do%Iyrtol z+y=;I&S;|9s$275@9q)VjPnsfQL>@YAKv>2Mb?+$hMpjH{up|aA&q-E8vY1XcmmbY zd%`@V(b!_V((B~%IiP_whKmk7<&%C%`c;YcQQ~%dKCjM@1h-0HdYL05t&{}zK@9rI zAabmZ1hoH7V1bo!9z_(q;7H_QYZ+1-XH79X2p(}%mQXT1Xv-BEDA;o#SHw-(Y$X>2 zL0DG6CRnlGd%dDPnm1PX!3%!Mg_K6BTpOBdJ`gZiS7Uo#A=lW!tLJQiLeM>&Tc~Qq zt1JZv)d?3y*inUw2f48ZR2bHd@Iux;XcN~+df=#r+jUXa^1pNuuvc6QxQhH_=xd=| z)uo6UAWcKszF|nsIub*%NEQpQsBea6qMr5ma>}H!XrA>VYrFzN93~PVvI0iSp75}T z?J33zQGp0-Mx>yJ0KmnyYD;SY5yiD2z~a72??O`PA?-UH;UlIL6WA57l9GFZCKSWj zMF3Tz|<)P^vr3+Mz&A*--sK*++-i3vj9qBt5Z zAt*!fMa+t~WGLoLR0i)wz$hIhcAHHR^3LX!q}(yRYk*~5IEfVDx;Xg+bc3;FLqj}Q zkC&MAQ*_p}bau4@T6jvhdf~TCa#ED?5md{CU_gf!32oq={nQM~9{$=-fC1o06Y|-A zrkdzAY61rcx&V}$-;ix-*p^%8!1AfV9zVnd&ShZ4FLFU4@MO+^!WK+R=XmRJjHy&{ zOR)MbnUYpdqv4|8X!SH2PRGE4cWFBE9KgU$;+A@H8%Cj?^6$x{#3*B0`&H0Z#eTg> zH!x|KXP0BAUd(VcPXOZyHXO-Wco@&UaRAv%@GEZ)>3v5eZKH$@{U$H*9b z^*b|*(Yw?(K>(!IQEi2%0V<{%1m7yLFL0~9WnV<6dE?_7k$KT0Kt|>TeGftjPURmt zvLUK&OL!GG7$DTmkquo;hC*?4WJ4F9tBRu|8@l*osyI5bp^HDnVjp8rIk)!`rWm1d zhFbd$8f7rN5PH+52)!Y0v>~G1A0D#@>!h;l$(m{+%9BvsE$Ju>iytaAZ^^WQG$o29 
z7Tdq|axPc0o$KR%5z>lB2jYS7EW8p&6YoSi;dGe19SsL}qm_|jA}FAHqIIMLWjkf$ zcEH9q=VDx zdLh##;d-$Xx?URmI(7o-1K2T#7<~F&x)Z2H8|H|nr9V;aw778Yku=7lzr2r*R6&W!yJALHU z7*VP}v8>=-8K(<1X;g{92lvWEJ5=d1eqX zec8qkoi0-xW2nn2wJ=1d%k&!UqL9B0=O_um8LmFaWy(i5=U%oaY~}ijQN282db-`Tu}*;HRchOylF&l^C~3*P%t$F z_)p0y-FHY`l)*%Kz`@DsL#P)=(JUE)In^^B14uyYid$r1Wm_dGL04ix>*s#u8XtXZtCD z!5A5BMbpB-(JR3uK%caL94P(?=txhX1m0+3tPK9L=gJ~1R9pmGs#ZouEJo)jNA0Pn z6rmWIGA!w7uu$gm6A%lf0$2vLTBlY8u~@9K&1lAg;+FHn!AHlN?m~qM_o1i7j&KW! zTvw#PX!AlJgyb<$`@A2%2yi1AQY@y?@ao%WP`+}|g^RRl*K{|;BaDojmoe$Vhbz4$ ztY(nd+!(SmJV)B}9i&_6o4;Bbc3v=G;O;pYmYvyXXcPwQI#%-1 z&3m1O+`9C?pNU6FH9!|Ws9!Euq%c+?dHOw>xQ@Ehf`Rz202BI zZue8u@%w6I%7P%!d%BYd9GQ>IOGHNzLWbtc4UkwafGGr18}{xZeifvpVYK!2Zea#C zUb3*YbN7!RjduAB+}$ahj9|mQaJ597_5$+jn0S=Ji1U`iNMD5iax0vMu?x;Z)s<2( z57xxqLT)v7cTu&_vc^dbI1TI5`IE2?_y!KkaejBRd;{lSSXOodP1>Sge#7vmP>It8 zQ0tJqtg#bR^5QrE-Bs<(>*aPt+R1MnL*7*DSxxs4Meak}T>p9Pq4>G3k@qFgIr8%e zC8Ob4u*vC`wrgzP^@7t>jJb!$<%@h4nWBaqWS?SnK3ZMWJ<>55)~<#s|aCPJ;U*- zXOo^i1M-xNNrRoOnDEzBHUAeE@J*;{J_WEO(Q<#P=38JmQ8mwy^I}Oo1HWxmL&-Ywp;A@z zucfNy4V0_xJzeY&+UD0IgGJirhlrsWTix{Q!Jt}~YNCWEd3K~acu)8BGK61Ten=dNXju_-RP$?ljc0d%>3$n4F z`66B@JDB^1d@?YwT2~Pas%EfC+AFhH<-s@&kCn}X!M3`LB+8|Vra2%{raAQ;`)9zk z(dbHmu7lQ4)x?!D^ku4Qg%T+z(JHG|Gd?Od!H`5AHKoN;6MKQzNiaLAphCFu7PoqV z;HWjIqo-^dMHr?B%Y~8*BgB)Dvib;xAKWhjm*+@LMKVg~$=F08|W@*t5Ctq!hjE9OC@ zb#Qg3%7$5)CrG}S4PR9Xsc<3*b84JODm_q61i;{@h)yQv#9A@F_REQlSR5OMh!fd< zYMj^>n|Y<2$o?nhMB*x9Cqt!E;Y4-!mSGzhlVF7+W*h=oQH7~8BWGmK1jGWe1mak!OHFY=%tvBsz{LQk|>;#wH?na3_K;P=ZE)LQmQB zCd;NZ6QoHUgL#^wV|{@XE(2bT8(dY3g(z{vDDxK+5Ib)$5)+Skg>$qAVwAv530W;~ zK}3>n6N6jerXJ23mJkGonX$ z1bjzDg2nw5z~G8F(;$fgzzn6w2WV3$s^Ki_jpVVQG-jL3WodG0W%;b47%x>;DYt@o zg4GHU%H~3CG+!*hsodw1wFUiW)xsiI7EEe0FfMIDuvdoz=ip>|bXgqk4t7B}WD43> zYi@#P#`rRMp}m(IhS<8KS0)D7XgJ1GXX>gr8fa~(kEkE3>lh#tkAK6lOS2D1t63V+ zpjr_dYGZ&hVOWb z{C5eAu?VC8D==gMUbVm|PGp%_V7yTY3^E1T=+i$?0)qks0)vSM5E$Gs&;nz(5*Vuf zAH)9-_4Q?!5Ev|-Kwzl!8QgTk1URV4YEr75ooi`l?IV*()}TjoxaG;{69z_WtKB>b zR@qB7Vq3Tgf+6YWV5fm58w!;QQr$+^oJ727hD+>C$*t312J_OuBKrtN0YB;n0Z~v! z@@*8+&wG31doSWg(+xHsWiXp@w`7LU!GwsBlsZ&^rEyN;)DJ@iOdL>Z%0L-^RKj5F z&P*Fx8;oJjYG8x0DD}Yf03?)@Nj8B&K%~4V!GQCqc1!RXICzr%1btSW1Wou`Dl5Ru zaQ33<^}FC!P^8~qwg&b%YSoKq8L_G8E*= z21@5LLD8s-&gBS!N?FmUKODBzv-u}eg3^cr<3X`bMWm*Y5!Ee|Yh|b7&=`>02Qhh? zSz>Tb#7WxPA}7%Vkc%T2Dll|s=t5a*hAw~=)zHvI1ThMt%RW$OlbgK+cDWe75uROW zr!FBuYU3h!in(dRLzgA&q|$1Orp411G(so5g#&RggCJv$hnyi*F(6jF5#6?-dym^F zAA`B|6hATL#XTtWsveSfFl9yrm|m^n4ILwQfxd+v27w97>W{hsz^01`U@c`LmUNj{ zDfg0?&WzYFL){{u6qoF7@SLh;90{QX7+4Tn3BFOTw~XJ+s9S${&o8Jhct}VE05w4R znmtGZ@PL*(+Bv~7k3=OI0H>*AU&MT+T-GP<335e3TAP)rTtIRp7#r;(f^&2k{3`)3 za*X(bBG`sU1A#8kaomNGZ47bg-F#CNiyL!P%spfVW7TfMBV~_@@w5iy)-sWn8t*2C z3@VBS#ZNBimvNwhm%oc@)%bM4&NNnq?6fkAcN#VIMJ!vi&u7jhfCnk;M4(CB@T5Lw z+o^Mble8&*Wr;ITNO49b#j3Di623T4iT4y>9GJ#yqIXFjRwzoAV|{8ZREg|{c;i(` zBC#V;s^qs77`5$t2|{B<)mJqj$xU2ZqxA4Te!)!0-bR`#u%;cmdx>35%*--qt=Md- zKWB^gy;zs@+V@E+L}SM@UpX5$65yOm@;SUMLPE&a@&A3s+9B&&na}gRA5q zT*F}QdFxwl(udPI%qz*6vf72N%O8np@8z(Ti|O#ts=rHIOfUH8`*A+RnTLDZhA>cf zsyV&3Y22IDh5Q0IHqA$7orV{3fAE5%^c^nw^=CQw+}*qvRD=FhdhJEy&a5#f{{-vL zuJ;=HbB{agU{>n#0Q*%BLIfg7o>Lx z#A_rKTX2boa59h_8#2N7yV!#dOa4G^iXc$SXkXnKUyAlbd;pk{6?6-jsT0@95ESK@ zDx#Z`XAaObDIipUguerdyH<1j#U%82nF&buw5$C$qhEQKvyHvbXk^LnF?k^mI1rf ztL!5hLp4z@dNEpQ7`38nqk?2zIf4<9a?v~HyT*}cqPZGVlO8c~y{ej(tqo0h9;l|c zZh!lh#2E;YdYJ1T8Kl?gdE=sBp;HyoRI5WkNIe#`5j$97B_U3XjcN>D=v$Da2@Q#T$X$hNoMR6EV}|X5tLcBRE$I`s^9`tyhp?yn>A328}au z>(MmyxB@*og-)44a50Vs)jGHT_BZOE=XW`KK*qdD_#k}O;$TloU5RHm_;`xiV*VE~2)}CEe>W9&cq5;EWKs(@$fA9? 
z)=cidcY;oJZ(=9Wf@@rcrvu?l>9q?osd4A*ap#?PaU|8x*(Lpa4gGljU7U_|H_RX` zvEcf>PT|Tnlw(xFMz~g=kH>ckN+Q_Os!v+ig}Vqt_)q9lEDLm?fD!`!EUxWB0;oa1 z({L@9x^)hhpb!wRZt6G8*J&H+yo1x$%H_jXvfW53H;QV6#0gG($>|p;7NXiZ)^+kP zRJJYvg$bwynGmWAF*W@9%Rl#mBY*F9Z=HoQ(uAABJ>&Q^hu_H=4u1t|)9MlHs~YyC z^VblNsId@~ml~sOi?bAHtLYK+vJ6inOD4el@Oc5s#?+D}JAl+?;5l#3HUlkduO0PZgV499F zK*ieZIIyU&fXX$WO^f%fpm-|A^)zbXoFha57-cEe z0g_sOpw-S6OXqq&>!CW(eybPYxBZQn!J{?_3(yn^Uhj*tBq^Bx$2vcPSqbNf}x z>k}=HLCfJEp&7@1>2@VBN5fY>H$I*pI*j63hFhi@(FEaUrI>`{Yfr3Roxi-Qc;&1Na$b5Je-s_LX zVa>OM*v1;a9lWIoYD@S{K7#@9)qZk^5905Lk<5%B&(8@yC~Ornof< z{gqBu;gFNV6zYWqRZ(JIg#14ALx{6jN?f60_EaP+{-E8E2nZM=I5TB(Zj6*q9)Cq8w|hybhRzK#-|(;?=jaA{ro(BYJI$_SGa@USR__xc<1+K zaw4%<64`q!6zzxpIs=yliFSA8n@@X4v5K@RS{4=G%8a_^n>I^tc5>=b};1X5q zBHhK4E{&?4#;6)=QP8&$)K?W%Vlc~2BlBk5S!IN*>PHZM#JtT>*=<+I-HH=f4Hn);B&n#-V| z$4F|DugQJj{|Dk}lqi<8DUPQ-3V5SWoXyR$6 zN35@En2e{Pai`$l#M2tl7V$KhW8tS2cq)pg#hCxjCk&Qowz#d4u7^oOUIPoo#cZw zi|`qMb^T`?waw!N?*hwLjLeSm0Y_S)ByX<3?LU@q)te^9oWHpquX)276oLJz$(yrM zetg!Bm2_*H44>f|`A!^HYs0YO)9JX{p738`BZ;*}JNYALkpKI#77+2qRG`fr7;TYk zwkm4NGkPV#Iv|DrBv@ztCbCR9~KhMOZ{HMwr9k6_hck^Ngo*?o=494n!r7gCpfR4*&?04ci%8Lo( zVEh78tf-<7NtWKDRs-vF(g+wbQgsB3vkwO$>cq>0o}s5oC5^o-O`eCK|_!u;YHU2pmP@Ed#-BFG9Hp*w^)3bp%Wh zxh4D;bX~Lv&CjB<_FkgeEW!j4u)jmik_gz-*L6zA30K+ zNQjq15;(|qsMQ@^#Al@b1nM>e2o6NShR0Zl>jyE`X!u^>Ue|sM(pN8o@xt$7U2Gd; z6nw!eY2M~{%*Wj}t8oWSmwf6Fm&CRVIR0mDz#i|t*u0v4TQz~pccqoTyJr^)D`}+q zpIL`Gm$FXz*K3^ocq-kok~qvSAE-!o^suYK8EbISz&bCup$m^*VR=`RXuC*^!|@PQ z;Vo**q(z@E{yrqI)2b<_Y;i9HB)op3kSSkM!0v^Lkl=zMs8?7K zJz1F9_>k5Hk-zQzT^Q4A$PJ`BKaRe=!ubQ-o+&I}!|+Zaw+`Hec>#-C!b5&EKHelj zC)u09RsWyq0>`li-kx_8-!hD)-$rjfCU3wgC2!`-o7ysO{$Ad2fr@*p#2b#cyc2&5 zT+PJwHN5O7_<9!_mWg6KB40UWqW>fO%BhEsU}D!VTc&1lykpi_y5msLUlPT+)!3iv zM%tD^vjg=An1(+GToz8KL_Wgjg$?w*w!3H6wtfh4%iyYuDO!SGsf(oLi-ikc!F+Obx@Wehq-k(zO*jJ$^M}-qY)l#~7c(RbtL1s-5aeG>r!V^> zqo)XPp30mpXZQ?0B7S+Q6MR>s0ivy_zbD+$O$?u&IdymussKqge!Pag%$$8lzI8!N zJ|y3|U?>!Z4?lE)ZuAX-R%qtEG7&`2@K#bjOs1}s)NxqWz+g_H+VNKkK={TbhkItY z1&n4He1q3e3Ch{fy6p%=yR3!oi7P>Tqdu!!doOZqcLEXRQieZK&uvYsZfrX*;qw$v8Hu1Ar96P5{Tk}Tw?vdd2kew#3YN^y~> zb8dGR8S~tYLMy=~Zw3opLY`SQo#P^Vb$Uhk90JSyRG)L$g%_yFVojah_%D_2yqb6G zpf87MsvAQtH;}mMGi(Z4ilFXXiUFGR-b81XqZJkn46~E1TonYJ9goIHSCn6U4wSUP1O@S?rdZmJ(v!MfHOr{Rn>tyie5p_6fk(!#pz1mwP_Ul)_J0c^F|cw9tJ>dtk8* zRWWS~?ISacEhc@W^WVerp>I9>$hmqlg4Ub&8nm7Y`p^-JZQ)yv#aw~~7083*dXkFH zU{MmKk{Z4*Y(;pc97g8}M!yMi0+7J9L2B!jNZzjkNw+Z<3kb!X;#i$fe=Wi7tv& zJudj9ViHwP14yg-U0(z1M5_KHT*h*$UTC6yM1*Z{BiqY2RU*2WM9()sfGWlhE^`;LqJCRATA*CDXBUNhSX$h)AvE|FuNP|6spL{&1t@IU_IlR2CYZbA*N` zphX(KCDQQs2v+y2;R!Z0yi7uA_#+5!k%pH6RVcCZmz>UPq~UA-)*Ak+O_)uQhVPcL zG8+CR=!|t)AblbYPc$XYV*Q00z6#xyX!y!0X?T88Ui4{M4c`I2ur>Vse2&ynrLEy- zVAAFE(Sc}qznq5uO_qd&oss`Z8vdU08ve+Im`hT_f0ac^lxhus3-*o5Vf58$`1ek# z;Z-EfK6-ct^4H|t;b%+E9ro3W34L(27)}j+Ja_mYzliB{Z(pd-9d2(ul%{fUQZ_;8 z(RVUTP|aKn+~Wnuz`cJ~1witB@5cc|XCAUBhp@N1K9ybzTYuJG>kQ%gmbyN5-1WXO^y(PG3r|eM{Smhh2Dj$Ajq=?}qCs zq_KN_W9x$alM8X@<~Z+4S4UaHM2ifcrf>dW!A8_V9;B-G} zpH*G&l z`>vW@?o0a_;E*JoWyIP|?H};WLI{5Rj z7luulby!#2T>D_UgOQkth_oz)^-MrvZ%0RA_9a++Zy?xZq=@;N-|!gBn2753CaCT# zQr)F+IU=fCSWI>8eO+j>e^~~?lgv>2p!5L}9u7$l@VLsy)sbyb>RCuBb>{Uu-A@SN z;Z$|wxHk*C+oV*Flsd=;m(f;`T?sGv`Zw@fnr&9i*7Gp-QE{^?7H~pr`8zMTjK5`a zPJ7=yET=>*PwA^>i{%Dlf9$VyFuRl}ZxW(}bUi?VM>KpEu2PfC%o#bH&MxixMCn?8 zqwT=hX;q(CXVwV#ghA;Dqcorcb&m1?;a}31wI_h+VQKd((LHauf z;M;+%CG5ikrBxM=id{lGzKGX=`eh)ag zi~b8ax%3_aV-gHsgF05wp?yakr9Hoo@w{xXd5D=#a913kwICBr(#YvehUF@q9O!{WkaIS>#7`11r!x*}c zm&?Gx%yVRFn83emiQqaS=16pvZGgZ!0_{i!(K+QtvB=IsWVu8(0Rr2oYr0EcjZR$U@0>EzfPlE^)?i~W9C 
zNurL3SGBkifhyoCy544RZRDvx>z43Kh1Ppf0-@YU4Vp^P&z*=mG@Y zEF>DxxStRY`32N#dDQ@JZfeyJ0f z7gA501*L$0#LNeubz`^5xu+u0?w#ZW~UN*<93QjB1Rd07bKSpn)UA!V*{TNcxZ^b30FIF$8xD2#7~hp*WCDu z>`&qIC=UTmyYf$@D4q|7p~J&N_U>x?ju2)QL>~gK-0%qgy$eg>1pmpxkXOUsaohy!ftd9p_Ku%9(#H&b~h+2hg0s{`zb8(Nyb9+CX8445PdD|=GNz) z(LxOCmJnii10hE6EFsnsT+iVcl0=5N8X9zf?{wXzV(Fy5VYRarj2*ffEfMUPJ-&4Fkk_#y~ zFcs@X=mAWluW;}W3un$`w{{A&`hVY6gqoDeJnX15c?fiCZW{f3n4cX)?yWW zJvkbQ%R4qkUsG!dTEPd0)m&GuwnEw~kE1IR%}B;QCu)y*~#LSK8xjLdz67 zAjP77zz+`3_drUIpzmN2QqP`YEo^$`e(8YxWfVx7)f9%&PDOkhl2Mrs>NKJb+YF%t zH|ZPTc#Pms<6>2>^v5G_0)RMHvvK6Dy_Yz(Z+Gb!v56N4l zhq$GU!wNyIn_v#mDfYHs6$Q}LmTQLE234R zhonD$NpDuqdPZ&-@|*~@+D}=M?7=w&3N$)pxA~S04C=AnZ6<&yBb75drF-d(noCFXO4)~LH z>u5&3Sz61dLecP@;;`-vKcSlfWMup?{Z1~P=5P6OfFgay3c`zaVa@aVQt8VwK$><& zhmA7|I2WrVv5}MLq3HAk=2NCN3?rUD&ikz(im>U_I#b^-7LS4Y9OF$gMqkQnXw5jS zmk=m09=*LnrCh*^-ZtqTFb0gE#2gR*hw<_BS})Tqxo`0-zl?Gv`ripKk?1DdA(zl; z-iD-3Mr>03cXweGmHl@j-;?Rx0&Y+v%=bF!W_&sx`F5u(ujJe@%qTb+-6Hc2WD+ic ze1xyP)S@PKQ8jXhMO5uOYG^u@#vRLBHzlFRbf>!MM%-XTAltAK`-?!PJLK-Jk|-o3 z0t{qMI(9A?$f%O2kWPBmS+v0&WHB-1kuaOnIk)Q2x`|Vg8M&nl8o`GgCx}xj7oSM5 z@Z`pBJwuhk)%4Bj{H$TGmKe|pRWNE2p~@^wU9z7L+yF#Tvr=lr=iiL2OOnmE|7Zq;8g+Fr2A) z936lI;3L`rHEra;JB^)WgJLEf-uqS~_VsGv6hMgFxrZCx~dOdu9{C!bDJ0G2t#1!~AOmHDQMkLQ2&d z%7WV0CaCEtDgx6+aZQLG&PLE7ekvX+hKFGAOflFwliz~2gPm8>z<}nlgPs5HDek#fxdK{N$YCTQ~uJrgvZ9Q(lE>YzLaCkzMr*;Y@m8$Y}303Y;T;mub zR8_UyChbvV;wEO;ZnzNjI#DIOJM3#sIH@%r#c`03)jd27SO3hs@bAfKRVr4}52c6> zW$$@C;xq#n?BJ9jpm|_hl%j2#BTvDBQ6v$2cMmkmMkhEbCOmFUr8<6!ZLO(*l~Y;_(6f}7*<_9GE~2>tB|Z&oHvtx6GK zI7yByU4I>O<)6~I^1AT*%-)ksx--#45=c7COQan z>JK+A!boEAibj+DCKCiu_ln`I*x|U6DzU?NeV1SwTX}%eL_$Zywq%EKU$tTYMt#Ao zjpE}vN+RFY_TlH#4e|NP5$>0o9~zP@K(IO7R7m@78!|_K_S}N}@1kDK0d` zV1a0C380Z0O$^+{=L=LJYQ=*sAq&x|D;kpJY|_y&Q&^bM9K^rCYl-h+cL5-nJs&(privPa z41fXR%>_;|$#os_T`1L?xQ_|hV}V-K3E_g6kKnRuM!YLg+i1c9jJD0qe2IkwBiRTr zT|P@w3=D`Bo3Nd%L>q1sFpV%v*}K7Vsz%m}eKhJFA=9W-0AqQh<`6k3i%`VrB*!h` zz0CG$?-BCjsz6Jz_?MCdB=-N^Jet(rOCp|#z?TYf;-(0wzD=a?)xb_ns)k{?#zTgB zqXkxen_9rFj-M35jqC_{Q;?#73zZ-;Ey|*(Y?B_9G!;sT7+l}NXmlm&Nrb9GSPXpu z`?6CUiI^gCD5$#);awDm0;_Kt|F2ifJ9>1_N8a@s?E_TwAdysT$Xja(XrD*@CKt5Tb zTpx4@{xD0bb~7VPDUZe05y1~R2kkaNL;&o`+Jc*+&dAj(r7=o{@9+S0{7{gmph;?w za?m7u{Gq})M69faICG%i1Aq#=Nonf10G&2f9Ha1}?C^jn2~#K3<}?`F>Faj=sq_uHrxQ75-u!~x_K7yWv4!X<4K z%>R)@`8gbo5cLExgIlQ~;tceB`+q(xZ#8djp#ThUi6ZV#*5F?&>KZ+53!eugXwtbH zrSfw$+=5aKol2%L8_!Pe8P|WBgQNv22rYrq5X%J^jNR@!N)4)DhTaq6Tq17)GcK#Z z2zp`D1{%yupb?UN;U(46kI|EGos_;&_y0|l$_2=}=NlJ9C=lNJ9-fu$Mri)<1@<8; zH$5KrDvaR3QdNkNW;dfERl4Hn>*dJ+e-)n0_rf@X`Oy$ugRihIj3+Fvl4CuEw+aS# zudhvanB?dG z1<{HVrB~{o(H}b0ogHB(9na1oCHQ{1>jM|3&Bd3ob}pnXMy3{<8(>YErNxhKEJSV? zrpPIPo_DXWZv8i0)SiDa)jB)>;z6w+6fi~}$0Z)83Z=>=9u3D4{5xejauu;yOaf|n z=55HcttP7^njGd8H&G#BU?6-qn>-#tunj)y@Uo!(G|U@U%EG_) zf}gJe+B|>634X4O{xFfV6f$XIyfJ;*rzpHYOy8bGVuFqkX@33d_yp?4z2vXw&(OL3 zU-UocaD6gGuFBXJo{pBcH_B)tH_t^|ZZIuF2cjLN_2>0JyLA)3ut16axvjH4&)=)? zyYpwCyMw>4#_tQB`{DxrW=#Hr&!2ETf3L>xOF#Dd6Z!i({Jvu8^zZTa4fuWI@A8N7 z_f7cyw%-n&=3<4|zoBnsVHM?fH1(-3AhhwvPSl>pAEWqz_yT2IbTY~bE?TD4JuYad zZA1m$Dg)g+;NmOoOB47-#KA?arR-+pR|c`lR2lv$RyyH;>?OfUmZ2C>OeBM@hI(X_ z1bb(6Iv6CGkH2pF%j4)Yefb~o&;q;b^OOPktzAyw9l4k{7X&Bb3JDI(011L4fCV9l zbjMyaP)r3=;um<0+xh3rSTP5QhFv(-*PFuz*--O}>ni!KucPvp_|t~JH{&m#Z_$5m z)qmIOzrWIdrEk;|?maH3e7YLbO0WIRiYt!%I;yPbIFjckh`YP5{`h0~n>qKE+un-5 zb>BJVh)?71d+xjY`xoJF{Wl+e`U?C#^5{=L*oD9Ix4*FOcKluN=ubXS!`m?$8_^L1 z_6{Z83O?8qMkc$2b5sow1Z(ggtr%F)(C~Z0^#_9=;iuHF1gPE;3R zNW}PCPXfl@BT2mYJD-G$zY9pTYK0*0*eUs+6s-4QD|1`h6f7yX`QM@tyUDL zvZcUy6yh8qq;YCUe5cm@~a@N-ER=QV`y2&Lu|Fg}UcR;__Ef|34T8SnsB=eZP? 
zIMjm)TkJ;L6|))*U#f(Fc>^b7KA1ZiR*JYn=A1ZZyQ?6hfDE+`kV1RHZd7Qxvlgv} zH$xMHvWMqFvlL$)b}?ipKQo03L2#8sjnln#hBFUd(w|)aah#@VQX#{1^LEBPuR!qoN5 zYb6{AY>(&i6JIsazJuaI`*V;*g_%u0KgD@ay7O(Yjl?(wo_^ip|2996@@Hb(H{Ed* zNjCUW7E&mwcO~0-vx`rf;B=}xceC3)x-i}OCaK=3^eVy)Jjzf|1Vw(+M2`(pzBmM6 z;f3Tu3LAqG^HDfl)l*%O^YAI88>(Ado&25h!M2LlQ+R#J z$7nvvIE5>uTw(%ZFkSVt=mt9Ykk~2Z#%9BR;cVsE9Ey#?Yu11Ua7W62HQf!@{Kjk&ENtgk^8 zT+fDl_0EfD^X>RQdFkf8VC~2eOkUW<6P3VB(S=><&McP^h^VS)U51lC>UhnnUJJpJ z)AFcu$P6dnHz9(R*ik_mpkP^yZKLLisH!l3ji4pn@kZ3xy{Muw-61Ojvy4NI=c=1d z@qloNu@yQp4oycq%Qtn9E^;cK?*_;)WcnFDJ6>1~UvWhd&qCHCgb@1tGzXck2(SBe zrCP4nm58zzmRDl0JD7bT&gsqIxu?{yMnJb*$);>mA|RN9I!{h_QjC240UdNaCU>_X zOpm*fX6J^3vr_r-nduH{JO7NzW^DdJWC2o3WKq}&Vl9dSR%*gyZi0cAq1qQ>8NuB= z2Z9fv(%|DmV%`G}!eB(kB_LKJV|XW>0HCOr_^byWg`oF^_uh!XpO%-m%B+JcIkmzW z75P8aw!X)(!EIH<9;Dl~>@ZjkqDK&5Uzj_KyKv79&lHn(d5zQyFGi(&OGWMQn9Pb_ zu^A5KC!|A#7_$izrv}BeQ`I>Egf0p*{r>#66he`+)1Ch*A_X@=L7w#I2M%gtn28c) z5$P=WMVEk9PN-~h)e$mx?k1>2EmT^t4&*7hVMqxSss=4ku-tsHKY_yNYmq>KB+s=p z6)H1EoGj*KTR)*fjGYg*ofu3~7D$K5FcVz~0@2IX7%;^MZ5TBIRIi z9ObSLL52V$NwMaokX))|y1yzZ+VI5`CEC2ylMro>W0@urZQlnhP1x_KEPw_f-N>^= z(#@uAnpt011=+G1hLs`P5gf=?D>~IB!Fn6vcIIwO6iDepQ6%3w5T+9O_63L+7o1HQ z=ECl$5Nj$NnxPbXpJBX6if#Uhazl+6``IU<4o=U^9>%`7_@`Qs{T%h0^0PCw!zsMr zG(?uYfWi0x6jTiFz(dk_=%tpBh0tFvJZn@UjY>87`->X4{LE(R!Ve4UA1mQYgrwAVcK|j z{wOT;T8sx#B7Po(pImeNeB&J?4kh9DJcmP&>K5cb*-Q6d_$m@@)Q&!(IaoutlHQCzIs9`L z7k(-$R1m`CnmG;qtsn4$i=bFesKq*uybjroIO#}x^8VI$xWO^7?7!W}g@STefm3+h z+i~YBQl>{9ar+kmDzXATPl~&RMq%=j_KSRzB>y2H!wW7v%*}5da@+R}9masj73_rf zp~DOw^coP0^jQ*o9QP|e%=8N6r~V0_VcpVnV$)*m#k+o&genw5uq;c7e@K2n(RviK zHdL$=QIcs*E0#j;>#gc-_||nnFs_((8c26b_21`l@t{^O-dL9VUFilMz}9E8a{%_>UCO# zrGW$hb!!oDTvo||XL*vlNCv=dpD}B21=>Uss|2P;WTSjr_=W=m>FC!EB)w6Z-%%$K zN4UBVIsR{*_7@mnzUW9{va85Gumy8(PZ&N3XJh5+lqC-GdMvERG)6|yxB?nAefjll z7JLb}E$^HrX{ye=v3usSaNZuLe?9FQ!J`*c9{CS*4uLMK75U6?n~O`8!-a4{PRTm${Tx7(kgY|Egkz6Wm`=#i>R6+@-s+DGbRu!1 z#6Vn~eE13jUIY?EC!eB#+!{od7Ff7x@rkbA?Ly2Jyi8W#3XL7LVOVksQ?hw`hw8zB zs+B&LitH=x0|{YhCBZTB%i-w8!Hz_uxQ1K8R@DtI@K)G+3Ah%(_TP^VveMp0b13h9ZMo zGNgkfZ9ZGZ4Av@e8d#8^C_xh4+a+Kb4fyE;f$9m^;>L7(EuflE336lDC_zqnDoT(G z^53rnAGido`YM&+N7_XRegJ%q&} zPz#_btHQ_rU4wccYV7k*Cbi*}suQX);jIAxGND2o!%K{q&)Y_S3(=ML$hyDLLhcM94xbTQU70srr)+B`mBGhf*6@%!DgLSVg*X?qI@Z z!p3^*9AqrD)=2JD_#On2l@*}EZXNl&3hVLUcy_sOy?q&$wY+P`VGRD z)T70_q-yH;qDbrOu})T|8!`gz^uxp6jKCMNJf%nU3ddh8+nFejy?CdYPsm+<6k&r> z*BmPl`!8zb&ilS{xe(6`Knf}WvF{9jIY$j9dlzb)e|Y5oXYW10qDZ<&?`DRqA_^*| z6~mfEf&p9uC<-Vlia8Akh$sRQ6m!4;tC-z&UDKL%)itkK%wb){oL9^!=Da5EIn}3U zYMLp<_kZ8-`R;RXKMzdzuTP&kbt-gKbyYWx^|7IX&1W$Xn863*l;f_kiGCjDm;G(v z^}6h7r9aM#)8EwPM(cxexiD~w&wLKREWOzog|}wFM|#q%j`-mSH#9jW;yIWsw5>WG z7WU2bPwQ;$pVrnMlFrW>+S2TN%Dd`t?g+Mo?BS7%6Ao&%>nHiJJm4dDKGIGzk7uvD z!II5W5v~dsus<3JQ;Clfiz@+ScUQ@Hf=Z*mY%i6>7cJ~|`+C%<*%9<{*py#vLrg01X1_ME=p@UP+Qfq8Gktbp3c`#bQA*oa z{`8xD_DMBx!lMt)QqWQzFzXOLID}_;2iULNT7g*!Ii;bg%uxDl{EUpjKYnQ~y(62zvBn!9jLgHlAof(}G3>JXrK(Z~vm1^- zi~eh}80x|_7+{m#fS-NC@roPS_p!oFQ>N6m2hD60<3@JQ8I#jj)njy~tJDtAvw{a) zc`MZmCK`QEuJs>`;|zx!bei={_GCaJ7}tI9NjF$A9Dr~7#A!I0@Y0PYShJ@O3CSav zrv`zpby8Q~Nx|mL(@wXQ-{b{TD)tAM!PDP{7Xs3DWy(`3*ozC{8GEbrIsq`B3Z_)R zTOZg|*2D6Q3VN^AL!b11WI5&4UL{-Y!hE$%Oa2aIqV|mN4caq(88eY-N2?mReUyUw5EP%H0{p=%) zb92)Uz!P}*LQodWr8M!mKzIeD@oWQ_Mv&A}nQq16$iFk?F{(|P3_-&_ZI0M3TN zlzZ4%dC~@gGai&Z03vvQ$cD#UtiR4rDT6_voKx`nh(|5cd>`Yp5`3;-6yCtKV<*lQ z$l0WincNxSVivZPViwC{CNCo~PH%Yap~=X#Hj13;#bl8C?`O|UHXXmI?I?b7hXegv9bF8jxI$7F zWa)}wVzX0zcybZ)BYRwmX}>L#`D_~wIu-W4lymC;8zaBF^c5rBc_SUvMzX51G&D@~ z9yb~*RFWb4&l)76@9nVxEtt*eYW(U&{Zm+&WrS|Qg!4~NaD^&Xc~_M@iIr9f$}=EP 
zipmhXAVqnN49f)B!<3dVGk1W|(WTOy@!7|Xj*8gu=d{X^`l%z_NJbomCyem6nQf9a z*z2_)KIpdJKb?K_&>Zd)@UZb9z5}ChK+64R+_0`Hv-`t$;n;`d@Kz~*_@$k4%YjS- z^`(5v!O=PVUfY;`SS18{JxM6e-tUugAOx$G6YSLkX$Nu+u!m5v!f?dV?Bmd;XkG0| zVlAKDXd}!(Xin*y` zu8(=DKIR=v#T@%!a?DW+#=Inpx$ujFc2nT%7@AnQ(aE2%T5HCk>Hm@-~5Wi>%FD+oR zsG4CL2UNQ5#VF|KLqy~H;zamtC0q{ADNf=-;VuDbxjCia4VY+>JW2IQNT3(2mM%PQ zA0T`rmXgE0O^lURws;uc#Ko+_FpP_S8e9Gb^O)q9h|(L~;F-@Pn4RwrTz)?`JpP2Y ztYou}4$jnvh8OBGAh;tyCyZujz_U_Za&9!sDSk4f8AdLch>k_Rs?u z-(ue=^T(yWLHHJzF7Q&gwHOGlov8*Ny}fN7X!a843s!??fLZXZGxoAdUn`d8o<=KD z^?p>x?itl!F#iZFhUSnGL3o7>or1~@&Uz|QgqXvF-(e6LCV-Q{SkzmkM-Pzv_uYk7 z)%Cz?@alrXM8NhNiPmFZe_SG%{e0dX`0c+gx~OpWNJe z=>?M5^x7slgUMb;{0l?vpKcBltg_5egpJZHhM^bJ3FEK&+o&a%m-}fb=q&fV2YvUY8P}t&tH}Ei*8^3rvZ6WbG3i z7YVZOja%ipuW>vV;jmSXo0{zVa@hTWjPL+EF+{yrC zD{u;R71?cy>msnVtZFEC(!e};97f%@{9kUEscHHe{X9>jv3PwrVNjduXKj{eFT(au zc>(ibO<0;IDAOv%A>y0>=N<kyc)My-kdUqG?@tYr*3FFogleYS{8F^MdTVeJSsA zld^F2nSL@VnML8k4N7)pKAu3O9E4IPQpB|6T!oG&%t(PD5`VAHu#5rvs_LldJ7$ox z(p$lDq!G}T%1LUUzh_{Ox-f6$Kpw4f@KBauMHibS`ZjT$f4UD$5-pDX9-1JEpQd@1 zbA>%%=BK4VK2IO!%u1InPU|qk$${|g!(;d@4O|aqbChNw@K@cFD^w*B&KgcXMJkkP>W&x#}Cug>v(~xq3%2Y)75Dj}m z!{!Le69fa&?sJ0pzC^5TaM~~yF4ZYm-OZjK;3-3ru)dR?qlnk-L^n)qw8J~6kdjIP zUat~M3R4NI|Em54u&oXMT=5^YW(@_6&7i|~_NE+!nJ)TCh-}y!$_VrDbAOwE4gPww z50+cu^r){Fn7dp|m0<;C5;xU}EYfoI_&rSS=i?8}br{#B0SKdfE{|mAJFh#l;dO&8 zhBvGj)f;lJXd$$l4OSQilhM$ejf&I#z1|icHw+tFg5m)mkon(G_vW`kHndLjlo)2m zcEblSO{x60y4RqKRn0~|)R*Qjhsc@)lR8+MLnqjIQb9|984`%-RN?YpUk zF@8t?vky~)p*aIH*Hhl*CSBG?+;azBC40$4g=u{E6B?>`$&;UJ2e&LkK4E!IUzELa z55NK@)BN*Sqm#6Faal<;%)$& zY0h4;rDu@1){0rstf?s%HOC35juQ)SJ((If(Jbxsz6bWOquKY+DYe5yEe&Z}CtW@O z)mmabb_IMn(|khVtsN$q?YjldqQD87eK+lGj{DlcNPXZ0?Y^5%HpkN(C+e>~#jfol zKfjn{ej(KgUop`ZvLki|>{HunFgK|_jz?a?q{5vnWk*hXAkP>9=}VAF-W6eJ=V03ohj++0woJ{?2%;tHHQ}zq}PU`71j?%1#%y1CH~?p&_^q zO6K_7#42)OhcXCntSM;4iqJmg)6fuQy50OEzd{!tG{HB%6pvv&syoB&vE_P`RI0^vEc$IAZ7 z00k~;I*yAP;S4s6;R_oOB-b`(x}cH@gw=I>;hV+KJ%S}{`^{3>7rWu=7uY>A%Bpif z0}Q0V!(4Q=Do5sZKG>*?1XD4Q^JpBEHOQlF(kr>Zqh`pWg&>OtLfA_iYiAW!o`-OL%_!bQEhZmh( zmtQ3h=@|rXRy3K4_>P!3c~B3!FiXU zKo7PDPC~OG^RZEG=RkNyQWf7m0#ha+HwTy<40wA3OJ&>i=tf!SVpZM+62kHg?DS8bCbdE*}RZdKhhe^>wsIr$LeWL#ke**Pf32LpipK$u_2 z=it+F4tAAuFy=p_91J>G2mdqX;Es@k?Tm8pL(IW&_uJzP7iN##^EsET0z`tDH#-_g zo(7M;Hq$>{g4Q)83e*=cTg`YyG;0%OKL<6~UOC|J-mF&JPR-i~2Xt#*%Qv_Kb0h@x z|F-{QJ%A5IVZ1!|)1M!6E&PYAuwCEG!G3*0pK0$K1TSxM_5Ok8NfYiIE0lJA*?9Nh z3NNeI9QmtjAM4bhO7^KECzmd?xbL*9x7t3tz-B;IAEKd+fGOITMtb5;NjotX1Ln{45k^#5%8R{OHUe;iV}-gDRL z-A<1D{ra|-ZwD7U+~D}i;=gJ>CBCcOZ+}dgxR5@7)~gp1TDsEjOH1RVl>h4XDHl`B zW{^fIc(8!cT2k2)>>xs+9pnRiCNKhWzhRPGB~G7cnEmsUYS%U%U)Oy}>yxW{KAd^GOirH<>kC(SVcp`w-N3c| z-fVm{Jm6#hMNPZRI{VA_=Vtq#%FJHry<=YKeQEzJy8D-dr^_Fs#-(0)YFDJej&$(P zhetKLO_;vzy0mY~l^PYFPtC9^?|1z9;fDLpoA(&D*ROg7zplD6cgH=qhnwKHZKvn< zSk|*k?Zzo%_DyJ1epXZW`n@L|^{wjZ)Y!h{r|N5GY=1U*88xCVtxacw`N` z8{G!I&&};^mh>S-@_#sNYLR*mx>jgdy3_fjqpS8Uyjdh=JAW;*uYA{p!?tfe ztl8~w`L^Am+y)TDhCZ&+qwxEZ<6oAim&I=B)p*4!o6n0Jj*Uxm7_~O}gYC^C?Way} zY&XBD#l6)Bp}e&YK9w68SR(i5;IiKrn3$9BVs9~^y?V99txg?>^tF<%eY{@3*pH>mekvH_9riqIm1UWGV^)q0 zd{xLSXX@y}m7BLM(c@)g?ca9CE&YCT6^~nn!;yP{k4xQO z{?PA}H7owES9N!{Pb=&_j@TX;*cIdp+OAu@pq#x_@aoS)hLoLE`@@DZF+NY5TK#nP zRVR!6x4fl4fAAV}H#B*^W47mV$UiUsXxDMe^e4Fsn@t{G|8l`4b+;^D+RVJE@6Bnq zI<^>dwomCg!;_>%K29`Op+I!0FaYH+Hv?+Pucb}=Y79}oih#htG-1zWI zT{FkO8?rej>WpTzdt#|6CBHSkS=Qro+HD)TI&tQlke8o2lyNy+{QlwEqmHilyrA5N zb(PF|Oz_PP+w*Q?x0fTk4IZ`G+V<__l-)HP?v$)KIAu-LMYHBE`x75rXtXjnef{A3 z2d3;=(({yy`Tn{YdxNY;J*ehol{4pc)gDvLB8p$G@T`Qn=H8L0l&TI1?iXT<`E8!^ zxK{6Si}uGqsb2l=o!l|wJ`|l}y)bm!cjI#szc=qRqv((ai?2@k?Y_;b*>@t}Zfic~ 
zhsCvS4A^VuR<(Z5Q^ohas@L0hPm!kW9bh&=gS+E3@BS%V(AsSG8}Hj!7o^@dvvjN6 zu70}$zxy63(Rt1VO=O3MLmq5)E_!DDrGH_f=Bk(P|6F?gzTXcgKi{bQV`Nb7(2^_X zMpg~n*d^1t^Pca1oYtpOjZtPhG75jIKcl)lf(FaMb3;CLKl9|}lCqJxu?~kkR*u?T zu*2UTj+MLC>^rW8u5fdIj~TP~Nlsdm^|9twO zJKxQjWcy)z-HvS=_6sd?`ReQhtuFRTM338t0(CFCR6e}q^Npa!gRDALTz1W)U*pgH zhrNBew(vZQxK{(VKB&<&^6ZkX`xaNKU%lYzp?i83X`Oo4C8A~6)>ox2ZQD5Mv$@Z_ z&z%;p*l#vrNzxBb-wk`w#Om0f#Cns`bZu%+?(gDX$?2#AJbe>$xZ}?F>uLKd%_%#lgiftPLW-6pH=OgYP;;3dtgFTvG^tBK5Vj#csQ{4p9iPJpZulciazHa zZw=IipBq!K`<+SM?VkVQ<2odJ!|XfZS90%o{Q9|Z-M}=wr8S6HW{JGAWg{wFF9GEz0+JZFgz3w@?d#pHdd9QZn`~IuyRtxO3 z<@!;L`II3Y8rHY5a_=DsP9Sm2h3zasIV+vyv@#UkXTicWGLq(30ay*akpQy&imA zaIndXb-9t|rFEXe`z;u>Pg^GAVWFA3hHSb0xMH^>p&gnRtk`40Sm&6(gR2kzuJDwx zxxcLi|(*{WXIChW#@Doai{$8b6daPy0P;K=kXQHcl@^gC~>ZFioJ&y*|uyFUxa$I_T#O&pj z!{RzN_t}4WZMtoPH$AG)XnNUU#CMfrZhMTZn{M4{-0;N~r=A7vxL+0)#{5%ma_!(F znJE_4_D*_P;H|^X-GA(z{i$;gs}dc0E$_Od>VN|E-fTXZ;uSn#$QbC2{c(Dl>FB;VXjp!N_WG{UMci!NmW@qwyz)AuJ8S0jpdo{%&1h3^OS?b%j%*P%zHpt_Ub9w?SQ_5? z&e+k@-@h8tE$ZHq8s}|0=nAzNSIu#{$HxBcmb7b<*(WC0yVJ-njmFklv1-Yg3GeH7 zeEfXvdAn6J6MFS?pEu93)QqAH*YBy(qD0Fow(nohz4kb#UiR)O9p|=K*S6I2cMHOM zUu#$+;-)m_uPf8{_uk#xsdCNdX~(x6vuL|!`>MjlXPMP}KcsKBM&bSD-)xb3y?YFWQ5nQP3&bz(SHXgWnq-cr8KNgs_x7U_~W;04X8hq{ZtJb9++F!LQ*Sc$|gFR1w zC^S8|rN;}u*`7-Wb}2q*p^wJ-V{0N&vFe@U!(&x#|ooAlA zMJ_7cWJ%<$pWj*O?8=w4saC@N=b$+w_AReIv3jTQqQ!$xJkbQb&K-LD4=c~Ud+sJr zOX{%l#E(~t)!Ke=yF=M6rzZ}uZ~pK4PlFfr-oD7;%G1J$tCss7y3lxXr`G!#J4~83 z>_C&v#iw9@l!K$pLMLip=GrnX0D41 zDsyE(k><{i0%y23e|+-!n8z1VJgh%HTIjoc|M)r2JO-aV(V<4mru#f6ep+~?#HaV0 z+^ekku&8L?R(I>Qt(SYNqD7yE`>GZ8d6qo3$ccd4AI#lrPpvTVTu{4z=8pUR?6DjB zi_N=GV^y;|!+du6Cf6)e(AVnKxOSSN!z}*lw=QB-mvYYD?cAUGItE;cD|&zSzaN_R zYZf13dF8~${!WG4PP0xdmht9sSi|iO&LtiHt}tQxht1cIH*Ojp*s}76w^gcD%qg(H zQG<7HTL0^s`f~KeO6MN0%h@^Wd@b)2mQ$N-Y*%DZxpSLJc5W6?X_fo-nuk-f>%9uU zbGY-{-&V!l7!%mFd-@WOD9b*Z>&K6ZE%^56pyWjBhEdN(pZd4J_+u3+?F-s|W#F_a z75^Ak=0fjXX%&Z`zjEnT|6LV+b6h zd!$UgSuf|VKJ5Og%Kn>9PxkMuv(xR+j+*ubt@f@NcFb|mFBvnYZ;uaq7kZ#cUB{2J zCwH$MFtK{HWsT!SCPwY;KD9*g?5tPME?!$+w2XuF$IGowtta35JoD%B?@O*LIjY6; z@l~6cZ_0g9=i#-A9i)uhGhMo$eznKh)_ISE&7|E|dw7mdOIzLN@{xvjhBRzY@$XkN zZ~k0tZN)yVzk6DL!mzQ0&OE8LvB=tS6Ms20%*Lu!p9203x-?yOxbNM{8Bqg2F5hyg zjoZ`3-S0osw9A;@q5BZ2`iQt+Vs2FG7;^EuV&klQs=SS>TE0y{dDo@8Yv29#_eO!Y zXASB6x^l?wr`5AR&o@tNA5=B@`jv>BQf3d`&ds!I-g?D_R)e2R8^2+%&yQuU3=Mzv z{ifiwQiXerzEiqovXjn;OAW*4n+W`LlApSGQ(fAF%u4iQned^k267 zRfOd2OoQOsN#dRC%Z)DHX9mt zV_B_6g}iDOzWDFWIX!M<{=6}MS=5|Mxu3JYuYDwUaM9%}Eei)cdo^UtC)d#{XT2M5 zA3kYA2iGe_BMPNjM2uQ?>$!9%+^O8$9i7H) z{$bO-H+xf)|C&&E$@cNZZaw+j@#(dB%PrO{yD|6rFZXP$2EN|g`M}*m8|_Q=U$?oC zZTjDvHbw62bgS*g`E5FPI``ucpHD%$>CMcO!Y2Lmw$)^tBh@aC7#X_$UQ$AE({uG> zJ}>Ye>Tu8E#2-7Gw+&!u)P%Zx3%x7Cfd7Q+UmM+8;B8h;+{An&K| z{n79D@cJniq8{Yf*jIOq^J!6gkL%UdW^Fh09OxZ8zTY~ha=P+oj}P3^Y=5n|GEYM` zpD1AebKAD5wGwN1F7b44T=Qz`)RRqqYf|`5?&2A9{#cx#zP(Dx?WV6>)FyIN#QSXy zAEB}tG$Pw`Zc>X@zir*=^V5u>E?eK!Xz~7f-{OJQPMvnK`zg=$-6Y$H=N*6fvEhSL zGfr=cnB#NsW#ht+&#r5Fa7Lq-E z*X*}C9hW(Lr0%H0yn}P+raUU}^p$UHvPV?PgZ`28Chfgo`Qc>i)vMQ@P1;xE(!SCm zMe%xm>(UX1+j8HY`jory$Je>K*UlRzZn2s@!Ry29s`IM!*imt3pPr>QHoD&BOr6}n z+D*02i6vc-EpwZr6pEFZ)}etq?%&=kA|# zHA{B~TU`3h#`=<%hyRHA)q+<%PH0i0PMJF8ZoGPKe%faFy+O9;x6TfX_Pd*&kurbg zs??mwTC+=T8Q*xw+|{>_&$v5vPM-0jZ|yIr?f-wr|95*pnf)d5!tbkD?uCiX!5cp3 z{{CI1H|yUlI^!C-x9Iqn<5~?4zqqDVv#l#ix0-uzl-I)|W~J16HqL(sOL-qwSZeDz z@P_-*n3&~T8hOtu9kU~0){5YUdz0$e*7S*5cDB*!w$~DSUF}iyTjkZ^eeL*L$CK=B zXK5Bseet@~Pn#!J{_uL!=)2V$Z<)FBw+HJQmYV;|tU3Mbg*k=&85w(VS5UJV)5lm? 
zzNxq%DPz^L10jF!nty1~YRlvax12_9_#twW&-fEtOO^H;U+jh5=>@C}RFhmOry9$2S!%Tc;xgUA0md*aGE6MMAx{_NLmOM%=S-5p1i zZ1aAu^8jt*f)y7unegh8ndLa!6FcfvPRSY<>sP(r(^@$XZrUx3Xlow6;`ndp{o9_m z`)}6||84)Y3=9Z)XPXe%ZC<@z<=zkP+AYj>b@cdq_jLh9} zU8j%zuf)`^I=-yc-2)vvun8c%hHrhI%3q87#R~?1nl^D$kJq__eu?~8B6;6$KR6zl z-L{zD{mhqt_gnYO`_6>US1n9W zxR_RbpYM|HFOL^35!~vhL8WsWy7~@&@c4}H|2Y4Dt9=72+%I^3=9(Q-J~<636;@&L z$?vT?JLtMzpViJgs9Kjb?J~MbYnx0iW7lu7b9T_&=TLx=+!2pPF277h`Q&Be&4!h()5rz%WWmUX3ZKnSM`nP z9Tghm?B?#`?&4a@ts3n~625*PH?yJbC9`CEv(g2vtfxUE$R1c3c1DuI=ocBzQ^Bp8 z{KMpYX}=rC*yJ`i#?~Tkcx^P>gU=YyFS60JugH1A@hoB|@VBX*{S-}8JEwGQQ#oAK6XwiT0gsNt@e_c zof9A*l=X+xA4&Ef`~@XbF`OwFPP%zhyQ-5hoC%gbcAhC#zINW4QMPuTK)?c!&N zKth}SF+A85o6}L(WGxst0c}pP@U?T*OtNJrgGI8*a4p(Aj;=+U(2mJy^8~y$#a!df zY-aJpv|3NHI15$swX^RAk!c9KM~U}G?kDtFM`NG)95{U9?~|z48mB41)*SzVUI^D3 zZF43*6S>;9Ne0{M!0AAe#Z`nTYtjc>ja`6o9!J+rw2`m-LOS@_t+&$lw=tCva++q6 zuA%wV)k~lY<6cG!|6pT2RmVNd5O% zdspLd<#cEHje2-K_zQM_(f3(}wBMVHeIH1=W@BFbsxC9_5Hr)d(4TWi7ppVG{_J5K zFG;Ed*Pf$m|A2k24)C>e)+|?Q0QB8QlKW4}3|`-mW)J-*zQ^)1oURXnePMZNo434d zrt3T~?Z4*dX;#`2TGM`J=}#?;1^G#{XBzETL7JhiK+^RN%i!1ST4brcrZH_7`eqL4 z>InNn-}oB)#uY>}>AEX)9bcFF*=1-n$L*Y#DPP%j3-8MM!ncqCo}aWzrb#c4Vj+lP5Gj;@Qx>kRiNU%Mbpl#(~mmvcz2 z)7R4l?oHZ8f@EQ~-XUGR~-Zx|+}6ao7B0h`b}5o<*{lH}PH^40#THvWEC(QOvHuS)_{%9DiAtpWQT#=8iyDRnV11x(1k?~nr>ViP_`gj z6#`%AHX++eCF_|_*+9@0PO@e5lg3#Z?W3=x@n+KZ^ViD*)Cd~wurHJciJsv9ogZBp z7TOJ8&=p9!PUT0}A`9)NFX);>x(??@*J!PF^%r#AAzfAS(=Gvj6%e#bFmJlj47AAn z*fmX~ZS@7ahLf&!=Aig%`Fv3k?Tjz9y^oTvCzyA?rYlRMIU^`D7)M8YR3b?|^OIkp zKjtH7U*dfygmfwOLY{gba;Ujxy~%oi9$g!kFJD_|JAA>{*`%v%esn>8G85?XhB(!s zr#h2iAAN25e&}bHsnI0nsrKO#Z}>|l9qq^-k&g9Rt(TbzC74dTNXN(gq!Z+yEI~S< zE^B(y7L^}e)6BK41-dZJJCLreurKxx^u<&Sg{mMvjjqLa_Q~~ivqF}wr)sB|r&~<6 zOtI3sX<&p(zSr#{+1Py9Y^4qUg3ZD?LBDvaGZNzo5$+8=kQ3%a5*S*4kwPU6_U)NEf^3f-jBQx-o{fF2-Rr z$+GwQey!e`W}#j9g>X-NL%8oqwq<_e3-y7EAY6=tFO9O_z<&AH;sCxoU}3uQMP1WK z7h5*;HC^j9n!$qRAnLkCx<=&Z{*q~>4Hev9s)DYn#o-@pBhg>19R=kwh%1*-T2?N- z;dBR*4gN~qmNm^x`-6$f4Q-!8I;Q8RzJ_~tlAyjuU3W;=k^Jb&a@77V(1mexEdl>v zOZiINGAnEQn}`}~S0|E=#;8bDJ_Z?z2OG;;M6!KR*3f>%u`Hd^0(XT|M@cRi-_t7xUUY()G2x zmf;}IYgwctKY6Vn%?9X;e)vLOt3z^8-OiK`9QY!=K5{D?5qzQhHEg*-_bSr-Sn@fIgA6e z{X%~AEe$$go50tNe48avt{XAjIFjv=&v2W4A>3Wx5NF2OBJEWx7nX=`+kUA$cBl;k9ve?ca3YHF51CjV3J5wdlJwbnT9>`wpr+jpn={ z4tVQ-N4k7qU((sxhr$>ehGAb8fh2k4=U%?ZQnOGH2I}fTy7JY3$*}xl|79iVa>|cg znOe;+U$E;H=^Buq`(cJw+fHyl#IX(U@6cP4)bZ>23hPd7-7k!7puTCOuXeuTX|A;} z6GRkrVLE4#K0ci>oOi^#=hN;VzF>DE=<{a}WF_gx{P=XVRCGpx05zc98@&>3{VcJu2#fqL6Ud*}{`csKN-%siFb-Is_bpu8dPvi4SIE?=Pz`6I`Y)D>kxho*y_H8!iuR6-z0 z_3$6)NP*!jjHf^RdD1zrujfH*PiV^u$~T5Tj&!m72$dj40{J}RL9j1rS}=~c?ajn{ zAZK=In%mmR?LmzDIXb_I&gF*Gvc#QUX?FWl}Zgxp==qJ*(7PE3E7F9EY^Yk zBwLvBL!LSio^U7UY1PU1=y7zd3x*Re=V5!;ok!dqIc&(*^RmPa60=4j*ih~gu_}zn z?2$?i{Unh9Uw&!=b|UVJG6p|EEZW$0@v#eu=Hid?{v_|GmLF}_#xB(iZuMrd$|V?X zvWak=flnhIM|x5;e%R&gL(|q+ncqw@EIN?al)B)A37r_$9RdGHT!&$S9t}(;!+hk3 zi4PmQN3!xYn3a3f?Md_GV)kKt{j^J+UX1w328m5Ge%a_dwL1I}A7xC*$xfQJZpGI4deon}HFOo`%P!HL`Mhj%0r_A^W3}3E>=;Td6w!gD-Ra2j#XQU8^p)9fB0d z?sR01kt9C-suGLD_`!y94T&k~hj}@OSZx%7I;9I0T8w8jo#TDS;xFLyOmNJzn~7Hv z zop(>HBvDRz&mQ>zaC*LQ-p&**A$Hx0m z&t?;PsscYjd^_wLmt}L!TBQ)fDU^LjvZXLgqjGPq$>7?En5Qc-M?oD4`zV{DX^k#t z^`|$TyMpJS>6)kA12wdnCM{29#C$xQuFVo(3y+Ye=D8Mqu#K)|LpfN^z#`0@^keltt022VWceg*j$b zzVNKmm2L{{^P>yqGSw64vIkR!ldi%T2iREp6XSi4vMY(xBTY7McNejuc(GhRvAPiD zFVQ((_akCz-R2&sK$3XfPQ*;NWixdXTzURf8b5k5$wqn@NDAj)(7%^NnTGZd;36ZdM2;9*w|33$Q!a1u6k9;rlJoktoHSL^RUyd^4wbkj7qv&KGc zH1X!hSsei$8!7h?(5}gXaFzh!o{f@3%ziL-G}Q*Ada+Wy7?VO?5>4U8D~kNk->g!8zeTn}Qs7?D{Vv4I>L zN9-q#%^^0HV{3?EUcwE-$WSr#!#QH={Q8JE@1H=BH*Wy{VEbJI|Gw5f$S~7Jec@Rc 
z>gz!I_UA_*j17gzf7^($qRjHRJT{H;=jfab?Bj+kn;7onhJO0U z>xFX4Zffjo-hWu#nM==e^PZ95^O<*Pm17#JkuXhAelq;x)=mCl zbrJG=bX<{s(RAcC6toZVAmS~FEApL*vuD1re}d%u6ZgWjf{m@4h2Np@hx*H!BZc>> z$v;vm{38EAT+uU|_z??P5bry6#Sz6KO{es_;KO`tz>Y7_*LSb)-qsgPGR>k@x9cjW9vtOnSmcs zzZcD)Q0&C#TWC)P$lw;j{=>Rxl3z;lO8IR@yfPJDh4&=Bl6WlX2`65Hp4I!1zp;;u z;d-;mSCYIb-6jzaagY^c!7*-&6s}}^t<1L&A6-a3R@TiP2afUWQeNh4Ehp?=CH}aA z%%kNW=`QJs!$Jw05=;)sccl!7>5bb{lFuq2ABV_4(r4nzdKiTlg7~35>;?t0djAto zs3a>;{8pLxE#itE7vfF7m*v^oN!YDVJgKV8**Zwr^(XE?_3$R84%zKb8 zlkiOoXlDrVE5vmq-@ru%HHr5H&eE}<3@AHF14zCP$x|{*gNY9#u8hNtCp{V9Kir(z ze@U7|^0!Dnp7=`Q9cs&ftuKV#9i(S8{NmuHiT}Ym1(L+pNC9t2d}N@khpl&l-JZmK+sK@) zU4q?c;4J@8y&WQ-W^2h=x=?+h@aZHUqT*{vfBzs^0b9=myIY8#X(w~GHVJl*5+B`3 z=1=4w=^XJGY+%5~)(pY#Kg9p)#;|ly{*eYky^r~7YIhm1xpA<2kK}WE$egXyf!!Cx z%TObFvV2MW;%praoHK(Wjr#x9$%1Tc4D8wxe;p=sw$251%MtHI8DXn@T53$( zv9Ao+dKB30M!a)BnX|Pgup3MK@Bo=_mw%)w#7D)*fUOaM-POd8#>t$m2Z7xT;sJ>= zXKOoP_X6 zJ>ZzHyjA7;2jZj4$?{5hhqt#fdDcjWQwk3vo>9VBzO%wrdO{WbD)|K9n7*kgvL3eP z0(K{oyffr4+(OuY=u<14+E)tyiFk&sEU56`h=-{7O5&qcdNvZ5RPuX?FQQI}qW=VN zv_CRU4wtR-fZf}~`%nfS0rd`UFNoisCLgmo@vv(F4F=Q`I!oqktp)5l5#Kyp=4|~0 z>{cP({AZc7`RTChMZDlIGG}wmVYe0WsdHuCSpJc8#Q!GVm-t}fb$^vXF!73xlwQOe zQM@LCJgdjx7q`(=Z_gth0`bNzgZ+murV!79`T#dY{t)pD((6omE(1sZ99k&Lvvmou zn@!wfsmv9BejvVKh0NJ{0@$^K^kaTrCG&gok5rbp-)b2s{;W;hZjHkjAsCNAxe1=+lI*u719Mw1?;9(YLdJ@&}*iXJRK z7_VFVWX|TP!)^)U?gwSg=A6TBZQ?5r%bd+8hh0D7NylZ*=8D5^DDm2-WbPyXNb$tu zh_iX$aBe*DCFf*8M|lg&Q{9zeWlwycNE4~E@##5?B5oXz!x-Co4& z(T$zW+Xdc-`0=N*Jez|HyF-b;qJ|qaE2VM7YrT-=ulk_-~9%rUSk`4&uF9`S@(j%$# zyb#D++L=x-M**)a;PnN(rGR%8@V){*RKQaO{8s^A1)P;fa_}_r^H$>0cQSC6s}<>h zK+jnLzb)Xe1iTQ`pJRKZ&Q3b z5PwL#GVxI2_O`MfFq>_IiGN2tfp{A6>BPGdpRedAUXS<|;?4zR{cOHEoI6asYeAU@ zlKc(g%ZLXNe@^^0@gBr&z-j1*c75s<;nvXBBTv+=aLU=?^3B zqmmy)yse7QB;HfS7ZdNV;+u#MA+C&T9U(qlC4YtZLgLgdk)9IwU=vJfE;+DI(ClYh2=V^Js}LVV ze1=MXD)G6*m3+H^_)6lwq-QIeqi4h~s`OYE zmHm8I#hr-1B(5XW2w-UE@lDRwS zIZ3<$aW~@EiT5Y2l$WQ(6NpzQc?;<8qu=JM zPP_{7O7dmWRN}45$Ux~4Eg{~HxGUMSh4>ia5Ph~CCqA2aE#eP}?D{zf4^5^Lyg=iQ{uz+zP?? zCZo9zOuRqw?XtMkka#6$S)P(p>P)BKqTNA%Rd<^j_6=eA=;y)3$ zswi`M{4A{@UWs@qlHX6fG4W-@uMxkk;%|tjQpQpIqlJnQ{qPfUNN%N02Za)Ey&ZsQ_zB|lVvc0b0;iMZkiAL5mX zEB+5B{sVEiU9v5icx#pZ1;l%(^zSC#Po+PL_$ZbBSH!0ihvp62N<+p&`xg_hOWcF_ zMdECJEu3pbyp5~u2SrajaUF3*&yU1s5m)lwV&Y4QEB;vrT#+GD*v}4<->#BBERa7# z@+VdDw*~UANdC4;UJK(srtB#S9R2oAC0|}3?@IE8++@Ge;~A+T@tP{`FVNG22}q`uRNZh9qBuc#$9EaC;GNMm(1IK;qqrXA%F2cp~vZ4P`yb{GW8< z?}$S-VB2!yj*Vn_h3_TqN?gegXNh+ruH?7-#FL1#xv~&~83w(8)Z0kY1HgTnH=})`~aYg@7;5fdQ0re1WO21$t@hozH zvmA^xhq$*h6PKpRKa!LX5kDv*-X%;*=o{)@x1KI4CMr>ij7ZcC9uyj_-_b=!4d^ZP zj*U(1tBZ&Uiw%#E;-cc>B4Q#V;&p?gV!~qw>k^`dMo2MHy?aMTNGP75>k}V4K-V`S zIW!?GDoTp)txJrLh=33v%tR%;q`2_V#0cf6PgD$q>fhR@nNHpfY8%j|Lu^cQH(gjv zM5HtzG)~tiPL~)pARK&07D#gVnB!ZbS5xU+(bV14PQlIEVU3^4X zR9t*)SZFj{Cb1)3!oc`MRN^rpHoT5x2p}>vu`k3%j^yCpQHcpsR18=Vl{m!3O(<7a zc%}|nD@7&5Mn=R8(V_8)vAX7NMkHAmlQh7^y`Bn&N5r_e*ENDeWXU=z$Rs^DGs0t& zFb%?^2B9m44wZ%ub@6bM;u4}HNRi$VF;Phgx?~qOPwo(+SfA{5H@H`Jx4RS?9u6L> z2gwTVg#&$H?m7yLoeA$9Eycw%30;g1j}#lEp)PJ-(tspP26spsK3(Fa_+)n%J|#l? 
zfZHQ<&0+_DwuA&-)8?F|vQ71*u-J$`eG=SUls$Ksx_PAB+)2!(zTRXv9jOnFh>r@5 zhD?k4m2BXyoaF;IoMf5MaA=%m3|YZlXMDyGIrSOiZ04HR4kIg}2(WYri;d}*1VKc= z5ya785JfFEMi-U@BLk2!zZ&A3$Pb-C#hL!+Yghma}QVX~)-hlkX@eR3Ta z4^DFeW^4*P9_j+Q_$YR{$u&XC(6CtiJe6Llo6ab9e5QyqD0dfJ4JIjJ>NUC>Yg7zy zH_A(T`em!#b$UNx#)YDkJj4~<5BXQGLKl^wix`*$m6n3s42abvU(B= zv|K`E5iS!!E`gJBT5^$sk{AZ%F$waVi3Cp2ap#Q_A4*hSeeS$`UVZLg(C4NLH>uDx z1yOw@i~=2OD6K5UdMP*FG?1cNTdu@oA?=|4?&jj*Dm9IZhuaVy%4zE3;_fANONJj$ z$wz;VOT=#7y1BT^wJhXZPN6#}bgQRV2EojTOz<^je>)VEV|pODPn} zJw=Y#P)>bRjLy&9=#D1e5Z%;wNxaRuxf)5zH$*QLLqSh{2y%8Y9OjL+n-c3m1}SLF zaIq!{F4hV$_)ac1#<7Mvlr>b;v4$!FVvYGGIvQFW(Xp{{P%wjh0@??(@x!WW0JOpM zu1nB`hJ_`8m$98RG$KCMg-dEBI_y58T;i7Z60Ut<`VyVVJmaSGjeyn~-6#3HWSYAr z4;ebtWC3;4^-c=wAHl^-FKUP?_$4tSG9GViva}&18Z_(0(TzfJsN18WBXs&!s{Sf> zeYomM&OMKbVkP86gL)|EcyE9vBQ&QI=*BF=I#Sq>u;|#t*a1*hWd!vtR=hEID8+|H z!39tyfal~k8zeAgh8EZ$?gS~&XNcS^X6Ap!oy^H@W`Mp9&(00771~om{C!Jb<_;5j*M~D#U(A^(;bA}-jP;qhxpxThS z$ta0;A4++4tduDQDk%at`e?YFM=6~5D)uBn#r>2PutE(jScNb-YJDk8-Ktl(zJ00y64ck& zdQ?s>ddWN>N#B;x%Y+;533`T23%wbT0T_O!2!g&Zv@=0AZ2+>UNg1@!~wuq zf!+a-J0jpN7E4tm2pBHp<%p`psl};8OCE2*=qg01`ITcdz~BZhG!DAU11PdLsY|8G zj(ms8NX{tL^gU!%?&G>k%Ec5tb>?%2TAP{_~E zjmR3cnB2cIa^ir9_()1?B{(+z3~hOAc^h(R9;%4PhYmLQlk$O*7%&o!V>C>Q!53VW zZ4A|t;e73`7nLc=hQ-3Lt8A>21{7-(XXqLj0*7WRWgIww44(iaS@999F{T_Sts}(; ztk~)=VpR=YMNR@^Sr16LKo^k|7OjR@Ll{Z9n-YsxNyVEGio?#XzU=`6M}`A9qi$cp zc{Ktl$bT>nf?a5S><6D`=!>67ic;G6bV3;1{KNVh7hoe1x(v+8N7y1MlS)j6ZKOeE zvc3zeGkGbPt?Ee21xJaF{&JNydTv@RCWLYSTvn@Cs=r)T+b5o-E8fYKK8~E-;&ssc z*LMN1FQag*RIuNrH238mA1@*Iy7R@=$t3wFPpF_>9}@QA^&LR2SEX3O9vbMo zxqKLjv3e!g?&tbpycFvaB)~JQ(0Ex~-j}-+ebh|H8m=VRUv-2Igl7ZTw535|wr?Cg zXv|=rN|EFC^T?r-6*(4NxK>rN4FwV_97rtLEpeFhs5O-dG3Vp6{=3nJN z?^1YBh%Qw)b7B$<7{Ox;#^il+qP{I4ALBXbZtLah)iEsx&(gy2fd&(#1Jjbxyd)i{ zC7}VPRAey9&-;qC292DhCwgzW>x|vSB$VhIHMaP!0H2ja-VtRzk(WgBVsDAA5POZk z0xAc6>Y7Y7brFH5(l9jGdq|?vHj}l? z@=2WhaX7E&IVO%`+{ywTAmDukd;@WHe0K@t zZwR<0O)OE{Sxvyxh^xc>nUntu;;i_28^?J+oaQ($|Cr;vJg(ozO>Ji#0q-o}Ndi7c zz;_Aw4FT8Knzr+M0k6k#-Va?lejHRP{v09TGX;Dd$4_v2jtTf30nZiiQm~#HH%u2^ zzlVSa3V4Koj}-7<1bm}_|0Upe1Uy&3OTqeO+|=={E8w{9+mw8ifR7XK`2xO8z|RW! 
z69Km>WZKRO0^Uf#I|=we0iP`3D+GMMfL|BzcLMG}6YbRLSX03L1iZI^4-@cN0{(}9 z9~E$1FKjAbeHQSNv~WNjUv~j-CE(!#K0?5M67UTIeq6wB3;0I?FF_M5^Tb!cTM2j{ z0Ussca|C>YfFB}W9TM{>{3-SPC4u}q0e7MW5$gDQ3V2TeA1mOi1l*n$lBn(fjyT%$ z7ZRnSl2Z@H7EmBjCF^&if&Y zkUo$r;>m-5%6vTo+RK?1bmf%9~AKG0{%|G9bjFd zsrb4Gcz}S13-~AjpDW;-1pK6cKM-&Ytm88kuhIfuPr%y>cs~IjFW`#=e7k_35%3%V zSH8urE~jN-J)kK+cnWws0q-Z^DFTk`8BK+|UBJ%>_+tULgmrwT^p_KGPXP}kULBP2 z?E#%Yew2VO6!5(Qen-GF2zfX-Nu8g1TlHDRrR(>GS7rw6s4W^;OY`8^!x?L4K{L+$SeDqfTHzY^#v=FHr~ z(vc6hDaZM6gNZBt8BYD!Bu<{!vy9`sp6vpDLBO92xJ7x1ZEF9Q;y9n~E*$6WX)563 zh*yVD_ZxP5J=J*9JUhf2Yl<#SqgDfv!?0eZx zHTMwk@x;~XGJ})9$l0@v<9vMY2>5#eFIL%L=Os>mU5@kN26LQ`Z@7R@;5cvR8jkbf z?&dhJ|2oHcJuf-V+i6|Jbi7InxQl=*-?_$g+h0q-8*`l3AIR~`5R_8y_2f7oZVboya7PLFJdX3>Zsa(hUi&zHg|p`( z$N6w`IL_<&#BpBU4-*hJbvkw!;tj}`E_9KQ-CE9GmWfFIyEum3v7dHY`r__SKGe=vJp z_!{Z9luq>ijd$%l%6B8a2@2b`Xz#MORV#mV!2dn}NDEs$?oSGGs3-;X%j!|R_W zke@A(KO>O8$jLwC;`>w}|B;jD{a>V>Y(Lt`^U4D5Bj7y*e6WD;Ca(6|Uz|LD-@L)` z^IUu%5Lfa;85&1ZzVVCky1>a>)`tVwkn{Ruh^y@x&dL7;*C_dI8pn@wd@gatp5Zhe zw2YHK#mWE4abABlPY{AlZBGN@Xg?ordyeyZE(rA8=j3@kxg6*99P(o30akV|q zIL_503M8Hs?66ryIxl=LM0((Vu*KCYF=W;p}_57sIB`S1ugC1m~6bHWTn(9Ow1Nah%sPp5uJIH-or3zH>Qw-am(kEBO}RPm0?K zP7m*gi=3WEAg9FZ1;-zAJXfWEIO(@-h@s?tPT?%zZp78`vS=jhQS+h1)%o)@C(p-g zVq=3Io}VPH*7K0#d_K2nBI{AhI}3PS0dFnfHM~vRS&uk|%iH5EkPqVIkHa-eI(8Gt z_Z7&G639;!$S>jKpGbyq|KQ~L@_3b#=gV(9AGjDcOm{xLdJtFpVX%Owa-6T9*KnNo z&p`pd$Z`ID+OjD#Gmo7eiKCqt;ZO0uP9Pu4$@BHY5P|$ef&2o2{BnW(Ax{1&XXiOi zp07`|&Hh(u_a0|+x&D8AV#ql{IZT<5G%`*jBNEa^jiO>pB01&|o5^vi328bx6x*?y zU3L*lrP7WXqNZ^!$IMnq8cc;K6}#`}p8NW=UhA^1e}2Eu6#7kUxR^$3}iD z@;onU-4VPw*z~wJ89vPLIfmyLe#G!HcgF6gf#KZ@A8Ys`!`B-ACH#k=wQ#@uWO&K$ zk^Y%aFua-JoeaMR&f{b>oX5jshR-wn4Z}AYo^SXG!;AHZy*^b8uWxu8!@Ctzf|fSnHN3D@L}-1sGnu{D~7Lvv(Jri z*57UT5yLn2@$=H_vrpVze|SGr2>%T8itp>|=={I#^{(r$64zh%zcf6qpRc3yNrrbf ze3IeQ4PS2fHp5RCUgf^n{WmrIHaO3t0ftX9e3s!$3}0*b=Wt$szK8R=@w?&W`$yJ+ z`MQQ*XLvWm2N^yN&h>o4@aGMG+weTYzcRf40KX3UycjF)j&pvW@+>?r_*{5Du-x$P z#C831M*gCKk^QnA^_#-kPggjv5B&^(2G0K9GW;vUe}}WrvV$W1v!6O}_J1v$bvhc} z)A0KZp9<%E=fFAcD~5jn=e)jvb6$txtpA(gWe59pbH_%q6cJHfZY z=Nk2MhX$cwbK~;oC;P?Sxcs^0ukc;L=fdN^(y-t~u(>?Fi@2^o(8!O2^XusXIQv-* z=l#HEhM$JB&l7RA(64(28F!HQF1-=XQ=NkSxoY$WXaMt+}&OVR9StovM zWL~Uu3!K-b`{CS=li|Ex{Zm}8&tl}cJ{#fe=dj_w8(w)_?0GeTbKLfDuFoBY-w$V> z8F1E_4d;6POI#oSOOfX~=fJtnoyPn5>UF+b+?{uOf=75fkAt)SS%&X0>K{U$^~+3% zz5aC#zX`qsEeJq<_vU5zhL}4DSqQpF`n1{vS0wBg6Nj&x;x2 z`s?p9cwX?<@O;^5c(I3K*RLwB>$ifles{yi!uj>|aX9aDmcm(opW!FqtY78f$bMw~ zBslLM(%>9-u;GszJ`c|N%i!#Djp17iKVbMtIQuX0NMs#2ZZ*Rj!ntm3;G9=CIO`8E zd?K8EE`;;G=qm^54N({}(vdzs%&=>r+cS95-C&tBiaqob_*m zv!8)*&Ub?03k_c(UM!f#`rwfv`(?F}-(mRohM$9T|5lh1SvUIS@b7|1csw*R{BLmn zIY@6f*Jrfh?~Ci>Z3FUq&`-X2vB1^`kA$+pe}e50^8EUH4$e9iruz4}>jv)!s)@UP z@`FdX4t0!tWAVTGN$_=BA z=Esi@p4fjS@o>HwGT-V(zA2pbZ-jIG{|@K4gT=#f6J*?xMm`hH`Y*uQ&s%VgyUFll za9$ruPm8P@y%wC;nXcmDyv~;P@4p9m_Vb|1FOvLY$mazm!sB*<%5RqZawDGu=k;Nq z;XlB6ycK^uvJUL$5;*T4u7Y!Y`or1JP;q@ck4OF_*5MVy*TLCmzTp+KV$Z9A;cei& zPWFLw|4uXfWy9CPx&B)XFMxlK_4&>4N>4=Q%l>N{o(ku@x*0y{Nnc;@$H{PBho2I6 z*Cie&pCg}(dF_L<&OxJ2qo-o`(-M9P^*f5Ye)8Z041XNX>(4?s*X`fp;q@W>{}Wdt z&+FR>@uZ*^uETH0bH88wbPxX+M>xkVJHxMop6?ZK&Z{Y$^~b^sf|B8JGTZPK;`%uM0D1Pg z8_xZH63%>unX%_p+wfL~-v#HmL*ZP9NrumYv(J~{tn)6M>$%17_<#EO>h-)t-0j~@ z*e|JucY$w1egZrX{=DHU;k;gb3g>$6hjV?78(v~=&_uB5a2$E4x-_F z52!l5FZ6XPJ@21g|ID|8v(K*LVW0UW{QE~MKU>oKLq?sMaE|*Koa63Q_1CzDgY6vh ztkduX|9O2p+-UgShX2FxCk@Yr^LTg{&Uxh-e(puzzdqhdWk=@4^YKbJ&+BIJ9l=|} z@rfaTg=c`WxYSagq9!{}sFlHeIKtxZ5wR)4|B68TlSYegN|9XOxkjggn<_ zk&$14JnQEg{+r=dUMgCi^M0;)C474j3HMh!ILGZ{csiW(dJN9`pTW0c-0uyKd)fD| 
z_iqJpw}08^)o||j_HgF=7(UwYrwm^T=eYlYa~(Dr{tcXco`kbb$;Ezs+&XYQFEhOH zlGy9nLcCb;>Gi?mto)pIfVf`g3^>2e9yIC~8ufoi9nQDvQs1W@_kHoC;M2LTPx<%h zHXzUS-)iLZk>|LFjr=JiU+gvCr|ze`xbFW-^YayN#2)tp zaW^i{$Mr`3bL2n6xCf2=QRKHFUt?vYpM3a3Z^riL-}3JI=IzM1{QCH?xE^;p^1M%;W90vhJns|M82Oz>e!r3b0eSY*WmRNeT+e}qr;EG! za(`tQ`RU@}ahPy{AMZIMUx4~NPwT!D>67c<&G4uG6Up+=*N{~Yo>k6tzM9~t>ijQn2Y*-z@4*z4KV@V?^u zI2ml@Cy0l~NroIJ4;%R}P@l(1xwVl#x&AE;U*o=L1)JV4AH(y5&xiYQv$(!aR{yxD z%z1U{z)zu0vbd|mK5vEdIJ^(e{BZHV*6j-4$2jD9oU9ht^UXp2JM_8V$R9?Y*Z;a3 z{rI|m131?+MO?4vBgk|APJ^?~45Q9L&i(Q`>TsPe{lw2p@2@JZXt3$|)`YXq zy5f4iqmXwWhR54DIO}8@b+#hUI=kSkbHJ$6*zMF{)AMQpXPvg6>PfBC2-z{Uv2n0aoy)OBVa(iP7B`x!pk z@GLm{eAV#PaE`kN&N{ysUgnF~>yT)8inw0?j>vO;2E)0YGvVy>O~doxtg{!+?^}<+ zng0{cJ}d5wtOM8c8aVsB9?pC>IP>?xnSU70`~t(5yHK#{^?y%XpI-+I|DT(Dj5<}^ zVg;MNp4?=3y5S29UjcWQity`Tj^W+iVg;L?Z+~&uC$G1U!t;X9h4o)Dd@Y=Hw!(Sc z{szwePa0m_9hAZ5`eeR3oPDMm-Wkq1L*T44)$noa>njXFea!{10&EOYe`p4z&$WaW~(= zrq}-laebZ+GJL@SU&rN72aj;wW_{&d*MAw#`~PL)t`6@X))}5_)M@o~>~TAbyZY|Z z74|t&T=$t_$ctSBXHI^3+LCz@&{cv{yN5aB^utu@SEW5v#;T!;2ig9IP1J(_)Zu*m&|ZKj)XIx31^?P;9Q@TaQ3+l&io!Y^969`{|D!J zRQXV3edzZcj`(TJcR+!6yW`tN1>OfJ^*L*##k?}YPwJZX5f!btzjUk~SX zrVpIgf#GoWGa1hM3*oH46wW#y!8z`BIO`vWvwpl=^k8%6HS5%XvraOc`CAR|3+HuU z1f27l250@baMt+%&ioF;j~IUUk44LH-u=TMaeZCQFnkW2eXfS{Jo*C8^W}))7yjh? z)YswKa5u&9>tl-HH^O_ZvRZ@MjEv5zanW8Q$A95p4Q=A0h76fyd{=aIXIh z!(V~3&U-er`Le5xzb#{LQ7v^vI)v5Ol!E@-- z;mmJ=e~f(c@2<-Eby~vR9|?x_x4=2>UO4mR|0pV7^!vK*kD$XkHP3q2=S7mZn=j9= z6eHhCyjYN5A3WAv;p_elb@BoV$4x%xKd<-Kwc>7E_oX$=FZy5X{9qzTw}9>-kP78>z$ltH;G%eiQ01g!4FA4rl%& zxcd=KIIl0@%zqDO{WI`ws8g<7WPMoY-SUxnaeby#h&Zn&s|^1~#Ymp}cdWQuANOH6 zuR)a}dDfp}_}q&ldG@(TT<@t`GOX7xy(^LTe4kVf`9#j(gy$NS^28!1}TKUoYwMX3V(06_`Tbp6IP3Q^e3ao?hMyJJeU@t&>686j1!q5P4ex3Am*RTdWsUu~&Uw9B zDejKL$w4GM&U1|XM&zd=zstyfiTpI=j~e-(kk3N?Pa_}SBnSnY>;FmQFBjMIt%E%8 zAFem@?U8>5^@keyk;uX-xK+H zs6WKWk3fC_^6@FL`>z1!I#(Ch`>TcFw;Mhb&h>BFJa#{=;q0fQxb9~tob^9M9Uc$4 zaMs^~Iy^p)!&(2TYa{*h>)JKqu1|h_Z*An;A-_F%Pk5c|X5@PyzXSO}Mt&Ib?pJKW z`jd=&Ci47#VS$l<5&1mSf8WS|h&;dUA2#yGkpB$zOSFi+{^j7Okgp`}_6z%LfIRnO zW8@E^&M+fC8qRSiq7KJhh&;#5MxNvDGxA@^+{ycEDk)Mw| ze?Ga?$iI#}|Gf1>BmXh-U*h;YXylI~&-416;g?+(S!b?WZE?5Gyq|1s=5Mwar_ zQhwZ>%0H^)J+ZidcAs{SmNotBH06uMvy|6M@c9kOmx#y9I)(Kwz0BvkD(`uP_eIKo zsO^2P^1JGIkCXKd>zAtQy{+;w;+e`1imz7Q{z_jbUe-IT|6qOZU6ogMH!8ulNO{?Y z-ghd$SG<($x3KB4J1MW! 
z)cYvqyTunNuiVV%_bQ(wUZu4Ed^ql(Fd9C-A%IAp}Do<aOKNWeQt^Jzqj;$O8MmLytgeIU4KO@|9YzOwXMDHRKDW|?}_E2>#T3< zU#BUr*UtMqafI70dhQaKGGpqkr96`KRJTlqYxa`PIsA>F7PaLUjFuo&4*r z%HO!z`y%BJr+F_>-r-j7DHr+Yu>WCQ{p(ESpLg@VS$Sp;@6{?s*LmRY{`COmn|gWA zRzBly?}f_$lrMCtay<_FOz7kD8OoQ4=P3WJug{me*guDLPWSh(dn)fU*!yhdRfc%a zQU3R#-pf_?&td&fM)=pAlt1!-_jKjCqrB%R@0{+vTowNu*55tOzwW7g?j-M1m6y-( zzFm3lBi_#{ZP4c-y*(QdGm!nA6G59ewKKe@{KS0+&tyY7JJ{I z{LNRrm#Q9Jzu!{-`cCB?U-O==yuxzthm_A+;k{xF{~WH*Ht|&DEnfGz;mWhc=PAGV z4WG|dp8clxv&t`i%X_Dq{yFSt;=BHJy7IXHc+XMZ`$O-A$`7vdo+|fQVg05Xq`vYc zpLk!X{J}i$XO%bF=DmvCuZ8tTiQlXI<()piTzTc)-p?p+u-|)!%l&g$e_p^YLr`3wCzxuF$y+L{V0`C%)1?5>Kof-Oo|+*2)v*_eipo z-&M|!o2~rc;;WUP7T>G9TX|n6UcPS#`;3?0FG)~-v3QE|MDaA`$>IZ)w-e7$ew+Ag z<@bm$S3XR9gYpUD`O0UBpHlv!cw)Wi{nG3rzdmWoKM~JTzRUfgRba&C4L>^lcMLfNxZG{ zFE91^p2|yA^FCbpb>f-Ihl$TtK1O_r^5@0ZDF3(kPUUZl$2W+c*YDyf$~#u~{iiAK zEe}VFJ z`8~^&rqT0yFUjXKm6uNTK3jP;@#V_viRUP9C7!Rmi}-Qny~Rs4i|%u%c!Kgr#8Z{e z5>He9l6boEJ>rGRZie0h{DNlQvz50HU#&b% ze7o{~;swgbilgT)#)pqt2|x&YvoUf zpHaR^|7P)2 zc2ukuQ*{J81LM~Y`D|5|*J^0BRbot4VZh;L9{<$9mrtGu!JapgV4OI;Ve z4iAd=RGuimC$U8NLHT`xHOh~R?^Ry!M&DyDu(zO!{d5 zJS6;R^uXc$lG6u{PmfC;&@a7TT=IQm#>6F$9_e1Vc$4HN;mf$>{v$_>82CUCePCqz zz~qj%c5gT)y6zQ3C1qJH$)?o&D0zSC0j{cE|vJ`NuG{jR3o_W{M_Zu0M6suobszmoC( PmGAhjM!90x^!xu0HlnsD literal 0 HcmV?d00001 diff --git a/set_token_path.sh b/set_token_path.sh new file mode 100755 index 0000000..1738612 --- /dev/null +++ b/set_token_path.sh @@ -0,0 +1,3 @@ +#!/bin/bash +# source this file to set your token path, or add this to your profile +export FEMTOSTAR_TOKEN_PATH=~/fstokens \ No newline at end of file diff --git a/test b/test new file mode 100755 index 0000000000000000000000000000000000000000..4469211097b878e07046a10a603f9b4d73ea4985 GIT binary patch literal 180488 zcmeFa33yc1`9Gep1_cupG**#NV-2=wRIpI7giY?~L{Y#MbyQGLM3ju;f|BnLWE`(z zi)*z~#ii9&T9pDSCIkpBD2O8BLPQ}a1V}_BEXw>opZA~{%f7_`wadyHAj~enF?n?EAv}MMo~tej86D(w~V7R+M%=! 
z{$FHj(|@it-R#dS)2=msvR^KLY3Aoqx;VrBv>5Uy`?d714-vvY`!mDLgZ!EI!ME}KkazuqMun>-QXwwYnS;vGhR#UP-xb{{xsyxIC1>AD^Klp;`q@$$B&y3 zo!oPBpHq9D+UtZ#H=J;?j7+>Z26<}e*&{PrP0c80neETtJLCT(E9%lLEEnD?&d&zAn3fr`9l zD!KlLG*ixTnkzrLh4L#=o`?UZ|GNY$0mtTLyxCt5jo2zkZ#?WwR;GoM-KH=mO zdS_f78c}@t=rK2qxn|s?k})@pC>}Kah6!UvTyf=i!no%88zvYSm+Pj@HUxH=pgaGU zDZde2{#J;eman3}7}tTT^E+1JpOtSbSCNBt`C22twmi?2=T6kCY|HK6|E0kHeH3tFTe6%)+ohMc%5cicOIjsYIo+{n19n%tz9X9--Exeo|yvh6Th6aH56ZUnG@gO#J+AAHX`h{ zJsy#`<94cC&L5}W3#T*L{3>J;UACBAhO8)4(v!n~q-*K5nzeBP*yW3O_ zS!sTveP3zc`(*y;vDb+=I2GIaI~6-yJDJOzReMW11HcLcAS+pYwTxr?f17>u?y0AL zkLrx*uSYtu)Bna_o%qI*j!xqAHTXkyz_it9{AZbf$-ESOQ{%IL?JcMrFkIiB)EX`+gOz0q&2XUf28)HJ4mE zU{vME;ZEt60wOX0c=OL$nEEZ=Cf|vV$j@@(gY$FvTP~narZcZs)sp!tE?}gz!tDsZ8uk-Ec{& z;pabK8jKq`$zAA$X%K0yDTvj>Y0^f8B1k-OhmvZT|m-FZkq=8G=J^lVsoEZvh zR1Xj=?DigtHQ>a?U>qmDl%rgXUqkV73ih>;0T+xcj(_9CKUoOSXbCFRs84>yPG6Um zM~?<`oB(81Z}ta;(aNGW;POh;9+{fTA!t(fX#n4~h(w+FS4^5T=BAQyH#qTW`By_+ zudbSwKLI}{pzb{L&smuI4Xuh$>%?c}JMwc{{xI`5lTTa=LLuqh-Y>i(^V1?Qb5Am< zf7=OM!tSvlAjb(^g9;~J$%Pvwzb+;T8M$gIgxyXfE@=fKt_{b(lr^nas}7Yly$Azn zm?yI*M4=jkgHR;p)!>a36P_kx&Jkq8?(6x5(r!~Yo>ZaV3#`s5SetemfgpCROh?Uh zyUil($Oi7fa;ESdz|^a!g%wXfHwETNf+6>?_BG%~-kq!QBjoLXj5T`pZ99hvUWXcW zk92r3zt9`uJB&aWvpr^*aG>CRdkV(!fFV+V`FHZq-+;@J@r5(bIaT_ONcG*ei+zQ5 z@0^alc^HR;OcJJhM}AH?P6;E;cF1HO&!4KnvDMQU4%0a}1@1|JOG(dbfT6`$uh;*e z5`VMfzd;8n1Pd$hq>qKG+F!G&!?jgI+m!v|n%Am^X5IAdgnm^+JO1wbbDpUhnse8A z2c26rwDY}V+y1F)XzmN2#JW`tJ?e-R)t5L`Ly0@kIj6vhFL&Gv#yV9j07sQ)Iek|~ z8_-0|AlU93#~plF(y36bv*)#G?^IMn{cd+EHsjCEhdUis#=dOttX7pFG_2SLv(p%I z2Oli$ogT|X;fAWpp*Hm-vgaKh?!MSry#YN#@rF>v&i)!)e<$^HHzAou6P3jJCVp~!BW3uD~EJvGAd& zo>w03P+hcRX?Apt6W@%bLX*KCWz;RVtfS5BO3XYI--A_f;va<**~MjVWly^ug|88V zx>KAUZ`ZX0fN+lwoU#wHr(J}qa6J6k^6V$eXAC}SMyxusIMK1faR+6hYsqO)QpJgL zb3>~(hZ48ug!*m<1BbHD*zT0=D48#q4*<)+Bk0)P>HBtcHkd8b>ANgiTlZKfF+9t0 z&&&+P&&qNtHna`7;X`x6nd_XYfw@ACs)1Yz3_mc>amV+}bmG?*z@^KcCiX8e0!Ym( zIckOz@3`HG?aa)cUdPt%u%wd+FDc(KAU896+FB_p+B-OVUbxNlGZS|#Sw1-aUTjBZ zQTDtYPNH)oW?7QIV_O=CSIG--SERC^1D#QS~YhqJTQ&7HxAt)0Y(B|vv@8=||oZg41mV4Ydn zAx>f_2r&%vy8wi!ZtJ+=_Bl@GPN%BK5TZy3;lw9eLX0f{fKZ~f6N}{T6>f&2;M&rg z@C-0rVRG(w;9Mv3g1m6zhOwOR#l>Y?OGXZOvj~$b?ol7^yZfdq!tN~vXSw}O2*-D; zwZk(riZg!*xud$48#@c;uUE4N(>yge?#O18#}H8-PHa#ATk@T-4UW})~GPW)RZaR!L;o28gZ27VvAY;e1RslsKkoqcY(-ia${u{t0=@&Ftk@N6tYL=B@cYhS1PL z^+Z&2a+My}A=&|Aq4CRgZflM>6XdxQ=Ir6`4{>Y zY}S(5gr+CcF1nd9&P2zZme1l)d|Bvqk2StT0QJ)6fDdyM$6~KnnWdN|9LYSGv*LI) zL@gXI$&ZA=nv|EM$^$xd5p`$+4S=4gs8>}}S@BB#N(`fWiqOL+YgBc2bDGog%c0&Z zRQj9@2=knHm3rGW80H%mPFYEQA4EhZY^+jq&?-!C*x$-Yl;r1yGOFz_#xCa;Kp zX-C@3Ch+na)m3Jg1|J?$>u?MRV3jJ8CihA>5Q@K(f+3OFsQh6Z8sS?!Dr}hvIMBR=Z`HM&uU~?j>lWD`3>}`2gXQ(= z`1^EKmZ{>7oTX_D+NXbD6wuj@@~Z$a9|bxED*W@mH%K-A{7u!^rU)!D zRS{F=L*Z#uA#^qs+@>|8xySTrhMYH;D#!Hl5p+ZVoDaLxOrOH0*yWijOF7?k+otvM z;kP;AN8HzE6)Lz8%^*I{R23K`u(fTc`zHa{3@7}L>0>$0H`!}UmE}|40)|bMos+NX zBvWPa^AYJVQ)P+d8*2~jAZKM!_f=K5T(uNc&93f?rcaTfgb%+5OjTA>BHbJS=R@IA zsj$@ZvDe@+H(Ndor)2A8QeGfB9Dl~)JI#Xq5mkKEn7gt1wl0W<7mX z&j!Hx5O7VOW*G6NKp!6p7qwJT)KbN9y2A6);5$tEh`vMZLLcCaCe4EN)v%o)iVsY% zzFg|R3D*0lFk5Wzy<%8{jeM+fyJ3q{@c9TX{Bogqf)QM<@$c(O*b&?SY z?S~siFQpkbjK$5kVFe$Ol-3Qq?G|td+%UhR5^%#Fy^{@W9?j>56-m!m#M{$6t4>l5)d7Lv6Y{nC6Bp zkS6eJU8qd)1-+r=ec3OVH3X__Gy&Udpv(1ZrIK{ zILCX%Pj1N#`w(r+8u{F?MJT0q7-m#}&kcKeny#`M)u#kzm?~@De2jj*sj@o7Cn)Ec zDr??+0Dp_BX0_MRQel;+&kbuU6;|E)M%fZ0sAeiwqk>bl#>3ZVzVxx9Sc!xy`eXo@ z58>&73jh4gD3)rzQO+|}V-2l+RlhY=5mV(u;i$k!J{!^6^l65i>VNc99Rtp%GFPCg zS@OJM`dG#8!|qX2Whv*I?wx79eE3}z7|GY?Ow*?s#7{I;1qN}S8`fFC*){TU!X8GE zSl7I!ldU#YmQQ^PxYShHIr*yoW~wZHK1$qgsw|Owge@^u7Ik0MMJ-qT9#!6Q_^5ZR 
z=~HB8<-_j)Qs@RI4)uVuq>pqlc59Pn!TO7SqQ0-xx9o-;f%>p}uNYPzE*QkeD&L7|y;ppZU?U$r-$x^7 zfz3(je3B!cS9AhuntEQ`WoJhGLnr=@qqnHytDM!67evZ#l8rOk#r6 zp~=JEelcc-y&tD^sq~KRIXJq??EP=A)uZd3cyp{E4TClYgY5qpgI(Aa3S!V9g~3EW z2FyHssv|1Tp~D7rqjz(m}Q>hc4tqGzsC1!5ba})jDq1b;~hC9BJ>8 zeC$0`spH3!U0=^H5{R|p664rGQXMQ6ujJV%VTXV{x0{0ydv4OhP4#uc!H%SQ3Z3W`4wFuA*?vs#VT+E*x%br~ zbY`18Y4f%n6xbkD$L!su0tCSUq{u?p0Cd$f8TOTYUiz@zu&>>yel56SU*FdQvy=_~ zb7Z*J^CP}ad)2OMi3AQ5cs&xMi5~{qtBy2%GJJhDsio+n@sX?|cZDFwIcn>Tno=1M ziUkKkFzACr{>o8|-IBZ-?v_l`swBSDKybTEUyOPH$ij$R!)F1;K}uEPDpDI1saM0C z%!~yG2vDNNr304rYB0{VnDXryEVM`u_VOkUHcMg`1}L-9QBb+H1v3D&tcsQBifFH)b@5IzJo_vEf0AH`Zo~T!LHaHl$ zWf^xA*L0nkOl&B?^0JW~zh2$JgBp;5tdM&_p$2LiKLlt29LoN!ad>~DKG;x%0c0gi zOq5i|_~54se-W5rK^W_dX@WOu)F~Lkj;X1eRNXj<#9$<--AKe%f|=ytQY&AF-5jBi zD~RNDQq|nh6xM2#6}xv8`j|DT?i}ki)B=@MfAe*Quz;FL^$^Nn9;hU&{zwnZp9d+S zIz#csuv>yPNofh_BWQ_NmH+~Y->5oZ&~Ut8e#kus*u0O;Y9{F^?o>e782}5`tFNv> zD}7`{SBVZ;fl?uLXB>0Go(7J^3?Vergav=oO++b5i(O&q17%~nC|w}-Pg>jp7=oNI z++RZ$qM`JKW>9l>rA9rM0$>*JAr>TX^v<0a!P$=d2yu)g649mWt2?UPjOAd5ZslEWNdwX+gD_yBT&73+D1FR5M!M{TGs2sN8 z+`Do@Z?u|&i%fHdya=@rdL3=4CqTYUs@J2$1?Q$FLHYN5=uQ45O8ItFptVf2QFYSI zWYG2Ma{OgSAWQA^|*WxxW55q*#ws{*X{5S@- zdHZ1?v&|cUp~e5}B2Bh=_i5ML(yq^jU4I~L6rWvRbCIy5u!WH2nL(V0*mVq`Z@7PaNM}UO14>QqLVgRu99keXFsnBvnJQ~$% zc#ka431OEz?8028np3Fqs5^?V^i)^Rp{<6*DnOD_`k%D3E5Y+dUBlk(2!UPZLH1Y$ zm5M=-;)77y4N>S+bu;M?6mx7>ag-o?0Nki@2XZ#R8Y)yYNP8B*%+3Wl?ToflMPvI< zYlODM1)4nz2XLS(ov zAdxiwcW`%9ROx&U1~5=Np2Dbv{Y9ZxNLrQ7ZCdmMiabgJJD@=*Ni~Q=xl@QZ6)7eM z43cVmpaU^r>0xLrgLOt*Q17{BE@AXAJ;opUmQwHGryI+=;l~IM!Y2Yk##n;XVk{?GbLSb$HEh+ySeEO_ z;BbjGmJ|9Dn$K81Vdo25C8qGjGb{m2!Jl-2XDp8g3_(ulGYYyvLo~+nEHh}zSYDh0 zV2ovm1!64c3jH7`cF{ePx#ag;BD(qkleuDq8C=%)E>Thrm_tWM#e0w`)dv%B5~7SJN7x#E5DAe~ ze@0v5dTCePdgepUGjLPtYZ96UL~Edj4QM68b&iJ(ygVuDr;8S1^(6uW;=poJesW{7VP#Qz}`99?}-~ePO zzQS=|$`gKC1xHq8?L=tiUWy~OTT76OsYf=2wVFfJa)dQ-d~l>|Pcr(Y{{)r3rlA4x z!`VIELRbgsbVfF-ix_MF&eUVvag45r8J436i%L9p^H ziOAk2)mP2}zA+V%@&f2z&}kj%@Jn5sfnZYtPE&lKE-u0=4GZ$T*5C0qhxaU*XZ|l{c?}LM^4;;Fk{`qet z!N!wu;DLH)<+*GQsZ07oya_R8UQ$oar{jW7G>cF7|`ms|pSb|NRrBpI-eZP*#Fz`62@ zFL4aFb7!0?pD`S(%fsP2lhIFb#I(>^m`VY;z1LyQ)T@`vsC}9_P`j@KfbF7Ni{l%S z-@{#~D53*K2%X{ph=@a70TY0DBw~1G#L|cpA5jT+SGcBKbPGJynMB-#2X3cxOwZ%c z)ATa7IXjx|Sdh$(p^A-d&~VA003dT|lf#I52ukw62wJ4hj@qF`_>V)CCq)A; zqX&@^Phl4vQT|B%0Zb8cU&$v^hTRwV!|=^yVdC-E|L9a~B-%R!?WNvuI2`)RS4n2W zQDn9oi!rME+Ejq?HIx>__dH!)-9q&#td`J0!Z8F@}m(jjJP~Y~mP)|eq z8Nfk=t3Ib6i$k{t*3y>9Ffu@f+am!9ypTw(9+jFz@I)|{`nPBxOARk?uj+6bT56fm z>(9dPerM0_HlGGhOT-JW{k%l{AvEtg5ih;{iTE?2-K;J0-6Fw>b#E&mEfN18U@()m zHh_Vbh+htg@i6$`?t5U*L-swscn~XdYf;mEkBiK3n&HM8{Y-{~7hf3A(zhX{p#40sKlUsoD*|{WDA=&|yNj8=fHqdMvGxEG2Cap2GGbpVRUWO2!12Lcu zwc5UP%X0RPBY8JIKN5&KFq2_RY!MNAN=Ntv$#W6md?;o!i%(7zgsF=S!d38Kn;={m zM3~3%n;$x=?h(G?c)nbOrP_o@l|&X3()a@jVuIC6W4#I3jQsvDYkUO?jlM~Zi z56po}hrQh8xQ>XciDa?OV6m0LVhnNZ^&Uimj{C=QfbdWE*o$5iE;m~>b?4(R>_FuB zR|kQEB(v{3N194X0Sn1k;?rJi9|a}p3PHz`1(u=EH%9zRA((`Z3Bbn{CAK$?-=l<( z=!M`J*MA6sVCVGZNVw#-8(zi*f8`%Zm5oBIsqxLVU*{*)WIw5@K_E@4B1CHYO)9NR zG^wTreKq&k6J8)9KwS3orZQx0iBvHOzQ6R7YNjStBtWX$TP9T=?X5eU?G35UZjMwJ z`APK~L{22tXlm!!lF63+=>pr$jFiaohQ8aW?{l0+&-4Dqo{&%M&m4v2TV!JtSq7Ag zD0HNk9&ey5yItOVmR+R$l@Y`h*3DEB&?fa~l*;{F6QK5XF&0RI%SNZ*##V<_sbB=_ z{+`pgM$R8}e@|YRz?B}PY75QE4Z{sE&E?-V`L0IfvkZQ+^U_Kon8gae4 z>K$Cg-xN3(s|^LMGdDYt4t8%#A|vk1r>%?c;+YNPYjSyK5Wl)@%H>8I9>CzXeb~LT z(9E#Y3!sS(!7)J_Y5@x*8#A(}<$@KRs`802RLCGX`Y;Fp_nBXaaR?fhjn6+3Ws2tf9ZG_Q5 z+d^!Om=jSH+Z?r?<+v>#(q~^nxK9}FTaurfT~3mq_2AN&oX;aD|)!Zy_Q1X?~p zh;4xmmAp9oy_=zV8XWf_HkJ$9&VjjF2vykxahAFjR#ex#FEvcQvBwoCr-MnB$B_aq4ZHtMcUJ>hjbtNE 
zP%(AZ{S%&B;^r3O_36-MI9CPycc^tItSI5YQO2t%ycyxZxk zl+|d9aBjejz2Yf&|)^r9up0Fd;^-$+iJod>z_`s z#|R{h^!mOng&uqS0>Ox`nJYDxJz6nHfq+}~i0DQM-CcdfD7`6ryh*&lA=AJ`jyqG& zVu$K|68%JJ~L&ktT8lhry+u4bM}T-w!5~n)tIWMA!)o)3K3@%w59kdD?2-2W$zOQ zv@!xKORnD^D+}2MA8IR0gkWW7H?^`KVQHJQvK+-&*+pbF%S?VlOJ;(5K|>3_4bA{G8ouwY>@L{d%e z#sT#nn+c7^{L)_}N%`|o6gc{GMG7-xv|z7#0y9BXtQZP@AZ$5lb4Nbx32{{g=;$}w zI4%ceyo9p}h800&+^^5m3j{IPyJh$QXY7477%fBY{j#v`)BNGU-9rq4cJvkF@6kjbEk_{r?t|O&-N#7) zy^Vssy7lTcxG&trNfpvfxSBo6?#kUEBUhp3F(+8 zj#;C6?C8|0N_j5z4VE^krn1JIH%LVhY+d01BF3Z%%dz;17uJZ za|4c%kwxL(Q~1atuRS1TRl0=h)zVrzG7Y{k6o%aCg=BEDf6IY|yo1k!$SUrCH(A9T zjGVrT-5*U~#T&#WeZ?$NGpl&EH1Jd0=+SM8=O3C`MV=30Xv$Tb>szPY6q5c@$+RnG zHq0Jy#wog_%4b!RB|OMinp(nLI04bj5>9Qhgw3kcW|ol4)Y1}ad&l#;P%;kgs37eF zBR0{OZFG;VdYFm_Awa$ACt{Bq6p{qP{+n#xpV6J9|C~X^M0hv;^$+*0j^g~qN1?`# zmY59H_3m_rAh2ECc?8F44*tiX#*a1p zk3$vp{pDOF2o^@FD9)7S;2GFQS)UKv+u_Gzq=y3FYaSBCs-6_%X0$SfF2d07NZodI zOs?I7Kd?0tJ6U|u6#3msrx6Km946>q=~UFE#-&A;amU|{aj(I+p$?z>$Lu-@aJBra z<%i=}DjTk_3`@PRUF85B?O+}_1fk$kjJYw2Y+I8tUlh+l#suAZKQf)5>s7-sGOW+f zbUXb(g2(mwMxNVNXPv;gtDi8zYl)gkMDSIhqR^XOI&v2_Z{`55un69-0Z)b9>mdZl zT4MpMb!k-VS;OQh_(l$QymuI}kzcpsDu*Oux!FUTdWcxN)b7uE^%pkAN#BWvCbA4p z-_=|Oxws|EP$Ac)z=&8p86T#tM$!Ml`&N5r5djFKSizN8QhDMCf77(jmkSuL%Q{Fa z;DP%CXyuVmyF5nOp3^_AN+=-H>4hq7H8?+JOw=^yiJHW;LF1T3YXY;5)5ojH!+E%& zpz3!pUKfWY?7m8ShhrYq@?)Aj?@6mgg)|!{g@oETQuG-Hqq%F#)}7j7;*5H<_qa_~ z{+k}0{_FWgyouv>+VDsGX>Hhps1r!=R_fJ(0_@ePG9<41046Hccc0p4Dib&uk0VY~ z$t6AshL?m(S5k^`s!@vs=4}|q?0db`1@v?8=8S|Vw&8IDTHgac;DqOh^%c7-P6$w` zQD1$p=S9LuIz}!SXpb1an@nk1{#4dNqG;?*t3Kj4i}SU!fD;Gl&`O@l^;#84u2c3E zKaf_B=NI9gCR>-=R(LK7NXesD+F5uWEqSGXwzPaaKd-fn%4d~^;ew7Z_$ZQoy_#Zx z@_R)g=m?U>whrfC7>>Vf#9@RqyjPYNZvY6vaL8SbQw#H+Vj6Ju^f^oVCSw7?xR{5S z24o7{7!nA0&Px!&1QDoEM{GHcw1C5mH4Vj=ORK^8li{g43C>Q|qslbm!h2)oa;yY+ zDjZ)H#;X0rv?`ZY7UevkZ00nT4g9W`-nOH36sSSH$qR2HiAcH8`%~%!T_}tvh4e>r z)Zw~#ZmO-5=#{TBb*TtOs&SUu3{MS!bFh3GFtAY{p^%ILeusaA6f$|MS`v7H!bX+Q zg_LZxD`dOLwn(3Wr?ML3b&arH5hs*JLoP%1&H8jZ75cdvLdi!UU@ zbVQ-02i85Qj+E)hy(dGY4!R^_W`~|R>ibh<()6lqF(|ZNmzMiVVfEE=UEJUmgF-K( zknSHCwI$yH38V`f4h3;EN3ET!tw~6hFvqzrbbJ#nA?vKUw0$ z`IMg49l6o1rreyG#EEoa0jpXz;YLd4&FA1y1yu*~sgVJ1;ur8H5TLwq;MN`ve|?GQ zXlJEf-Gc-IwhnNEHr2hj?~l=?m@Q}2(ropEd2n_cMK|L%gm9GT)GDRcP-RVu;1XDb zs<;7=WJnuW!pBDwUOsiqrY|=Xii&MKI!E4}0~yF;VaAH@k=JGlTwL7%l zcO0QY7>3zF@ra9pJrq$qqCFR)k(45VOsUXa=dcvrW^zH0aHR)>B%2rU#su~=<-%li zZ^X>ksOzNs?_5afq|58op?Vl&k+81yD#MJyr&D-$fIb!?2sm025y8LESq}EW&JU!J z0&`||iey>_c|aSaXjFx$@zI6k?*0+i$d^s2QEz)dBPR0tw^sz}i8`L+6@gX~AwPg3 z*lXcj4N_DMh}Nj0Y+kywTF*MAhD0SFp+U)F3G6Hbn_CG2xl!D0j|*g;S00p_CpNpF zYr+X6H)y^c)QGM?gxIKl|Dk`1vZOU0JWx|u;X|(o2urI9uPD#7`iEBpLP)Diydo$t zf#tBM>1BL<{-bn@BXWfF&PF0T171=v^Xc{+n&8wpd#c1EPQ+jE?ZBqD7mCZcS+$ii zTwfM3k+V+dptixebzFU-G;l;veQ0WAby6pt*k_@sk+n%3bYjhH)|4(GzFJA7T^$pN z;Tb>2ot{VehtpFiD@;oYA0?>@-vaTq^Cr1MIw?l9ue^toM3SLV-O`!Ari7$kP4CEg zP%x2o6aMTBh%_4GrRG6B4gFpyJ2tA}sIZHkRI-_Y4OS7ldNoxN-*7L9r4UvcWZvaxawnF^_&S z%%QdsVu~{`YqC`Wkx=Zfpnz;b3nO3k9nHiNYbGGAWUf$bX{5MsO?18k&j;V4aG)-* zm5&i8a6$8o6nus4mg2Hx$(hXlXK@EV=MD5GRm42itI_Wo6Irhgz`#Ox8Mp3Rpn-?P zeda0Dw;tvx_M7}namsjZ{b4d92;jF?_B=`H7OtygOcBqo!0W>LZ`lwn?xN`#8K#&5h2qqm3{yN#7pLxInBwzvaq3QnDL#$GP;m2ToJ*GwCy$`sxvJ7F zbjsj(A@62f5k^A{?~u!`QfD+`o%Hc}iY8}3%UmsS29ziyCQMm)UmYbaU&)^eD%j*O z&I>N#TpnaUe9H!mDIALc;(c`^swB`votSW3|B~8zwQ>i0b$z^yizB89wnx@6ek6(z zK>33*HUYjxUQl+GX~m#SQqz!p(+c614e(A!E504$56VRE;J}9jPjGm2PNla|(*Sw;s1u?u{+Ul1rh8UwOQ{?#@e#ehqveUoZ2S`i_J3}$98Nk?GA2%2g) z5P&@(wZOpEB6VToH}ReDu@*ZrLM8z)ilG5JJMG|xs(Xccye`8ie zJRTi%ec~01!-wz++P%^%oyf))>Y(dIuULYv2<_LyD70vc2lFhWFkBR}g*((NU4rJ4 
zp%MyBm9jS&blGB1=vrM`?kgpQhU;PmT^8zI^?MfXJLp;qO4IZgIPqREW+RYSaLa-H|0SBj~R|97+!3G_2p`VMyQNVLBR=rRhNU+M$44`O+k{)u>ep;Ud3GlUj@C)+ z0)4%kjUDlfl4|eq9Ni|qtQM6Zpf#t;;gQdjfzy8o)8bD;gs|Bhz zd}v48{k@VUpOs4}!Zb(q~yK)5ARaRowUc zTb5hAD@Wp}o6REg%@TxNQ)cq6Ks<_X&a83b#q;CQ+3wBBcytb|K3uNVI1@ExhEp}< z5l#>HAUoW0s9YuOQ88aicz|NaBlAag!MAYc6)$&IE56`RRxY*S?7=&oMDcv5Z}B79 zcW(s*Ipm0jbL6nakaB(ctZK+y)MeC#oy3s2sRz_r4*_mNo`-WBE#dA=V&rls5q;i) zJnfN;%}xs_bI9Cqk16xf8$%7jQ1=tr#K;-WuSd>@Vo5}2b1v0RVhUb**oluA-Ta`_ zV>hl1h2kIcJ118*H*h#{{qs5`s5=hlTe7&FLW>(+PT|=N$Cpljxb^kr#eILsj$MSM z2_@QL$0vgMM_1H+5SSdso{xvr645yt6}Jtai9_@99H++;1|8Y>rb&ZPFIh6&I}JLd zyg2@&#v>e`@;s9Mbw~N|xG9K7Nf!@?lC=iyy7$A0DbEMyn83nkR>|UkuC%$&A(ig9 zW7b|`&)fvQ5L3E^4?Corlj-Fow6Q38V2Gb#QC6aPx<#3BJ0&z=QNCw{3yabK2}rjn z3lI%jixLS~lqXq3i}ElrY}Tl7d}`2%#KSwu>~?J}NHjGnXW7^@WmIl$L)^4|YQdf0y3TRXc=?=O2Wf&iF0i*I71K*!ADi5)dHY$+-ATDLP zQ5lEEBnfE^WT}02o_EZ;Aqk_t}UPTxd{9gRxhvbXGwo6FG%*6Y;` z3w?HlL2$~h@O-1OEA6#ip)@mUK27?FU6HU^KY#H5m0kH89^LyXyV3yBlGA+F9K`o! zSK2xIuqz#q)P>rxfSTBqKTwK)hFuxs)48dE_ib1BMbT#MN@-;|M~7*7yYBt{FfHv{ zFfHeMCm(;pw2Tj;Qj$y4l0u{8BWqpWO|vfTVC&MY%S*7>`?4-Oh27+QF?@J zY$XnVh{lynNWD4}{UsI*{|9H7@@T|X0=IOVH>-pLQ}HpQvk50WXaL{>d*;q0P7c5+ z4{HLU{6v-R;MoToRLKUO6eppbjX{JG2Pq|4_ZAwHM1?|ZY@O~#g_`s|rp0Yg{zV)X zQ?;X6c^VEXF8yaof9=rf6E05QbSesYbm(o8Ui8f>pj?lDBI0>aTL{1KV50xM*pN6+%yw?RuxH20T-y zWg0k0kH8Gd?$_VYK=+jOkU<|pnd9myTc|?eDN>;i+{)P@q-$|-Q$W?LZgN~jLJhsM zF@fd>W`+wb2K7=Dn86tQn9}12HE4Nzv8ORT=$@itKhQ}g$(Kj#Z@}E(7(bU}W7$d6 z_K5sis3y{Ul`f%>*CScji0gFp>b>$>1ZjiOHmH+{=BN8HNQ zZAJ{+9mpagYb1)cvQgxI#sO5$%&W+f2_L!N|BMpBc=H_qsfSV>GWNQH>yiKk^H@vZHcnO_of z(V5MPusC)+;-KD?{9g&!_VwaqV91W)gl=NnH1%UMtc{R=`T)00?lpB(4Al z+J(dUMw4#gGF!1>@02j^g^xr&s0Lc827Y1u91VuY!ccaDC2(SIHW!Zm7wA$X1YCtPAGR0bLLEa?Vlg-5T|h^&Hb z6Jk^7%mHOFdXud$4dp9Z>6&_(2W|*IaUJ`juwI>qLUSV3?NSCIF|2(Xz}n#`HB>-X zvrJxfMB3R_5|d_iRgY5OBnBmv+rxL!Q z7&tAd&Hy;Q&?aK*z0DE_{AJ3n@If7|&px=l!v^$VR0be?i^NE=I|hnZUxkI0q`CWy zn_8)Jz$lXGwDWHLCQ*&Lg9Rro zLw-26AyWt<95@Er-5V6^B;d0HsDK#Kc2^gWvgz)oe*>(KY^liE4@{qaB|OFH<+7|i z5V-%(F$~7h7MO#%&^_sfV-@6#$^>=}iAhkYUj1HD1Nt5Ll;#!%xL&n_Bg3`@A@}@< zu?^buQn1q@553%w`%oTKL6iIiGYZ3qi2tfTbb=#@tP3Ybo4`G(UgcoGd#0GN1|hW6 zLLFmcN7xFHp~Sb`_{QB^d+A6hKN2+@jz?*zL-fP=^lZ!`l+YaeGXt%+pa4A@QX{4! 
zaV!9C=BrGBslV;S{7GS|rTM@lvTPs^jRiu1Dz%CUY^{hL+6WYi!?p|d(taO8>ZNKi zYPrD%E_q+wjBh=gleo~tD0+{jvjqX{vgxU6%ZwqxZX>uo__4YSLwc1c0x{sJa%l&N z`}$)3L{P+Ql0%e{@u*kFu%6r1+KWl5lSy2SJ^)Fcr@vTwO+_gtt(^2G2 z8uSuDEEGJNWIV9{0imPX%%(QL0$eDYB3xT2g^1cobBZfn5K_M>#LGq45Sb4lMW@co zLf(sscuhM5?~OviNafEsq7VXe7f}{UmvB=Z+xtvrNGOAY?YQ5LHw|_BFVqAcX((OF z<|a?h!};%W?4~Dl$g2^0!+Az+oPocz&S=4IHF?6t=nTqUy3Tmw>Lxm4vp7wd9neeH z83)nCgPPPCg#n%MT@H1I#*GStxHYRVBE%w1VH_@41Qf<-VBtleEh&sUF)Bro3Iq7) zX2O7@QA34Mj^U^8wQM%6(n!dUI_ zA=PzIhg5h)p27%kOc6#e+i4z_R$2GP=dxfEnyo<3{2w8m3 zXDXo_Ou`O>#DwuxhF~m>9KZl!W-Ja)ffg^(h960>e{y0{UKg>zY<;6z3+K(jj2NHG zxo!CNx)t4`g{-A!<#RLUwr``F^QA^_Hr7+dAIz*nm1^@ZGC8mRSWPPwbi15#Ho(BN zKsJ-=YFcDyNNQMI;4ox%yZob=D+u!AhBQ5>J0&|}KRs?BEkGke%1M1i=mIoz5`4jE zgmH~=+@6ZcltIZ0dg9J)ruIOE@eu@{^U>HK0T2KT6dIN6pK#`gHuln$C{UdgI@=<) zdAyo4fyx#?B6MIormI7J*ED+x4Cg%L&YeI6e-IP5>#oa4U_tpht)!oOl62g1SzsxYcK0DT;dFK;69#3EVe)@3|L_e%b3u7|fv zDHAnKzJ+Z{uQJZyTAle=2($oJ25&3Ic9s@FKR^R*43pe0K4wbp#=tYtHvk>&NZ!-Y zE?AEB$up$NGeD@`_z;rBiDK)gi$A{0n{gBc{#=qIp|PTQY7chS5HXS)7#-EC%`7-+ z89z&=SK6R_GAxr_dYNN)6nN>JT%-IF%yVOA+6*)QYx>{Bil$zuKa7M_U?1v)`6Oh~ zw5_*ZB`+s#FWRxRsAODhV}E{;#KCP<$2|kDMK6vlK)Z|svgDdd-<8qteIIBO7*1I& zUlZqH)N-_J{;~@awy5(t#*XYrd?O4UcF#T$(TvjrPp(yCpo+}F+4BY@!yU?<*d|PF zaIc8dcUSbYushJay%Jk|IiH;Bu}gQT%FLd3OVa7k;KaUU`;LX7zB{9zJM&=-`LCPP zeOmquA}1&4`Nn7`bqNH*ZCiQ~e+(;BeLlzGRx~k}PvrN{aF>0^@~1#nNOpWQBdFXdRm z^50@M=GPB+V79D0d)k93n#q%T-n)0W(G+&C3chlD)5U~Uqc&6D$9A^9~<_X__T*Wm%}7@GI3&rnhtNpDDPN{R5no0`XDu_&9@da7oahv5uwK0`z9bS;n3=$X;C`W|f8zU# z9{)$xD1g>yGvR*cr}ivKGGSwIqHH*w2iJltq^B)f3$BOOBQ>fQ9S_4iK3v_dubnUA zU^Gf6P9+r(P)VfR(?duRIFvK7iq>@`^psDz3K6qqkyHa=B_4)*8MVemN-3u&8DpDS z+a99zrb`9R!IJ9)364@=wk^d!_xlzPulH!7?KcX{V2KfQJEgRcP$Bs!g4ZTRm$D}^ zKDolJLdOGYbP?Q9Y)}h3`h(wvR#Fd*{!--ef;58Iy*u!*xpN$fH*R!r4&K!OK ztJop^$b=IwW3Rdw!CG{BuH1?aIP@1PU6yeO2+?;V4@I=MXD%_&8r&DnLc~`72B#(9YGEI7 z391UaKues&gHs7QA@~uuitF*`F=hr;8n;3XlboM;gG5u~08{)60MN@U8Y8j`o$;K# zSq7)?6I|z#gRvCh*3SriaZDywH{&(V*{GSD9%QGw%xJxna7iY|J$H{&B`EYqv-xc6 z#CEkQIRz}n2H}Lv#(=;1Utzb`r=f~X9m1`Jc;ei}%Q&&RG9Mt*K70BfIe4%SKJ!Hy zhvR>j=_LM6a@CD!fymuRB=ZtfQjL1j5JTdlR|$aPKO7&ZX@(d_gc3tCJZh9Yj{9g_ zin>3hAH%Vy&PulwIwEAa5VxQkDT4pw^B>h&+4FiGKt~borudfAdgY*{yptH)m5hGI zkX}l_POiWgLngl8X#uxG47d%2U4RJk{ikT#d9aJ5A2F076o;&CK0mhj=#?=z+|*RsfjNoFZs;JTu6EHP4F?Jolq-w!+_AS?QYlu*bhcq)P$L6kcq%utUHv#0%o%NARl$vcbC1@-FXu~>`vV&juzqC8e#<=~;B?jJ8Ao2t&^ zbPLV$tN#S>@lCI>$^@~0oQVda-on5=R@bKFE`r~)P^-Xtbq>aeRd)ki4RNs^lYRyF zO~lo!QMUrJcr22A_~!jfZnzqfs2HQn@x z!@Ne2c-gFh#I{Fct>fc3LM6ja<&9Civ4Dm0FQTWY@(%SaA!11N7M|kstSBF2Y+^;p zW<4t0i?9Umnk8`0^_HOU8cksHt@q#(i0OR5w9s>W4=RmIAXY=8opXP13EJu<*ezeH z=Q6*0HJ6}2mp~x)*AQh1sQ(z4)~FId_Lks;FMoOo1lV=}Cl6`#P6QDD67<(2uJ9Uh z2}G@8LmG|Kmf**401a1lL1?=K=8GZ_oDWT7WUcn9)o2;QLr-wNzAouvRhFcEgH8&m-ns2)WYSyk0?kk_*9=_U}^uo#*|1Bmd6v{=Nc# z;|b&}``we$EfhTrM%rqjgm$V?=guIJ^yTGx1(~vzqh6Ii%qsX7H7bf%KrnfN5Yz*) zZ~>?-nSOb8DFY&?w&O)Ar|i>`V~vg4z`vm>Sc$Ogyt71olUUC-P?z9%r_^AR3_tB= z8I|!=Vx5W-nE7|RPCAou1Y|?5@_Cu&PrqW{SIh<*;|WpC)Pm)xvT1y|1qrV_sqWc8 zbnu!H1~FHXd8TIgsf5p{6^3Q~-9&H{p6${JcDKt2q*cU)8pZV9dNo*}U=K~y;M)=@ zF}a(Ndbr8!Rpz@G0E58RNX}O7kmjjvfNgF_^Q_I0y#Y8;!lZ7ax(0O&SahZv(9s)K z2he3~#1XVN&%Xn}(8)lE(W8^#$&!cB)nF^}2ZIi?&;Q7mftmI=g;Oyjem|Wv#mi5C zmzwNxz071A)hP5eF-@UJW~1uC;r;U>=DC9Tz4B>$JcDA%E@wJFFdcJ(c$A(5iUao? 
zHBpd&O#u?#U|_eM`R7$)ZUp;W1O0@#t^zy<2g z^m43iy&CJ4vXwXl;&GpY3cC-rH;g)L?a?OcoMZtT@%1 z#E6yCQoqi}uf`V7S|ZZH4AKgSZktHc#@1R-anHPnhA$jx#Zv+^t$;=?9 zY)uuV(qkyHQb;xe%U-s}v_gVUV8NB#;&n7dH7GJ8^|T$lY|LA%h1r6FusSo{_fWFV zEhf^K0f5Apo}k)A@^ID`@+hfsfq;Y03Fz&80aCKa0G&)}o#{Mr5wF8Q-^qUlZA z?Q$JP77eE=6snLvyTpO~vK_VjR3e*}KkFmN;%y{!;8wWX=~o2e<`A@) zWjZi#joTBLKY4c8iZm!9vm#xeA}NzLKq`6mrlf%ooneLoPm*WjmiqppRLqAWr&eaO z{Y##aWWu^OdG`HkO-sE@>B+N?QAucD-|uh=BM(dsPmnk}`{db`nGbpoDwml?scCS=SlXYvFIXi)9 z_%mlyAdQv~vK-?np4zBFKSQ@l>8XSYQP(P4iUzS~FmpEd16>3Y?aQ1!#!`IF7PRx} zoAk`tTJo9AoE6Ydin|&3fy`MS`Qc{aN0ZE%v^1GBHV9_U3Q^+EoN=cs$A6C%*(vr) zWzH`80v$}|41S$2b4Im*J&|DMjPQ{;BZ9unS+10p3HI*)Dsv|AgPF5D0JoX5V=f_? zm^ouZ?K=NK%9uIh#HfA;-t1-0iinz}Xfv5J56+oQ;OveP-W!}&bD6Uof=;|j{OFwC zeKGz#rcBSk1QI(~CW#_s&PX)%EMRIwn0Qbxv)J()cA=rkoM9P|s^U7^%$d-)Y3A&p z+u6xv&WMJ|oDqelnKQvNA+s^yQ<<|YWX?!DlR4wWBy(l~nf8IqneBtjSrOVJb0*VC z{5^xcgPAkJ(Tlb5B8-yCoIP5SGVaKRyh;ER|0{PA6`e-(5yNKA3^n#IbB0Jt4($JQ z=8Q2HzcAuu&R7C=3X#lk&-3JOCSk^NTgoKLazAA)5@oD7{16=_XjV7HWhV!R z$#rq|>X=&~V(@rq@EMoVE;F#yOa|AN5zhyB7ZABFQQXLN)k6&;*TpXTlk2J)3Etah zuB+gD-9l$FE<>!Jmg|~=5%-zvx(Z|Xb6sonnBoaA;?{|lHh}CUT7JL{)&0tK#mxjX zdT&k)CR$9cYqZx0LOYe=KE&@2Xl%n1!x^9JTo)J0GOGW5k{U!j-yYyUqBNeGxEVe+nY%OkS<$lrZLHazPsG(ixSX8@hXzvVd3Q>Te?XTB@h zz(VBPL<{OoLm74*G_P3(8*i^>TWkbl6r}NjZftBzA3Q`e8xPf#BmF2(y5g~!_?YLh zKl&E7sM;>bA#$=T<(qfH#bd;9_*YKmG(1RiD?M0etnSL#!LqqnoqriWb#Q!>(B#zEbBeMh3 z2%KuK0WA~fw87DWYefvqE3`Y>=arm>45j<2S@H(!)@<^3?JI_8)3da2- znXDQF8-RB9RoKcVUERBKRXcQy9hg_QD(I|CX7U#vi#+u`0`Ee{c4ueb?J@(444|VQ z7sce|Y*~RAn|H#A(dd#GpMPO(qcp^GhK~D~=!yhheWnMzyMpu|g(hJ%!OItZ>V^9% zo|RoK->o2>SIc)R7_7o2;zuh0CP%$e0W5rZnx2)&vPztaok*Q*JZS}*fD~w7z+t;% z7!2kl5YNsOgz$k_4!5-yxo*EJ=yr+eltC*k%_v;^$u6JU%qy=fd zkn8toNR$$)~ML) zfbDkrEKpppI=%{I+*c{pya3Bn**R$TDw|>9@^CDU2!H^~m=XQ_6@l`Qfs!YbBD3TJ zJeWDS5)E956d#wZ%${+MOlY&y_<@`~?G8r6(Gh&SkYCiSbUOGKE6Xfap=u9dl5_)7 zkWx6FB-*Cx-!$cT@dv5-*(Va|dbQRT>$ieX^DEpxRTV4-b^DOt%8G$W?KMOPMZ=(M z`GsA)0m920Zof=qXfw;f)*<&vIRm0?&BOPiJ-&Sxfxd-igl705%-x#aWyp<_b~qP< ztY^r)P+MBZMaI5AL4@@y%wEKx;~QvI;ymXqPK8E=i}esS3$BC(*b41U<6F=;gtNQR zhnUm`yYh3e<0^=+Gj0``Cgk=zoYHq;$el&%VU2sWSN#bV=sVqR96cYyYRWcKT;KF3jLEa3%d&lJM8|I;t+QKLWJ3|h({*+ zpem#GR3arL=2*3@@^D533xi8$IXBIaNcdMjV1 zeKwRtz(n8F3$<5$a}SpG3Q!^48c&vQk3?sPF)oH-DVq$wcLyfpLzd&UxVah7_OPZo zSlt|w-W_00KMi>VN1xahK@8Y(U}AzpwFMU zbOsG|VoHw6ngUZm>g$Jws`2RDY$p?UUcf**@q(C8xQa_a2LqpL$JYq(5_0x5o=F=3 zo0Eoa8Jv9c9w0J8V>#Evkg0gPZ7NLh%v#)8MlO{)x^IP(eFhA>ym?2Vp@))|MH`Y+ zE8ic+3MZntGB#zZ+JGF6XtBDB_3^4@c#>!)szI1W^?-x%jv4Yi(^q(^PJPP1@zoo* z-=|JROLX;l9_;FX`;`y>xEL{c@Bl`UnI(`dHnQ z@D7$U3h^zNL;-VgIH4}*iCKhONn_&1k99JoBc`JZUZwfUFt^FYQgvc)O$e>bkGs(~y{ng3xh z54G3gKiW2F^WQdL{S9K6?BcWiBicvA@E=K-+VGF_ zjIlBNkI^LW+wg1Q6W8dUoKx7nH3F;8Vpx5mnr8I}2Ce=<9KtOVFn+5aWAZr7>I+bv zXPo&jqWu$AU$4(z<-#D!ivDb?p9l2+f3^C59DsSIS^ZC>EZyq=Vic@?1oUpf>I;Sc zpSJp0pidL4-)6t8{|4x~#Hh!z${Swb){cNj0wmGZ6EkfqPKFa?E zR{y!?t^OtbF@>Pje~Lvx5?ZVO#YG+o|5K}f^iNxTopJMDJG>Mp+T_mR)lKdkE-Mrt z`hXnqq53Z4okN6%h}$bR_t*CZ$~Kqa<>0lBJ3JD&aG3h~g-yQM7l13n?iq!MTXmu& zw(LX)8%aHIH=!E)y^Axm=iQccI_yl{9}L4Y4%{Ea^B2BbelHvYQ|PuG+?_vgd2r7h z9B_gFnv(@`9Z)V0?)#QsXVIGJxBSiqaPG>xUFJZonqya=An(9SYLOPM@ZE}Y9=IpU zBpFXC$&So3>?45wR6sydI1H$oUvG$lEqI0Ru@vTMSNJ;06~01eq5bY9!seHLNm$st z<=#H8@Lh<&Fuu8Bb6Y-sbkbsY6Zc9J8SaX|(F`D{snm;_x^T8OkOn{iRI#&vth#mW zK-#R-g}z?;FZ?%I*D%0-Jn0eO3--yLci4`_)g_~*eULrl*N8lYHFhj6D(M&7&>!z@ zCiw!oBiC4$>13K^sfOlQ>h0{ZdU{%Z)~Q5yu+2BZkounL`%aJdor(=$o76qk#ZIhd zrxPF1uMjGc_f*d?xx3i1%Z1PIeNJ{L2cJy_pY7IsHlkxB)VF+Jd^SiJtnFt<8$SEv zQ~S$jZ;LxsYvQ@Gx3lB#!5FXuAhXjkyMDg$~lqvjgX?s9V7T?jN%T-QIH>q?iEqsve0GBBQ; 
z74=_|{+W!23({ab2aj-x8Um+YjzgwSk1D6)OJO|Rs4hx|J5*>+4Uvcq&b2d2+` z_v3fz_Ob4^m>01Jdfo2r#|bfO@JZO6&fjHnPU$ja(XeXJsLgZwtd8Qqf!WvX^HTP3 z;l{RGjx_xK^ph>{`~EQHi^7z0Op7}lsem_@0n=h})M$w}h z`pFi0?Vnq~eAY5?$2zPM;}+Z5j-MoNIzl~tWfyN22w{o;Z}kW(e_?2IcS|FTSQbgU zu-+8>On~QE%z(fi`{7X((=Tzn;dqY1gJcL2ksyKd*hFBzl;~uy>Mxc0;;uh7vGv_d zJ=vwX`Xo;%VvrT%(*%AC;)7-d-hL{x_HDFK-QI*Gmot{2BsYD7z`6vrkE4w*5mCAY zMv-O&hxl=UhTn(EVq&_vcbY@^MtI{I{i*FWA#BlPmz82VVO;KE=?L0i1;3q%5OY*R=*j#EFfxaYt;FkFbRk+{*o%fE}d+DzhjD0556o!ff?8w4{cP_ zW!UR*U`M4OBF%-QR=v6wbu5&|EVvZ_%rgvLdt}708>CJbK(`wT2x{1qP`^TPEx+j@ z=Z>IMqTpm6H3sfLbPczT7C(w{+^2=F0|%ZtFb;;-sM$!$;@l*FH^}K;95))xdluNR z=96t$fklf61Fo@dV#b&3*3+mWw-dh2uHbvyvbivnfKw{&)+f9c*hmsdSg($eHgrwI zWk!M%_-Zr^rLUOH!AV4-CsShzCQq=(`DY@4a&tkxMxBKAUR%zCliX8bz+%=Yz}FGc z%)rYMP)T(->{abC&|x9)Zv1U=I|j!qR#)>2@Q*m%8=^?`$F@Ly3@-Xnwyp3{l+_UE zZ(?6&ayKpmE)Opj`S)W>x2$c&k&F@yy9>)RNC@b(d!}Yu`yLRz)Ye`1#f-?q_ z$%(hY!Ep2C*HzAebUPq}*zSWgiT13`40V5x+7n-1Obq^rJytXL!WM90bZZOR3^mYZ z#L}nDLi$U6)Mn5g)AR!w{zU5mhmqApn~9WO{0|_=Z;3+R16v=L^ zaKZPm=qNth6q5{EY-?Ty9%${C{X(d&mw%)3W8YX3ca+b z;o^h{3DlOAC8I4rKwg<{4Iic;nW(dMYa_Px>Lt^J6x1q6FIs%PdWCdjgsUBsMs?Lg zQoz1gW>I}}A;@}gQ}CoN5D^r5-iHEjOkDMm(moiL-?AolWDc(d1h7#ZL99%0jrNVo zp~7(GlO$f#0#S%0M=g`$+QR5+@>T*5MPM2$Q0Mir%lz*^sh!SWP+GeOQXh8pJ20?9 zJrC7xnoo~jj8}N{dqMuI1){g0MH|&^LR7mt)c2Cr2U5lHM!z{b3FFaHX%g6m1tJ+T z?;L4>*X%@3=%@`9&`%^F6(t5tst#1_q!$Y~>7DGyVYMxj&H8%v)GR3#S5lVmn?HeW z?sXB*$l#V2_g+BL2a!tIW7hG+w_`mOkB->c?JEiO~LFSh5N zHqMdOwWcX^pU{7<|3B=#3w%`7wLhLA0TKi!DWIXTGAU`B)K{WdgS91tiJYTnAPNF% zm8ghd6%Z3dtvni{OvmxjYg>E0ZLLacYb~}_i|>S200k1>f=CpB69NQ53=bjy?|1EU z=A1K`;c>6`e)|7?n$Kr4XZG2Twbx#It+m%)du^d`!As6zE*{>J!#>nX!Ti&iR1fIU z;pzq8%PpuIU~4mXvtn!lSN)x&FRetM%zGw|KYn9gCTBYirR*?!{E1k@0Y~IvnI=@A zd)M>GX>NbPwuj-)S*efb-~<5s&RKAFV${XiJ=mU&ie0kl{t6|SSch789#=43p-J#| zkxJ4mLXn)BRDvd*L@FsAoA9|{hg71X5tB+CWI!dhoZM0=4ZVO=DvU{`05R#7V}npa z=ANaMufkG2wr65VCC)@op&ruAd|_-m%-^91q*5W-Ez#h~`pM~zRGK1n_gE=Oj#Qfd zD>m6W#BB>{V6rGhk_%`g!TB>=I z&Aa2|(ND30A_-trOJ>E6 z44jPwr%hWPnb?_xJSrq~#Ich@CI2Xiy<rZr0pk*F;(lx>S4Npr7?Jz_}SDlx=OR+ku3KV4#|&=Nx|Vu_(|Bo{->6%#}J zK@k(rLcdYeEhdH*zoPPRc5s#mALJ1i7l(rw`W>=ac9JI-N`2ws3oEQncd2TI42t99 zE*T^ji^4aK?z>oQ6!T_zF%(6fRHnJeD|t@=RwGhmM5kiN^7ZQ4t~#P)@uKZ~SDY&& zS^*CnEM2we^)OY*Dhkexj?b4?0DYm6tz>IHv?i-tB(0$S_E5j}cDT0mrwq*8*<};VHPzF#Rt0|!kBHeu8GcF{L!1u2SFJvL=e@(?Xuveo5C-@EqCZ#3B zXicm+KNQXD(o7dA#u>wbD(pQ~c2z;fup)Q0q~fWin_ojiG{f5Ij64%};p{=HKR zuuTnWV>6Cjz!ib5;zm;fyj1g-E{~yvUUxsDZi)j3dzo{VLKT-|e8+h2rL;^|FC694 z6I}E)b5}N%D0}_w7$JW~AUY^H3G9$iQ^6+RW_D$YoB^zJwz+hp4yH)|EUaFTtAja` zdX3VD!P*xgNG-UOxRg+0M9&+>t3cpm8=;(wzR4NK418BrXjVyT@AHw#Hdu0QS_Vs8 z;h033QcHXy!i6LlEM;P2J_NY7eVH}6Rip0+bdJIDUM!QL(N>Uf#b6oFJ9-@GY$;P> zupF#b+f7I${XnXU?))<3fUP@2?Wwb0Q;B|3-t~BiX<_jR%cL>$2amCcGA_`V8I||8 zF|#k2dsJ>?CV`Gy!UtSSc#kpje#!eE9C~HXI z?4h1u0SoEiHf+uT3!>RcDGbadII{Hmhh<%BAL8e!SiG=on-PL93Dkoh472AdhpqpD zV@V#R7s340Zk_?C6(rUu-1Z3DCpPW2v)#7ME(8ED#cZ1wBfpcEU7^_70*kd&3;2)# z&st<-0uYT!hiM`PgJK9Ri={|zS+({3 zTDDQAmhQrc@he zMX>_y!?b0kTWN<6gBD546BR;EBQ$E!K|UjLZk}GS)<0`K>f#zL&~k=`JEETJ!#TFF zc``7AEi>{-Y$cRm9lX|v;0jIy4Fxnj6;sw@;ucaVeL<-<0oV=5U@&7Lja$v=^KN%~ zEAd@=@!5`bp@MByzvj*yqH{sIp`qaLJD@`+Y>86ihwhfZaI8xEp(UvAM6^qDfVR^z z$D?DVr$IHK8S(%%2^E*?8#|72cuJ{(AGJS^Nb;{0Z-E12u|beI`=5iSaU8BEGRTg!QFOmxrqP_L02DQ70{xnF1f` zz}Yh)9AK%{T#a0`9F==QdCAa<1}pRKQJG1&nFrvDWlK$4F)|6QhlGweZ0t=t>y#;? 
zMP%kmAI1wjfL0KzEIk0B-7IA7RC#1N%||xXhNYUxmcuS<#zXBCZJ%onm4MNoQZVq1JVZz z0mocb?xR))YQu=Nn%{~q_xb%SCnNDJmc|~4LEacnTvG>9biFvHR>2$HsR9_&OU(I5 z2N8%!;aWPM+A~uOs*9Np^y03!edM`7m|pR5%@SEPDJ>`0&6WP{28I< zZ|rTe#;?un40{|YvPl@5n#S#BD+XAtBSE7e&+S@RG2t)nLrIWm2`hrj7Pu(jLME^Y zkOMpaA6%q>RnCwa?NC^tlpQARi!v!N?vAYdrmR5_xe?GxUBEuPlr0vMhPU1bEu3S5 z{|qSKI-;$JUJq*AZnlY@4Z`9kW2>1jbSfa&4CR0*-jZZ9luuNWE$CX!MkfzCC5-MJ zB&lXdFpvy8k0lmbMwnO&(&au88IM5D6oRg<*=k>EYtB%y~=tGStq zT=cxw14OmU9z1QjBnbZ!>gqAo%5m6Q2z_}kVYsyd#W=H2A?C*pvrH8fp)QzOgM4c7 zVxew<6nu_-u*zcW0rn0vBfjiYc3F}fb380%t$RmEh%)Y1qnx$0v}(+^Y8+`-fE`z< z;0imzuJK~2@i~QYuvi%l=@Eunpz3KCvFqq$C)jm-HU(Xoxq#Rvp^Le~TKu9}&drs~ zAnB{)XRqKNi24@61xRtD^GSZs_T#M-=@1N<)wI zg4Og%brYZ3+SRId3OfLw;*hSoocs9dj!)o%M0q>TPKZ_s+_jpgq02V(eEZ%H$jmnzXTc~*+WNEO=|}68bg2`D4~0UGMC)`zy4dK~q(Tkz#eoNWr4)N&k>vHt=)s z>kSYv#zbnmE!G6F{4n!EdBWf->Rb>xQ!x0&{D6N}70OBIjB%+PgdB92>SCPK6iyWYi{PZV?Z4J`15eQKpoKXQ-eP3c$cykaU(C;}ydNRm zs}K7syH_9XRC%d@(XttVKt{=^ZzEoAxRT(jIGYlaV{JAKXk%6;F|% z5nm+Qdy=r?sBo6qaPU!by9Gjci~P3?{ES$|p) z&mv*bjW8lC`D2uPqeR*ezWEcBr3b_$RABz~X4Uo^cUXV!(m@1(qujiGd&V7nh6<8W z^Vi$op9~8t3>CT&(U&n862-T1qM(TJZqVC69}pn$n{8PBJ%F0LO^k z5GX+gMy7%6U83VY!f_t_!s5VOVNn{{0Kqsx`Q*fdV8%FHTst`G)464l6Y8TC+r{! zSd;3JyuYCiMDp*|Wp*$4-PGcc-@>X^m?27MKhK@~l*O~s{*JF1`2GjJ`5SwO@{L_s z`F_p%{-^aVbt9k5-YcVN+cMED|7-8hexyG(LsDkX>JRIG3UT+&QzJIwJ8%3eub+wU z{9li}=oWmRyX>9c-HY$SC)R9x1mFEHzhzY|zKeGrI`#&>FWc~^OR_aQl$4+%u6;Ce zy4n2UCT@7Si#bQ-06{PZ|EStQr-y`ZGuL**e1rlc8`(g0o4J4~VblNG!r8O({)+4+ zv;UGe7vG=<&L#}^@Vqn)`WIia#1kw5BI9BMGshf#8Ug~Le_qilu zd>4{{@!g-qi|=9*F1{}#(W(&wzhkAG|D<4xiKbO;pP^v!>2nGp= zteWv;mN@W(4!fosbyxHX{=c2UV6upX=*1VTWcOOlGlgA2e#+u3gTH_bH4flHZRQ#z z1>XvwRP%aBVo-K-E+k8Wqt9RR@+kxi(dxE$(F(9a+Ny^)N@3TrI~J{0L14D))=h8Mjkn@j>23@Y(jxR*6ypK9H4)bTG zR2jG}>4nV7O0D`Wxdt$$awPmJDzBjyC+$QYk#w;Kh_ZAYf_e!D;~FR;E2m?o|5=>5 zyp>I{kolvRK?4Y(@@()Os0}I2huOX>`yrT)Tl!Zo$qbhP5WD3JMwhACio91wW_`(u zt_;sQN(^A%AIDhQzgD$f81;+Df?y))z|Yj~eB1tA!(Vz3qp91AKyM%^9(V(0;zD}) zYi{Q-0ukvcm6LEHNarzJa15h3#S6guu*56&eAe z(#`AsOVHw0|*!UT4Yw9Mzs;Ysi)D8U=+UJ0mx8d z>S;S0DpL(NyCR84t_opUFm+zy!ATc~*B;W6#J@Ubz6lF^$pfEJ#$Q9{ANVvk*+?p4jv(Z1^T zuKrmReBm3@O0gvf?ghYI+>3lq@a!TmASv5C^dfY<-qZkuc3Kd^=Ty*xUfSp>6ipv! zk@-g4oCn$^ZkoB$A;tnkZy``1^nJ|y2^(LBmysJ&s<#FzFR*=T$wncI{&Hyd? zAarvWSdN+1qubj{!~2`RCMxZN*>+a52>Gf*DS_sL(i<$qQkwV5BbY==^EN)ZFWb#V zvN&o%=u))-UkTS&@9@Dq(cNG3YvC*hgss$9FXOZ&|CityR2b zEu^BwTSCA;rxsOLi?4`4bw(R6PQ+JefCXO}{|TG^$o&0e@YtI6s!%YJmS=O7a(1@z zI;s+xkQ$XkiBf5blrEhTzS8?w4qp+m)n-om8Q8>lauwz`zy53B9bD!25YgQ+W+~0H z*?!EgE{|O0Qg9VP4ai?LeY)MWzh(mRgSYHrbytOF?NYpD5qQf{UbMmBRu>~nh_jV< z!@IeQj-_byt_mnXZuXrxSfIb=BSHqwlAQwH$~D{uK(PqGT6&8@Gv*5XYDh=Q0mrH> z?m~G^06`dZUYth(yJY3j-us$>l$R#qStiboKMQ*Nv*w_8!7tMCz+Rpg+EB%V?XV^) z1~Un0H^5-V5&akjuI9;LXd{(7t1Jeyr;U_uJ8$FK9w43bXe-7_cnpA%NsP3aKV+JX z^l3`IR8H}9e)z&bOmOWY!&SGFhX(9PD7a`dMGG^~D!QB6gzmmGQ9&$(BP693$P506 z*45*d?_*v8BWXHFyd!%vd~l_-czpOtFN@nAg8XpvIKzjKX{DPnZpnDu?*Lp7dmw2M z5GB|gCvhB`a}zsNuXCORlX(ovmQBYkIB|JuV*~=Qx!jdsaE#R0#2DQmn68A;T~8(h zr!+FNB75f99E!~Bkw2-`sLjn@y&lZ(M|s_wzXNv0Wvau^ZbsH%b@#kLGt_tiURYYH z^#dJGY5oH3d3DH{yVUCKZ4*7#!mp+v2SsuC7Y8LF%5(GDyg|mW^M>hz&(q$xs_eSK zWkb$`eXpia^8TWjU-dO5_zD`s&%>jJM>K~icBJo=3qm;AQ2jRRQR_D-16a-1WFu2A zWWw!(4AzAG+Q*w8<%Fp3DrM^jld0q9;7sc~o8Qf^0K0=0r=5d;rRVqx5x6fC#TTcg z`Vh5^m4erC9RG3j7%H8Ug8o}(fveh&VJzORkd*CaBbU>v^bC}i(RqhWu=I7U(Aop? 
z19pTq^ETAxt$!HnSN4W|$5`=$Gvw|s+6_iX)pODbO|Lw~aih&lV_nfX(0pvH{ufam79?U5hj-L+ zbn(~;j~sJ6e(g;Xhhq6{`*BoKMOWVH9=b<&(}@AXV=+jOnndJPA!wSqPDw}2z-*1dXoOYrJoIX36@aII+m3LEI%NFpElls;f{eDB~y0 zCaJ;M!#fM?3@nQk=RR~G6$`7bQ~Icdqits5@gANrnhzHhZy4lQs|`-8m=EU1KsQ@$ zSt&xpS7;aY zs*ekbAb~e|dqvHX`*KwQ6}%diTj!JZfcZZK^ub7~G4gk-If6}w_b4#kh3Z{}ApWC6 zDilUtUbyt`<{b#M!tzq&fuNTPOtoVhAOv3@N3uZAz$1^CRp+vqjT#OFNPo1`y5XQK zFnX9m)eh&NC}B+{RHmxcN~ddV7U9**x|*`j7yexV01QM{(hWqUY6ZBp*yb@GEK=L7 zOxr(8ii%gD*!!Qo0cha~DYX=0E9Rpp=pJQ8_&7nZrQ9X7dS`5iji+PaOgdPxbIvsl z_aSU9%nTzd7OT0-*eRbc?pmVAa2+D+D56B%9|ZYMg6)&8Z|g-?eIx~m8=jDu-1S+p zunOJq&8r1hoSk|DNOz0*Ms9mM_*`{Upj&G7k{K;KJW=pZ4sNw|P>J2;4cNv=)K(0= zXcKF^ALOk&x#`!NAf6cq;>*^Q+ghZi+$M4c^M!xd$mXw|ueyJAX2>9$I|zlCS3coN zGGDmX{cG?o?q8o{trsh7cKf007xyEh7ylE2(VX?4u5@#X{mX^TqquTf?hd&4Ar=_M zEVxfoaZ#aDckqurymsI!UR%PAs#bFcI%|zn86+z>0I1ty0asQi9% zxtdzIC9G+dx75v27~a?xQnKX%wpVO^fr-a%*4tt311+@7H(T)4e1W_?Xha{j`@I)K@m5KjYqn#*DmV5PS03k#ZTy7h3LSwa=zd68M%nSY!1WrByTfD{Ow7`styfZ69rcA1s5f^p`7=;VG2 z@T~?ht!xXHU#*Ahbg> zs|lbQ4Oby#%zT+qUgT7 z5ma)l0s{{Qb}8G!-7A0(Oc$+dw(qM&u_%ov=XOM{iy(c z!X`dBKduSE`-O5>k*xsV{5Kb^m@98{{gSaXJKgI_G7s{qA&$LNwe%+Uuf9FqzXsZx zHFGNQw_z2iXU}wZt|t4Jh?;yZY=~sAb#l-@vcTKS0ug6Ns-iKTHZ$8Kxtp>vBo}>e zF=wCWbeSe9MFy|rkq*4Hxi5hlY>B{BOhJO87>PT`7Y|y|+Ka2rP6euc(8X=l>8pS$ zsStb&RBsDGcKKw4AP3}sPY9;Hk0G3nowPV1_~cJ8s{c(P=$k1*@bC-6vjY zI6_rIF(z|s2>?kd5clGxSa>NA{iulo5qn=ESG%({z=m`ub7SR%l>k|eQAJXJwM zv8Zg85H${Ea zU%x%W`gL3)(Gn4fkcASq(TgW61=JDP3*sEFS~NJC@LC!=Ude*eQn2V6mw>S1WM^&z zb*3y-u9O4poAiPUQ)RX|Q<>}k!|qG8w*g7r0&HG<>vBA75ga|V-hNLi8|PWAR9|lQ zyn|~ZmaUHwKCc{&-#S%Y$655s(=kpi*EhHby8RKq-TRe*8f^9uJ;LdtshK7 zH&l*?`3`ogdEezve`$jj9trt3G!qBL2d6`qmiN^Mzl(SE9L7v+z+}^?b@d%7L7{=5 zT6}~X1>@$gx5&^G+nMuGKt7fwEH3*U$+O!Hm{-K*DMpo^kC_*jklZ2$yT9CmWG?Sh+Gi4 zHYHq_+xSIYX4N_LfDF=gAytuMgk!9i4L!)Uo=)5Q_JYV-=SGfCnJ^V<6^0bifhIBcJ z{)gIy?0y5BGT>=}a~PLlefUK{ zPTvYj`)8xp=yDW}neW4D^)`B&!rjNq?Oi_PtT(43CvW5Xv#k3LKE`sFXTxx8ON^&b z%=U5u;&zyiK8vHp3$QmoUgJqkEZ!3!G0qPGo(EM|!XN9L`7++{<#mTz+0ANuJ%XlT zQ@M_N3&1*08n7#`4?%NY1MLE2A&KyOD_7Mn0|_m#ds|i_%*3uM!=<~iZz4PXUqisI zH{PTlLCZg4JyQ!4wIw1w`|k5r&n8OG)ESeL>s$z(8&YR?E-suI#$bo3O-#w%@ zrxqtPK)d@V3UHi5XUG2Kx%$Uh3!+u7*wKp_f7kqZ1dZD(JnK#GW2n}gYQK?LGMi#9 zd=O#Pe=#3-E5LXd?v)4Im54z(pG?<*XCxPWJg}fcYwfS(%|nv)l|bgH8@ORpGuu8S ziUCR?hPu0qE8C-W3bz}+pfCE#`(BYZ{t*?S?<$dm;A`(E$ z0@;wl@k88?_8|0&P{E$cb8x8|V1#c<$bZEiv{u;-TT(K@EpNE;hp3NuH!hFcZ%i9+ zzwO)a;`-fTlDqr;P^{m>6ZM;`G1YI@!hZLWep^wj(e$=CGO|3+^<%VOAnq16qH~1|Ur@?nQX!Jj?u-qoqNeBMOY+eL~0x=(O zZlhkOSvp$P;6(EQb(7u?jANdK*gRfTXbjjoB}Lq&VxFe`iYo>qVh zUALyeyrtFGgY(1FymX6}3Up`%IGxngS>Dt=qu$hMJ`9TIytje-3Etv(Q1#?+$>D~3 zN3#P>LQ7Z{B3}BBD7m(nza4A8Q0H!~0d6yE@B)~yI%kzMCo&aX!>FDGV%pl-@a@+M zO7k@TtcTHoc`04^o>!vU*IzL&60i&HbI7%_yLf-V=d%zr!$^)oONjrEO@C(#unPii z!KV;+tv>Vz@iaLQd<(P!2dDF{02h>S@;r|pV0VfMdSUanFM+d}3po{CZfTxrs&pRD zG@X%rn%1Hwj$3APqkEbv!x?V@jK$ruX6k9fVl%bEHB*JZVWz&_GXErJYG=4)cEQ2{ z)5n>tSJ0Yp+{udYhR2z#J)q0QChItBu6#y8t%lV*Sz~p@Cabvhd)tpqR$;!JtTXoz zYu0>SCY|J z5NWph@YXaqj5Vd%hNNt>fQz{5h1po^NgZ9>b@U$?t0ObMj?PXUtq)msn7?m4 z)*jW=?ZJd(m?5sXz-qQ0J*$%*USJGZFa`Qjtx?cmL`OkN?@A92D-T{{zW*1gBoOC@ zMC4&b$>7Fu@sHk9OAN#vItQ&H?vSftdXCuPkFZI$QHDS}e(i}iAaDXWG*6(PpnZ2Rq$U-a7xvOk&ALS}bgIFQ z9(1?F-C0Y|M@!)P#D|OPORyvJ)0JxQWV4TZc&8$;55rV9epgzLa-!Uyt*=z7;A_%nemtOrcfp^8wNlW_~(pQ@MDJ?TW)MBiE}FleKt+uczE&y{3Oc z;0?H2g0tXH8_-sHC#Gh)|F!OCXcqON^|l*%h0-EubTV7s zqeZX6AU_D9&{B2H$v^;Gkoe&&@YK3-E}FjTYwODwO7=*4%k)Q&hp8_C2A-2;u$ zAH>AB?L?P~RJ}_hhIsGOUM-{Hg|3x1xkfa>V3u|d#71*Vd{(1(N|J%!Li{t%njBbl zMh^PjAO1>7f$Q-u{)s#x>I?3NxeF>nc!w(^LzOVJ8ojl9@@9v#C%64dJJS=b(Jch^Ql{l`+`hd?X`y?GD3e|k=G%5}FSIB?G?ujPB1aMzH 
zNCKWaF{er~<718phCkCX4mtSxq7H}1FeoV}o-o^7J&k^A>Pzeg?k4L!1Wy)EOl7AP zd%TOp^;PMfObnl{NEQ4-_kkW|I!0R+V^|aDXa*6gHg;r=I zf81_}ToP6c@PHkj_EsXl*$wSUGuX9A3-?NepLHNSJeBq=)dmcr3kwxP^0fAZYHjDA zQOURLreri;u^KV0o}8G*$Qox%;ZY2A52Ehuigr}18@@w?DCu5mQyjFbk7 z;p>&StDlm3`vQ!47jQ;$;}lY~`jdmjh>I^*bs}HtUeGuz- z@f$3qu#|vddZnKi&>wRGMSCy&6WVVdVL>T!3+x3W0+Hpy)(>R5wVGAs$T^^;@*+-c zna>XtJDf}#aOilP4BL^G0|bb?g(Uaw^QB?s4u!;ZOuYMIPh#&nCSG=-H}Nk_thW%_bm;Mg}eV0M7CT8O9K+f8la>n{ITR z&)0t{wO+ehmpzVKS9LfQtyS;$0i**$7j#H9mjQU4zQvIJ1dC~}Ssk*$f?9K~jBcdY zL$iB5?0;>=Y~x{HxUsG|6Azv0c4p5W{%|TqI|w7a>VpMTguW*waBXPXL;lx>Kjf?X z1Zt7-P^Y?&Gc%TzW?;9$cqp~*h&@}awKYM6r^;9@}D9&`(r^ETRc=lkBtn9s|RlNlJ@m>NqPE5T3 z4k0?qec&E|P3wg1xJ<80+*eegny#X|&lrCey;>x9(oHRMfAjks}Q6 zwSg>Qq{9+6oC2H}g);;zDh>;oHpwSY)hZ;A^d01(qYhxQj-LbYqP+*<%RG83LA4hr zF&zM}FX!ubXshIFm-;tz*KGC=Hd~ko_tTc0&Q1gbqj<=-A0T^?4=Kg+0AvRQe1Nd{ zdAyN;Mb0K?8q9gnnJ6Oy{)g6ZIaY1vf*?VnSV`lK2t+a2(OLIx(ciXLG#y5zr(2gS z#UXxs6|uz%@q+NJoU_s1w-N-U=WKx4lDZvjJ^)U~8`((`C1}$v$QSL=U$Vp2FXwc| zju-P0($8XgwD*DMSVpwQKeGTrnivC)_-k&Urh;B&Yi%1qIGx453`2j{CU!~?S7M_{ zvmd~#Q3Vh;N*;i?6000$W=(tLchqFZ97IT-?pr7kcj11ARG)uJPCCJ;yR z=KE&Lb>%3%4;$MXJ*4^+F$q!)kWLYK1)8|vG)9$pD6v!2002&9Z^r1J{Vco8z*@r> zPO|13>69U0VDn|jbW{#^6q4*wY3qnkt(=Vj;WB`JbYzoO-P9zPlwh*x@7UCfqd-m8 zjCcT6+@t|rO1qb9h~~1{x(xKrymG{pRD!~#7@kg-@06Q!ev_HyA_t)Ggv@3x{H_&;HqXb zRBwktpcKKX?76&&gbncvM8p$z^cgow+GXy-qtip!8R({mu%9CRV?2vvE430nVU_FH zMP#`~!~`mPlduCeVdHWz^r$6BvIR^A+(&JT8wt8m5p$VE1b)Z+vI+vA=yZp zdewCM=XZ*8bI$+ru5&{@dlU~FeowdC-t2bAl=GL)pILM6UNs+7O zKX>(=^?l5rH|BNu^}3piPyc+|-2Wfl&l+&$pEnE{ zy6nGh*gx_7aeIFHSmnf9=S=N2b;+#sPW6u*ZhzH~M@1&kbCD&2{C&hYk*gh6Qf9FHkaOV&!=^-*Z>WSl5s^kNf>+9|ogh1k3&Y ziGF-WW{&kwe0>JT&_Agz2mhA-4gXfu*ZdGWNTh8N_#vCr?*ds~>R+F}-#;DmVfx6-{RFUuxmYa9&qjxL}4HlUK`|w?3_nj82OTO^=N1UK$ykSsEGIt28n) zyEHQTEH$6duOFC4p!9{S0Kmj9_zdCP8iBDXPN~P!O^^~7FFH)Xi&Iu2|F`j(fN?HP zS&tq!;nx(&yai9wI7-dC&b-VpGE9dgDncdG5!b0Kyr>28drxh! zZu#jb`!OO-m4)BO(~)4^ij1lZ%Ql6U5HdZA5JSxJquOWOO_R< zq0p{i(UvmbRy=)@7g(CqJ_9dGQ*pycSr{X6&z5xLLLq<&M0||AUD#W8S~Ah8fES`<{nP^L)XGRwmCl0}h1S<%W; z4uF>wP`K2$vy}9AY$dS}^sN?J@EtBA7A74$9Q3VIAb=D~ISMgSC?gg=#7jjA!6J^< zwzAw0@j^%e1JR4)0Lo!LMF^!u^`*X#nbX3Blmog;ea&S;37{?l0v-TLDD@pG3$G3O zfM!JrrQy%Y!mLeELRrz;phF2kpfq=lPy$}St0p@bM&CdP$V3Q&Me9p_TiG&u3_?JO zQeO*794aqbuNwYbY~Y>hF5LkfU6hVY4tAshrl zfUFHpK^q}{{~0kbO&?#YB)mv@i@YIH%B=9Rda-MNejt1!jyoU8r&?w z8VQRfVB%RU0rJA4hNc)#I818jz!SE*c)}s_1PGW@;|anU<9Wj2{~=H4|KH#VobX>2 zPbfYfPuSw-2}izCo^aW5ctW9@3fw%w8Uc$ZfcwOFLg9>Qm#6q^mbc+J3%G;g3>~?DqI6bi18TBuR7!v)|CQ)hv}KEZ2yP|qZ9#7dc@gkdFOWycbyaC&Oq>%? 
zN0cimjx`BA^whF5w(>9Y6?~g zT~(_~;3`q;6|zdevQh!nlB=u~^GC#~B}tVKWu1-yg$z`3Q;AGVTq;p%!HFzNLaHj4 z#RMbabcCXkhe(5tNLlEJ!448IM(HuZCqmDXcS_VLFkNEKk#I5TCW0*nr6tZ3lq6e4 zvI-ifcF>3UrYzDEM2(b=j#gaPuuhVb!Ie6ve0X~#OeYVZC9{=PIZ!G7dv zkBu4Ot_JEcP=_IPBwl@w_4h4RqXFtQ_7A9cPuBgy*S7 z_^}yi5ntSN(Pn;85$npK(~@+hHiIB0>B@7~OW>D?0?NeGzl4AZ+U9pn^cD$zZPt^3_j!pNEZ#p$u)63G2)9aV4mq~m5ewPltew|6~Ue5*Z*=Z} zk*=w67@3l+=^q@g>A;Da9@w$zN62>EgK-3CP1x)CL}#*I&#_)6?e#*kIWP5R#Wy`8 zS<{1$*YtN!)U@8Q>0gq=xqIC?S<~+mWyyLy*Ls<>*DE-%-d^X#HSO?t2)dh!&Av$> z5h*hgwp3v!73Mu1tFCo%vNOosB3(j7OCc5uLq(g+#iS15k~l|=tK1-D%tu)QyLg71?@ZR6vi6%h#h7 zg-TB8@O zH6S;t8*x%_6+Wx+S%c5o6s>KWMxl+xC*0x*sAt3cC%%X|FD~60fherZA%bdZ^^a}JT82o7v z7&nz@kqMBsLm)w>jpA;w)8nFbns0H(KI%p5^rH1z(P|>0s8K6gpQP7%^*Z!^Jxu1S z@c|Zq8I8Z&wu{VE*x2sD2H*Kyn2_HtEXZ#c2IMy`L*A<}vQB<4<|QS9q4n|`P6UEk zaX#Cw`QURU7~}xA;rAjQ0z`g^f{Wj9N09OwZpOxBO4s~z7n5q?k8sdP?PT=4GTrET zO{UTF`fQ`;EditF9r;Gj$%W8~iuJ5^E$fJubyUkbq-7mM4Mw<=5k`S93WQN0i~?a4 z2u}tD-;Y7M9u#~BDELZH@Sss1^7QvjsQ)FT&r_yI*BE#yGM*aaDKVZ3fu2PqLCy=_ zqwLX6NungoB$Vi#WB&?$XfRT{8a<(2_k?=gb2LU69q5S;B<`1Iyj+U7z0WWE$C%O1 zlEFO@|7E%#VXkVykf2ip5P|*S1XC`9vNl3&f+GiAf#`|l0%^q^G_eZgq#QeBv z6xeY9?6};BT#*ea3N3Mok+H$>m%fM3W*y-t@Y#vCVCExAlD;mC0cRaA$(+_-(l7 zpKJXKUrn=sfI=Msp1{+2`8lvX1|CN^#+=V1`%rquu-Bm?1+L8uJMI5 z2;rtc3Iqh0rA!EK()z42&hZ&Jdb)uNO>?f!G;*%XHgax8mOVz!_^iPKWGzm0(|2NRPZ4GAUKbK4_S=V)621v%oyVpH(EuVGgU<6bH_?ElBo^u3_S^`b0XzzRoc3{_RLnHu=5yw? z$O^kgTe2x~GQ;5-T?Y)|ul0+C_##C{?qVZyAM_~%8VF|Uxj1QmUACSJZ^H2bUM!ls zU&}q9^}9Q!QZ^V_n~bb&M%JECpWS-n9z=cD8(X21YK^9jb6<^E-nVFtxR3U8t#J$c%*m!5IJW54m@2U>=0J@TOFLXJmb zA#+mH>PC+jKi>8eH6HTShO(B1vQ~kRLs^?Z$Du5Q#BT^??G`F-KLHhQNzo#g<#{NX zWK0qU`C{OI`_=c%v2UFS`cs^PF36RA&cWY^B=n6jH7Cmq>y>AI3ip(g8Zn!{IQ8>1 z&T*RXqY1Mc|KSeC{~56)=8wng-_IG2ErR}~(fuX@CIh|#VT8|+Vi8D@n=~Ubwj3tI zQRW!tuAfgYN-Y(IR>C3Yq>$nw6)X<*S!wiXF#2pV`fM}$ z?9noohWw>_@i_$b0c)#E@C6YwJssb;7Hcq!*rf=JjOAr`Vn7Su3o&p~!f^)1Hje+3 ze!n{TdQK;F-^jt@2|V4%xh9iL-N?Ba9XZh^!eA24$6fY1gO`mEA(*Wp4-Vg(pc z*zoNQW#MLyt~zZ7u)TxZL#FCkyC_oitOJRqs>442#pEmSi>a1S#8L%Gy#kSNpn_mN zQU#%oxXI{=SqXRbo|u^7-A2y|qiD(~)cSy@!wQfCD?rW_C0fo+nwA4AKn~vb!5NIS zJR^dIE1b2A2v@IgSu!GA!oq9Eh;SVXjY-elYecB;KzGn{KQkhC;A@|ryI;@!T!)~A zVo?a=jS)%9qX8$v+Ok0PtPCRoMI#&b98tauNE%T=!rx<0!qd{V@D-WpjTXKMggFF_ zzy>oR-vCjdQzQxy1Um+LAT6+SB!%iNaT*mSyQow^ihmH?_JI}uAc!@hg*cfYGa6`R zY$bpI9yW#s^&c@p(4B}F1ubY?GA+omA9Xe0vuH%1twk`6B11u$BlDrg8sU0JIAv|p zvX*LDxR0tK#W<&nk%KLy92mQD(7_y>$;uf4=r_PT0@KhAoIGmkq-7jT*07hqD7gWC z6&Z?-fL}#M!>9wBSOhk)@Pbgrl2ArNC}RU|WzaJkv}jkY2p*K1HQy#n>^rc*K6B35 ziDBE4Lb}j+b?i34w*&CC8WHHy*aV7T6DTqYeb>P@x5sweM#Zvw`n;ZuE#@59ei8|n@CxMlq=xcy=gNZuEayjYP!ENGraCL z!5>ZK`l92v}vV#Di*xd8JUHT?h}VA!ZivO6*d2P^FkzN|oEC@SsXD zvu6a#lZ}9_A0@)S=E6UpiGO^nMwi%);z2b!kBt)l8}K0b$AbcunFUnSE`zuAlE-aG@Sq@Jeu4iPyC5DE{LC!y zzigMngDS<$QmWQ2g$GrNnWa>NT?!AX6f;YyCc6|KR4HbbQhV)Ecu=L7nWer$Zb7ms zv>U>MYA9eeG@=9#!c6g?z++|suiP$$2UUuhrBfAlDLklB%q*oQ*`@HHN-;A_{c?tk zsxRFX#5%-+>6<>0VF7$!p&2YP@uPs4hsHoPT-BpWT))mK`5!~vS?jQWWl!A6T z_>;IT0!9S-8Ej10sNt3a{1!^M?EovGMq`s$3Sn*1B2z{wa}(^=+*E)j((`_3fY?Lo zIjDphE)FAVJ+FtpGa4qULt4)}p!Q76r|Uz{$)j}3Oa)E*zH~jjOUnsnYA{o2IpYJo z2rCD=wK7v-4KWC7h(YtY{f2KYB8u_NMMMd{xrorPh`>h1y`zZTQZ@gQXspsd>iFm% z^f~FDDEa80DEY92ijt51L9^p(E(iUK_@s+D=wgI+HFPrv-HgDv#x+b1`pM-1x5jeN z(Fl!g*ci*13_TAPH!c$z^xQ^07Z0oT+%=P_Q>=LX)POn@N z(B{A1^}AVsrSC~Dp-K|1pO&6#`Ty7Po2-BTD)!f8@cj#}UuW*pU@GNCC>DWO?Bj|E zdK+ZYi3oEE+C9#a=bK&VXb`;treo~sMc>Ms4bEtOCVmabr$t9#4e?f9HmWjj5VH^r z=}1Lh7?yRWMEh#ddkf9CStfc35AWvFqbuFK3icIA8TpJkP@d%s_mk3wYCWIH2*!%2 z6DS1B2J8_5^Y3sNGSBm~zE|>003@$y+kT2~dg2YY|JlEj-Tt@uzia;5!57ddw(4Db 
z{_Go}xRbD9sc3cgCwb-OAtwoE&%_}1Juo$_iN<10G$Gr}&H_ljFG5-Ep{!cmoy;{? z_$aKch*$_~tAVa^pFkKec2Ep3U~Cy+ReLp9*GJeguo>{@V8BPQdaV`z2mc@hD+I6d z4}v2Am=*R@z!|~iM}obNhKCwYQR5^xR)IJ{ZCUv-x?<($O1IdYmZ^}&@jY3jc^F_k z|FZB0UEkS;02YP5vXjjdm@UL+L4yUY6*N`APGN-Mkx3H-SU*lj(DVSQHiiKvWR(ud zLDLYd0JKNY4gvdvkva&KT?vVK4Jv~y=y?k&yF)4iT8unAArb>C8^_8fv9hTSuFTWu zB}D6HgbT~)^k>XNoG~&HI3Y*`JD8$~U^w@Uc-kJ<%}#=i(xI_~p#Kdw~Kxl^m)kA~})elV~Ou!1ByOfjiF{+Qc>k4|QAmixxp-pG8{Rx8cM? z*AY7Lj7=9m5mWh@zz=k6CY(Pl#}aY#8Ji8ymRQ0nqQf@I{GSo~|lc zV>J>h1+ONIk~HSjZQ!-|wpeZ2O@w?)R66scX}hJ64; z$&`?9t9p?L9#G8xAw1b9ga`BP{}7(Q3E_cA{Xc{!|Ag@5{~yAGOJ3v1XDFEVVDN0Z zPLG6O8VwG7J5~m+Nk;paOaA`1_75~##9=Uw3=xi~hZpN@JFtgKw=i%=IN51!AB$Vr zUP6ck6V^T;`>mLMG1sv*wp)|rA*-Y-n#Z-Q35!GV<2xAg;YNdo6y4XzFRn+V1=J>v zV>16yI~>*IcgXiX8@9dwP^p!?JeG{MY`3+LZ-q*R_Zpg9Dsz8H$hQhilWr#=-+Vo5 zHJ@bh!*19fY+M0hUrD+Gk9*+=Slp=Ukc?K~3}+?7jB5G3p_NU-tTxcQB5OB&7Xh=L zwU$q^l%m(7?MJ5;kE~93;0D%Xu_bY_Lv^2$vxQBi_0SD6%q#$+?w!?Z2M( z!O>Tn+aZa}n_2P62r<`maX%*RpVa>lB^@um$5MT{an40X&I9QoYL=Hnv%CtL<*Yut;^A$*Yc5_~PEasWk>N+rEds8nWy(okYFL zo@)NNxPQ4;YwIWEk3$#m;Jh_mTtY2wg4?e-bC?bZ3mC2ReNH;09cG?N$M%{V z&`Adc5{)1<>8ZUrdlzRc+mX1v`DKmkA9fKxFF6f|k&aabEc zL6cUx!`p}zw58HHvz<`)7+1s=(*X+FQt9CqfYKcDVL!`$4N#mW1$U|#KtcCS1&X6k0Thgh33jYN zL6cUVk5rvZ2PkOLO5ft70~EAnr3-RIDiIw{TZm~&*jNu>K)?4Y7;NRu!|_}b&%Hm$ zbpZ-;U0`5QfVCE6F_sAJL-E3{Hht#O$l_{BJ*37x0*A;zFYc zE8+vr>R5RpLFboEQej*ku9b5`zO_lJ%O+$~CnLi=Rj%a1Q%C)SK^I}VAbW*lwogOhZ6YjXP47Ux9$UNp$Zk7A&Fxdjn6~u_@*RM@)%*~v z!URnaD+%j|wvnv<4Ui^S+;Fq>3;1-lBKAKwOaBLW>S!OS-h~Ax7Tz8M2PB%9j%Y41 z&yhve$+#89QqXgL0B)K z!EuQN{lQ9r3dbcD_(QEsnj+li06CHikAEg*;r-3>G$4?rUZ?QAKXXq_t&@4$s|K zIwoz4z!eiYDnA>x+R=TxocwG*w#4)8hJceJ5x7#O?c45Nf|HL0EOs}c<4G&z@LXBi zy$s<=7(xRDyMHdNakAXAdmTdmRR*L3C}@U)4L9!}OZP|Sf<7y3lERLPtc{x`G2Ga~ z&w~TPV+lD68wv-&jr81+fic_w;Y$QAOKqcieKT-_a}{dUsr&ZWKv=l3QK1&oz>R~0 z_f!|T(gG|_*KJWmBbt9MPi_cq92__@iMWA=Y~jXIxLF}^BO*u?Z}4qOU+UU6ZW>~^ zv8AyG2ZYCxx*j?JZbV@faumT5!;LM5fdckRr2{whz)iM7TB{9&g&P|cBAiG!4i45R z+(6b5H%`}W*=U3}12?p732q!5I6{)Rfv#`i#*vQ1jnXG@HyC=*|^yd!;P&T zcyK^?EH%KwMvQKx=R%7c92^jgSpHTg9k_u->GFWW4WzFE!orP>3K5#bje~>D3O5j@ z#EsK+TUgUc+dp@VjT;9Cj?^Y@un4qpBfzSLfE$t3YAL1k6Rw_Y+%&~-100C{;lTmn ziKV;f0CXcg7gR{G7~R;43Qi+pHGT;EXad5C*;fCpe z#p(JEE1mvT{<(MBxN&fx`XUtqH#Xl6VckX$BptZ61q%Gkh5;*@Od8*DT?IBk;kYFB_wWia!E?=Y zHTF+8(+cm>feu3#!u;oYX$99y_?Gn&)=it;>!oDs8JuTkK~SpwQI~$C?B_9Q%C#zX z8ens#={kFV&_68;-VDpk&aKKu8{zH=_-@-12jAxixj@iF7j-M5mKF+-ExWWV zJ6%go^yDvzdNLoOb!>b3N%X{7lDo7&b$$YNS6lxuUHXTjt#C~Luzs)RffkScVZCtl z4=W)~|4_<@&@vXfSS`AhkC5+>nlDTT!Q0A5$mgs&Z_W<+)LJv9e8{TvX3RTV`EV~c zDPUvDN62?njy=(iEz{Hj675-vha|Y$BO`#MvC{2TZYaD2-FGV_eJ>Hcq5SFRyC{)PQ#X}VWm53eP3=a7my@yD#^^B0u)_ag2NKr}-abSsV)&r#2dWPE=K%~U<3}`+a z2@5GU4r2P3(|M;oOUJPKW~)~r+?eduF{}(OJqrLqzFO(l=ux2qQk3n1lwrwTqys4+ z4j4qB*bphUf)VmL3J;NjUP~d^ViiJ}A4pN!4RK)A$#lXH%kStpL`qD-fXc&>u#jTo zAf|XZ)b6xrDHm4XY`rQJ-UXz%l?y9_(=aK;+3Rx5B1DQZhLAF(m;eBzfHmnV=wi%+vsos!LAIi!$rOd%P#&S+Qqpq z9)2rt2DB8}`t9i1Kr4B+`8rgJjs_fYQJ{MQT@y~xE#Y?5s6}3prq$^HY}Vpv!RPd3 zD3p^0JfnyMeMm2{PxV#iX`zDcMkF`X2X5)()Ac@>y}?WzJC!pteRlG=h~8&E&No=+ z6HtNcMBw&N7S>r1%^+%Mhx?Q0Beu?Qe1D0=`rSw|o}3TQH(0~RVz+7nHZT!`2=R_Ki{atj(XVee-l{67Oo2{S;?3*}EeCuSJpnZ(W5$XG&#Wlnr z1CC?;#sq^MhWp3qG69Od!0s6?c?$XGE|GSjZ`h?pK^M_#7gy#Km5%9%IjgF58Bw^M zZHsa&nxMvnaD0Jw986x=DlAnBZ7dN8mCB^n0;tl~8a(TCR2kdxj+rg6-K9cS7~%DR zzJcdJsV&(x#GnV1R#T3thn&p?B_UDy(UdK7m6Zdg1si%;6D;V_hT`-}#fw3YrmS?Q zshf!q$2wpc7R0iowgtBU`VA7=3eej%#GnV1R+Umxr2~2wlpg49;fTc-WMHU>B1dqK z^}Yo?+EA=o?RLbVM^je1(-fh%MUl16(|xc%S#eYRb1`jkoEx!VM6|K$7#Fwe3;DLi zK(s`g)sB1ZY=rl_m(RlgWzQCDI{c+^PJxkgHw?A+Wzv`ik1HL1u2{*L`0U3@?n@(l 
zJ|3|lhUprfhFn9kJqx-H^N;6nxWCthPH5qMq41GV_lkd zb@*(-ZZf_;#_k##7>x#QhbTCi9X8MT64(0CYG3yMQYQqAo>TH6_h9*lW6(Z2S%LWh zDi=ru5L<#;^+z~j{}n*ytg{^ZF8Ak0faBeS{Xa}*=nH=C`(bin1BG)2JI!hX#h$|< zWpkBn_*-!dY#r2tMrR_1EF&z!Wagy(!&p3tRXlRN5^dOr*k`9x!Mm77jYE> z02i=ur`SAO#8h4$uQG;4z@>z%2HXEy9kGM;U7X8S@Blmav=YE+S#T7b1QGzA41R*c zCW_U&QB@)YZ2%L4*qqjt2kyC@o8y5Zc>m$9fMMFda4KWIgj73`Ab_6H* z0M`uRYoDI60O0?p7QR3Rih(8)4N}bhV?@4fLl?rm)EJyl#`~d+HKB|TwfaRw1|T*3 zW!r2}joc>aTiHhLb|ZJIk-N<(s5SbmF#4=E`fS!RR_XrIT6{hO2gDaHlD?P6kz3&7 z2aR|lxZnx!=+&is{(Z&sQ_hHP z3}rOnAPl4zgh46faNLaZ^nC_$S^mF?;QNa3f2weKK_!jJyHDA zZ^ES*jBzsL*8T8-NjZMJ)5UUKo*A}6lY)E@TE-RMbgv3zTEd`WphkKPN2dqO4^*;- z0D+8eswyc>R>_N!yH_4f!{7;?njT|dOCA%7j2#z(|C@g<4qn@P8X;d3)?xu2VfXM2 zwE)M3#XAq4x(J$!%**hBI|LZWP|b>6T8`&Y_*CFC4gi+K0DR7h-`Sy;-Hrp`9xHMv z<~VyFK{tQ~Myde}K*gi!bgXvuDc^j6N$!Tfo2=m{r1Hd=sX9T!y*oDCDZb&q5xa01 z@6hAy<23vwYk{*_vL62zqa=EJ9A=Wc$HmDS{+(2wxX0BeXt-a;hBM> zr{QiVX!y^JpXu%KubAZS@fFD$eu*H46ZiNDd6yj3MI9SHJHFxN06t-lC4Oy^hO-tQ2M03*iiN3CuD`!wOm~A=F=K2+!Tj^@I4jW0##8Ei{CD=<+zH zgD`=J01ItF!#TBbSPe(PkG=x2AKkDu!pYkWn~HR+qQJ%hX~(;@&1iIrhcb-yJpsl_ z7)x7ad2(FxmDz?4j(CqXy0rxXqauD|VXcfdT+pkP^xoJi9KW#&KB#_U>5S;>vX7=A zVjE*_d*gWOSs&`~Y|nz5Q7f0IT%KAge*9=dA^+bRKNmV&;}<`69mkFFI3r^*Y!fhS zQ!s4PN^smkgRRLuREmrkh}aI&+i3Wj=r|u65%FZ&%HYb_8zEK!25bxtR`J`7)!Yvd z5>j&2Yq0(KnnGF4p{#u&2Dyb{%!5Y=F>kaGdwmo>Qypbm8$T1&NWlkGD=@X97MTU6 zUev4??bM4};;4&I2sJ61j-LdHkQgI-J7gn|)HCogj$9(Bzcjo=D|$~FeqS$It{1J) zi&n-XjSUMa3N}s#ZEQFJ!nDYl2+ITS`oApci+_#$x3RD8|4U0>(FOz&dXa+yE=+C1 zs2s)cBD`p)R@AJ)SI}4IT}8&x?+{G-n~HH<5X_krbOjbQBf#CTO7+AS#rR;|hp1yC z!2aT2${R_wBZnfU*b;m;YTn49^I#2_uIJY3x!}tfG3 z^fv$Vm;NrF_z4YYfN2Lt4K2Jp6y5{7?0lGQ7&nx`LU)G>8jONw1MU-lf|!H(wiMQ# z($xTIIY8S3s|dn{h6;}2a^tF3^t#qScv%^aL*h;jy>55FC^&2s910bDi5zjD%|dNz zeH(;f&J+GLyq3Nk@X@W=OJfU`@kRky*CM@umZb&wtUX?2Q(9_j4kQX_8bn0Xsy}q( zKXTR*4ZdCikV4`4dcodM_&vQ(Bd&W+@gd4Iv4Rgg0}wK)ZpT0btJE5IY8l6>n|)O) zaBVVjG2VK?UKXnUAVteq67$antT4ZErvqNG7PICajT|&IB4;9+6jqvpGeZUQLw#^o zxgpf27UzAq!c<|~;inPG!eQhyAzsok)v?%-+DjGwSigVn8K_Awfc+56U@&7Ox$GOU z`$1>!%BC6taeF3}mT#qi0t~Nt#W`wfor5J&55Zz`L;x|&^KsS42^Z?Tm_*aGaCtdz zB|((m*Rz4wmYyElff|3TQ!~wf5~gq(*4(s*)Yrd^+`O=alli26txGDri0D=OTNFW?e zGGaaGThIL{#&J)Y*Sh65SfERO>xjupuDFN>`Tt(1@c&77*6X=jzS{0cb?nZ1bf+Os z+pwdChq8{5*`G~j&wm^FZ!7=p;=coDt1#0Dm5k8R9m7m-a5H~fyNZ*#w)Wy-UmpKL z@O>(FRPZz^R2dt<8Fzu5W1FyaD^);z-XLV(kV3pU+~G|VQgQ5z zHMxX@iMBqlgha?c2Trzgj%acnJ*IbiS-7?=yj0I!q)|+muu->Uc>E`{H!+n&K4`B~ zBJh_HeW3ka*&cvw$X$YPy`ce1z!||7wP;$t9{x~=F%|8fHtp~l2Z!=V| zL@RjD@Ey_HB&aoD2J3R zdU7*F$U&pvh!P~8F^KWEcje%D5;%7OIvYYl))A&%9#|F;h`-116C7zoe{lPgDJj7b zn6tra5wQo%H2meYTJ*e8NFHVsVAE+=x_Qs{a0h+e(bUSEqD_#y26kA&drHGQg6+NF zg|on?4cJ_Ta}EpBZ@4k|qu`Cfn}VYmA87%Q&GLN1=YRkG8w3Bwz`rr@Zw&k!11Ez4 zUrMKxZ=HYP)If@URp`<{%COsRoqStm;Hum1!;i}+-Zg3RZTH-B+nB(x@wZR7xw2~V zZAcz*+vM?M$KN(4w_D1PiIssV&dgSLkp39J^4;fK*&9Jh-HF&;uh^s&> zM~WN7@~T_z9Dgf859HrGcI@~Gw-unE^*89w@e{_NA2&}J6S&ps*4T-Y1NV-XUWKkZ zuXNb(6z#&x&l}2i_3O)+aGqVmzrX)q!~m9cykoQd=s&+~@2oxW$8IlQ-ZS&%d&WL~ z_|jp27_w={g0iZIK3&=)ymn5_g~PtN|GVc-d%AFJ`t0(&%-Pc($?5UzxW_krGIZ~! zy)La8(|h1u+j`x-_ip4*UA}70^1fGm^Tiu~kyEpJ?x0Dde*4}VzIRT4?a@``4b>AK zJd_%I_}1aK1@>nD_0M0T^#hk5Z6CaQU;95V9RHi__a^#)D2qm`C{y-0$e<=U>Hg z%i1b|Zr^}oBznK5X{N=#! 
[base85-encoded GIT binary patch payload for the committed binary files; the encoded data is not human-readable and is omitted here]

literal 0
HcmV?d00001