From c0a67e0a23b796861fd733e2beb8be517a8f23b1 Mon Sep 17 00:00:00 2001 From: Indeed Miners <32953696+IndeedMiners@users.noreply.github.com> Date: Sun, 6 Jan 2019 16:49:08 +0100 Subject: [PATCH] 2.7.1 --- doc/compile_Windows.md | 9 +- doc/img/interleave.png | Bin 0 -> 12549 bytes doc/tuning.md | 55 ++++- xmrstak/backend/amd/amd_gpu/gpu.cpp | 218 +++++++++++++----- xmrstak/backend/amd/amd_gpu/gpu.hpp | 22 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 157 +++++++------ .../amd/amd_gpu/opencl/fast_div_heavy.cl | 12 +- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 93 ++++---- .../backend/amd/amd_gpu/opencl/wolf-aes.cl | 72 +++--- xmrstak/backend/amd/autoAdjust.hpp | 52 ++++- xmrstak/backend/amd/config.tpl | 22 +- xmrstak/backend/amd/jconf.cpp | 53 ++++- xmrstak/backend/amd/jconf.hpp | 2 + xmrstak/backend/amd/minethd.cpp | 73 +++++- xmrstak/backend/amd/minethd.hpp | 1 + .../backend/cpu/crypto/cryptonight_aesni.h | 10 +- xmrstak/backend/cpu/minethd.cpp | 17 +- xmrstak/backend/cryptonight.hpp | 18 +- xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp | 6 + xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 39 ++-- .../backend/nvidia/nvcc_code/cuda_extra.cu | 27 ++- .../nvidia/nvcc_code/cuda_fast_div_heavy.hpp | 29 +++ .../nvcc_code/cuda_fast_int_math_v2.hpp | 28 +-- xmrstak/jconf.cpp | 14 +- xmrstak/net/jpsock.cpp | 3 + xmrstak/pools.tpl | 2 + xmrstak/version.cpp | 4 +- 27 files changed, 737 insertions(+), 301 deletions(-) create mode 100644 doc/img/interleave.png create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 1b5787879..8fe4dcf53 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -30,15 +30,16 @@ - CUDA/Runtime - Driver components -### AMD DRIVER/APP SDK 3.0 (only needed for AMD GPUs) +### AMD DRIVER/OCL-SDK (only needed for AMD GPUs) - Download & install the AMD driver: https://www.amd.com/en/support **ATTENTION** Many Windows drivers 18.5+ create invalid shares. If you have an issue with `invalid shares` please downgrade your driver. -- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe - (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) +- Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases + +Do not follow old guides saying that you need the AMD APP SDK. AMD has removed the APP SDK and now ships the OCL-SDK_light. ### Dependencies OpenSSL/Hwloc and Microhttpd - For CUDA 8*: @@ -115,4 +116,4 @@ If you have an issue with `invalid shares` please downgrade your driver. copy C:\xmr-stak-dep\openssl\bin\* . ``` -- Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD APP SDK is installed) and CPUs. +- The miner is compiled by default for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD OCL-SDK_light is installed) and CPUs. 
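Before tuning anything, it can be worth verifying that an OpenCL platform is visible at all after installing the driver and the OCL-SDK. Below is a minimal, hypothetical host-side probe (a standalone snippet, not part of this patch; the `clinfo` tool mentioned in doc/tuning.md reports the same information in more detail):

```cpp
#include <CL/cl.h>
#include <cstdio>

// Hypothetical standalone probe: lists the OpenCL platforms that the
// AMD backend of the miner would be able to see.
int main()
{
    cl_uint numPlatforms = 0;
    if(clGetPlatformIDs(0, nullptr, &numPlatforms) != CL_SUCCESS || numPlatforms == 0)
    {
        std::printf("No OpenCL platform found - check the driver/OCL-SDK installation.\n");
        return 1;
    }

    if(numPlatforms > 16)
        numPlatforms = 16;
    cl_platform_id platforms[16];
    clGetPlatformIDs(numPlatforms, platforms, nullptr);

    for(cl_uint i = 0; i < numPlatforms; ++i)
    {
        char name[256] = {0};
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, nullptr);
        // the printed index corresponds to the "platform_index" config option
        std::printf("platform %u: %s\n", i, name);
    }
    return 0;
}
```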
diff --git a/doc/img/interleave.png b/doc/img/interleave.png new file mode 100644 index 0000000000000000000000000000000000000000..a4b59b282e4c24606d2b9a026fc8e6233a65e01b GIT binary patch literal 12549 [binary image data for the new figure doc/img/interleave.png omitted] literal 0 HcmV?d00001 diff --git a/doc/tuning.md b/doc/tuning.md index 2673d68d9..6d07d4ddc 100644 --- a/doc/tuning.md +++ 
b/doc/tuning.md @@ -10,6 +10,7 @@ * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) * [Two Threads per GPU](two-threads-per-gpu) + * [Interleave Tuning](#interleave-tuning) * [disable comp_mode](#disable-comp_mode) * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) * [Increase Memory Pool](#increase-memory-pool) @@ -83,13 +84,13 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to ``` "gpu_threads_conf" : [ - { - "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, - { - "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, ], @@ -107,19 +108,49 @@ Therefore adjust your intensity by hand. ``` "gpu_threads_conf" : [ - { - "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, - { - "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, ], "platform_index" : 0, ``` +### Interleave Tuning + +Interleave controls when a worker thread starts to calculate a bunch of hashes +if two worker threads are used to utilize one GPU. +This option has no effect if only one worker thread is used per GPU. + +![Interleave](img/interleave.png) + +Interleave defines how long a thread needs to wait before it starts the next hash calculation, relative to the last started worker thread. +Choosing an interleave value larger than 50% makes no sense because then the GPU will not be utilized well enough. +In most cases the default of 40 is a good value, but on some systems, e.g. the Linux ROCm 1.9.1 driver with RX5XX GPUs, you need to adjust the value. +If you get many interleave messages in a row (over one minute) you should adjust the value. + +``` +OpenCL Interleave 0|1: 642/2400.50 ms - 30.1 +OpenCL Interleave 0|0: 355/2265.05 ms - 30.2 +OpenCL Interleave 0|1: 221/2215.65 ms - 30.2 +``` + +description: +``` +<gpu id>|<thread id on the gpu>: <last delay>/<average kernel run time> ms - <current interleave value in percent> +``` +`last delay` should go slowly to 0. +If it goes down and then jumps to a very large value multiple times within a minute you should reduce the intensity by 5. +The `interleave` value will automatically go up and down within a range of +-5% to adjust to kernel run-time fluctuations. +Automatic adjustment is disabled as long as `auto-tuning` is active and will be started after it has finished. +If `last delay` goes down to 10 ms, and the messages stop and are repeated only from time to time with delays up to 15 ms, you already have a good value. 
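For illustration, here is a minimal standalone sketch of the delay rule described above (a hypothetical helper, not part of this patch; the actual logic lives in `interleaveAdjustDelay()` in `xmrstak/backend/amd/amd_gpu/gpu.cpp` further down in this diff). A worker sleeps until the configured percentage of the average kernel run time has passed since the other worker on the same GPU started:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical standalone version of the interleave delay rule:
// sleep until `interleavePercent` percent of the average kernel run time
// has passed since the other worker thread on the same GPU started.
int64_t interleaveDelayMs(double avgKernelRuntimeMs, double interleavePercent,
                          int64_t msSinceLastThreadStart)
{
    const double optimalTimeOffset = avgKernelRuntimeMs * interleavePercent / 100.0;
    const double delay = optimalTimeOffset - static_cast<double>(msSinceLastThreadStart);
    return std::max<int64_t>(0, static_cast<int64_t>(delay));
}

int main()
{
    // matches the second log line above: avg 2265.05 ms at 30.2% interleave;
    // if the other thread started 329 ms ago, this thread sleeps ~355 ms
    std::cout << interleaveDelayMs(2265.05, 30.2, 329) << " ms delay\n";
}
```

The printed `last delay` is roughly this value; it shrinks toward 0 as the two threads settle into the configured offset.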
+ ### disable comp_mode `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices. diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 6e1c70b05..408cad97a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -18,6 +18,7 @@ #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" +#include "xmrstak/net/msgstruct.hpp" #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #if defined _MSC_VER #include @@ -43,7 +45,6 @@ #endif - #ifdef _WIN32 #include #include @@ -302,6 +303,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); + return ERR_OCL_API; + } + ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret); if(ret != CL_SUCCESS) { @@ -410,14 +417,17 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ strided_index = 0; } + // if intensity is a multiple of worksize than comp mode is not needed + int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0; + std::string options; options += " -DITERATIONS=" + std::to_string(hashIterations); - options += " -DMASK=" + std::to_string(threadMemMask); - options += " -DWORKSIZE=" + std::to_string(ctx->workSize); + options += " -DMASK=" + std::to_string(threadMemMask) + "U"; + options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U"; options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); - options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp); - options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 
1u : 0u); - options += " -DMEMORY=" + std::to_string(hashMemSize); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; + options += " -DCOMP_MODE=" + std::to_string(needCompMode); + options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; options += " -DALGO=" + std::to_string(miner_algo[ii]); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` @@ -699,9 +709,9 @@ std::vector getAMDDevices(int index) { GpuContext ctx; std::vector devNameVec(1024); - size_t maxMem; - if( devVendor.find("NVIDIA Corporation") != std::string::npos) - ctx.isNVIDIA = true; + + ctx.isNVIDIA = isNVIDIADevice; + ctx.isAMD = isAMDDevice; if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) { @@ -709,7 +719,7 @@ std::vector getAMDDevices(int index) continue; } - if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS) + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); continue; @@ -722,8 +732,8 @@ std::vector getAMDDevices(int index) } // the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation - if(ctx.isNVIDIA) - maxMem = ctx.freeMem; + if(isNVIDIADevice) + ctx.maxMemPerAlloc = ctx.freeMem; if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { @@ -731,11 +741,20 @@ std::vector getAMDDevices(int index) continue; } + std::vector openCLDriverVer(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + continue; + } + + bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos; + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory ctx.deviceIdx = k; - ctx.freeMem = std::min(ctx.freeMem, maxMem); ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; + ctx.interleave = 40; printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); ctxVec.push_back(ctx); } @@ -937,10 +956,29 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + std::vector> interleaveData(num_gpus, nullptr); + for(int i = 0; i < num_gpus; ++i) { + const size_t devIdx = ctx[i].deviceIdx; + if(interleaveData.size() <= devIdx) + { + interleaveData.resize(devIdx + 1u, nullptr); + } + if(!interleaveData[devIdx]) + { + interleaveData[devIdx].reset(new InterleaveData{}); + interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); + + } + ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ++interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].interleaveData = interleaveData[devIdx]; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + const std::string backendName = 
xmrstak::params::inst().openCLVendor; - if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) + if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; @@ -1116,11 +1154,108 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } + + if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } } return ERR_SUCCESS; } +uint64_t updateTimings(GpuContext* ctx, const uint64_t t) +{ + // averagingBias = 1.0 - only the last delta time is taken into account + // averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined + // averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined + const double averagingBias = 0.1; + + int64_t t2 = get_timestamp_ms(); + uint64_t runtime = (t2 - t); + { + + std::lock_guard<std::mutex> g(ctx->interleaveData->mutex); + // 20000 means that something went wrong and we reset the average + if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) + ctx->interleaveData->avgKernelRuntime = runtime; + else + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias; + } + return runtime; +} + +uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) +{ + uint64_t t0 = get_timestamp_ms(); + + if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0) + { + t0 = get_timestamp_ms(); + std::unique_lock<std::mutex> g(ctx->interleaveData->mutex); + + int64_t delay = 0; + double dt = 0.0; + + if(t0 > ctx->interleaveData->lastRunTimeStamp) + dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp); + + const double avgRuntime = ctx->interleaveData->avgKernelRuntime; + const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold; + + // threshold where the auto adjustment is disabled + constexpr uint32_t maxDelay = 10; + constexpr double maxAutoAdjust = 0.05; + + if((dt > 0) && (dt < optimalTimeOffset)) + { + delay = static_cast<int64_t>((optimalTimeOffset - dt)); + + if(enableAutoAdjustment) + { + if(ctx->lastDelay == delay && delay > maxDelay) + ctx->interleaveData->adjustThreshold -= 0.001; + // if the delay doubled then increase the adjustThreshold + else if(delay > 1 && ctx->lastDelay * 2 < delay) + ctx->interleaveData->adjustThreshold += 0.001; + } + ctx->lastDelay = delay; + + // this is std::clamp which is available in c++17 + ctx->interleaveData->adjustThreshold = std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust); + ctx->interleaveData->adjustThreshold = std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust); + + // avoid that the auto adjustment disables interleaving + ctx->interleaveData->adjustThreshold = std::max( + ctx->interleaveData->adjustThreshold, + 0.001 + ); + } + delay = std::max(int64_t(0), delay); + + ctx->interleaveData->lastRunTimeStamp = t0 + 
delay; + + g.unlock(); + if(delay > 0) + { + // do not notify the user anymore if we reach a good delay + if(delay > maxDelay) + printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + ctx->deviceIdx, + ctx->idWorkerOnDevice, + static_cast(delay), + avgRuntime, + ctx->interleaveData->adjustThreshold * 100. + ); + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + } + } + + return t0; +} + size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage @@ -1154,12 +1289,10 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - clFinish(ctx->CommandQueues); - size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { @@ -1181,64 +1314,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - clFinish(ctx->CommandQueues); - for(int i = 0; i < 4; ++i) { - if(BranchNonces[i]) + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - // Threads - cl_uint numThreads = BranchNonces[i]; - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); - return(ERR_OCL_API); - } - - // round up to next multiple of w_size - BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size; - // number of global threads must be a multiple of the work group size (w_size) - 
assert(BranchNonces[i]%w_size == 0); - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; - } + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; } } + // this call is blocking therefore the access to the results without cl_finish is fine if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - clFinish(ctx->CommandQueues); auto & numHashValues = HashOutput[0xFF]; // avoid out of memory read, we have only storage for 0xFF results if(numHashValues > 0xFF) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 63c5029d7..80fcbefde 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -12,23 +12,36 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) #define ERR_STUPID_PARAMS (1) +struct InterleaveData +{ + std::mutex mutex; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; +}; struct GpuContext { /*Input vars*/ size_t deviceIdx; size_t rawIntensity; + size_t maxRawIntensity; size_t workSize; int stridedIndex; int memChunk; int unroll = 0; bool isNVIDIA = false; + bool isAMD = false; int compMode; /*Output vars*/ @@ -40,8 +53,13 @@ struct GpuContext cl_program Program[2]; cl_kernel Kernels[2][8]; size_t freeMem; + size_t maxMemPerAlloc; int computeUnits; std::string name; + std::shared_ptr interleaveData; + uint32_t idWorkerOnDevice = 0u; + int interleave = 40; + uint64_t lastDelay = 0; uint32_t Nonce; @@ -54,5 +72,5 @@ std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); - - +uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true); +uint64_t updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..6a3def72c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -365,16 +365,16 @@ R"===( #if(STRIDED_INDEX==0) # define IDX(x) (x) #elif(STRIDED_INDEX==1) -# define IDX(x) ((x) * (Threads)) +# define IDX(x) (mul24(((uint)(x)), Threads)) #elif(STRIDED_INDEX==2) # define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) #endif inline uint getIdx() { -#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2) return get_global_id(0) - get_global_offset(0); -#endif } #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] @@ -401,7 +401,7 @@ __kernel 
void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); } - + __local ulong State_buf[8 * 25]; barrier(CLK_LOCAL_MEM_FENCE); @@ -416,16 +416,23 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif if (get_local_id(1) == 0) { __local ulong* State = State_buf + get_local_id(0) * 25; - +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else ((__local ulong8 *)State)[0] = vload8(0, input); +#endif State[8] = input[8]; State[9] = input[9]; State[10] = input[10]; @@ -474,12 +481,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) + +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin[8][8]; { - /* Also left over threads perform this loop. * The left over thread results will be ignored @@ -530,7 +536,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( - + // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) @@ -562,11 +568,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong b[2]; uint4 b_x[1]; #endif - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 #if(ALGO==11) +# if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; +# endif + uint2 division_result; uint sqrt_result; #endif @@ -577,10 +586,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint tmp = AES0_C[i]; AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -600,9 +607,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif a[0] = states[0] ^ states[4]; @@ -630,7 +639,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 ^= as_uint2(states[24]); #endif } - + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) @@ -638,7 +647,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0] & MASK; + uint idx0 = as_uint2(a[0]).s0 & MASK; #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) @@ -646,26 +655,26 @@ __kernel void JOIN(cn1,ALGO) 
(__global uint4 *Scratchpad, __global ulong *states ulong c[2]; // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) - ulong idxS = idx0 & 0x30; + uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) - ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else - ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif // cryptonight_monero_v8 #if(ALGO==11) { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); - ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); - ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); - SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); - SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); } #endif @@ -682,23 +691,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # endif b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 #elif(ALGO==11) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; idxS = idx0 & 0x30; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; # endif #else b_x[0] ^= ((uint4 *)c)[0]; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); @@ -713,28 +722,32 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 // We drop the highest bit to fit both quotient and remainder in 32 bits + +# if defined(__clang__) && !defined(__NV_CL_C_VERSION) division_result = fast_div_v2(RCP, c[1], d); +# else + division_result = fast_div_v2(c[1], d); +# endif + // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); -#endif + ulong2 result_mul; result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0); result_mul.s1 = c[0] * as_ulong2(tmp).s0; -// cryptonight_monero_v8 -#if(ALGO==11) - { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; - ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); - result_mul ^= chunk2; - ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); - SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); - SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); - SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); - } -#endif - a[1] += result_mul.s1; + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + result_mul ^= chunk2; + ulong2 chunk3 = 
as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); a[0] += result_mul.s0; - + a[1] += result_mul.s1; +#else + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) @@ -742,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; @@ -755,7 +768,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; - + // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) @@ -765,22 +778,22 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[1] = b_x[0]; #endif b_x[0] = ((uint4 *)c)[0]; - idx0 = a[0] & MASK; + idx0 = as_uint2(a[0]).s0 & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = fast_div_heavy(n, d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = (d ^ q) & MASK; -// cryptonight_haven -#elif (ALGO == 9) + idx0 = (d ^ as_int2(q).s0) & MASK; +// cryptonight_haven || cryptonight_superfast +#elif (ALGO == 9 || ALGO == 12) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = ((~d) ^ q) & MASK; + idx0 = ((~d) ^ as_int2(q).s0) & MASK; #endif } @@ -810,12 +823,12 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif - + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -825,9 +838,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif #if defined(__Tahiti__) || defined(__Pitcairn__) @@ -847,8 +862,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || 
cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; @@ -861,11 +876,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states if (gIdx < Threads) #endif { -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) #pragma unroll 2 for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) { - text ^= Scratchpad[IDX(i1)]; + text ^= Scratchpad[IDX((uint)i1)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= *xin2_load; @@ -875,7 +890,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states *xin1_store = text; - text ^= Scratchpad[IDX(i1 + 8)]; + text ^= Scratchpad[IDX((uint)i1 + 8u)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= *xin1_load; @@ -892,7 +907,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #else #pragma unroll 2 for (int i = 0; i < (MEMORY >> 7); ++i) { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; #pragma unroll 10 for(int j = 0; j < 10; ++j) @@ -901,8 +916,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif } -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) /* Also left over threads performe this loop. * The left over thread results will be ignored */ @@ -971,7 +986,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u const ulong idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1019,8 +1034,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } } + } mem_fence(CLK_GLOBAL_MEM_FENCE); } @@ -1052,7 +1067,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1106,7 +1121,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1182,7 +1197,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1238,4 +1253,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global } } -)===" \ No newline at end of file +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl index 21268fd78..161f2f55d 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl +++ 
b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -6,23 +6,19 @@ inline long fast_div_heavy(long _a, int _b) { long a = abs(_a); int b = abs(_b); - float rcp = native_recip(convert_float_rte(b)); float rcp2 = as_float(as_uint(rcp) + (32U << 23)); - - ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2); + ulong q1 = convert_ulong(convert_float_rte(as_int2(a).s1) * rcp2); a -= q1 * as_uint(b); - - long q2 = convert_long_rte(convert_float_rtn(a) * rcp); + float q2f = convert_float_rte(as_int2(a >> 12).s0) * rcp; + q2f = as_float(as_uint(q2f) + (12U << 23)); + long q2 = convert_long_rte(q2f); int a2 = as_int2(a).s0 - as_int2(q2).s0 * b; - int q3 = convert_int_rte(convert_float_rte(a2) * rcp); q3 += (a2 - q3 * b) >> 31; - const long q = q1 + q2 + q3; return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; } #endif )===" - \ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 2c1b13865..c170387b4 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -42,6 +42,9 @@ static const __constant uint RCP_C[256] = 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, }; +// Rocm produce invalid results if get_reciprocal without lookup table is used +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) + inline uint get_reciprocal(const __local uchar *RCP, uint a) { const uint index1 = (a & 0x7F000000U) >> 21; @@ -66,63 +69,61 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a) return as_uint2(k).s1 + (b ? r : 0); } -inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) -{ - const uint r = get_reciprocal((const __local uchar *)RCP, b); - const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; - - ulong q; - ((uint*)&q)[0] = as_uint2(k).s1; - -#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14) - /* The AMD driver 14.XX is not able to compile `(k < a)` - * https://github.com/fireice-uk/xmr-stak/issues/1922 - * This is a workaround for the broken compiler. - */ - ulong whyAMDwhy; - ((uint*)&whyAMDwhy)[0] = as_uint2(k).s0; - ((uint*)&whyAMDwhy)[1] = as_uint2(k).s1; - ((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U; #else - ((uint*)&q)[1] = (k < a) ? 1U : 0U; -#endif - - const long tmp = a - q * b; - const bool overshoot = (tmp < 0); - const bool undershoot = (tmp >= b); - - return (uint2)( - as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), - as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? 
b : 0U) - ); -} -inline uint fast_sqrt_v2(const ulong n1) +inline uint get_reciprocal(uint a) { - float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + const float a_hi = as_float((a >> 8) + ((126U + 31U) << 23)); + const float a_lo = convert_float_rte(a & 0xFF); + const float r = native_recip(a_hi); + const float r_scaled = as_float(as_uint(r) + (64U << 23)); + const float h = fma(a_lo, r, fma(a_hi, r, -1.0f)); + return (as_uint(r) << 9) - convert_int_rte(h * r_scaled); +} - float x1 = native_rsqrt(x); - x = native_sqrt(x); +#endif - // The following line does x1 *= 4294967296.0f; - x1 = as_float(as_uint(x1) + (32U << 23)); +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) - const uint x0 = as_uint(x) - (158U << 23); - const long delta0 = n1 - (((long)(x0) * x0) << 18); - const float delta = convert_float_rte(as_int2(delta0).s1) * x1; +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); - uint result = (x0 << 10) + convert_int_rte(delta); - const uint s = result >> 1; - const uint b = result & 1; +#else - const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; - if ((long)(x2 + b) > 0) --result; - if ((long)(x2 + 0x100000000UL + s) < 0) ++result; +inline uint2 fast_div_v2(ulong a, uint b) +{ + const uint r = get_reciprocal(b); - return result; +#endif + + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + const uint q = as_uint2(k).s1; + long tmp = a - ((ulong)(q) * b); + ((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0; + const int overshoot = ((int*)&tmp)[1] >> 31; + const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31; + return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b)); +} +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + float x1 = native_rsqrt(x); + x = native_sqrt(x); + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + as_int(b - 1)) >= 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + return result; } #endif )===" - \ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index 50e861e23..b99b62d5c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -74,42 +74,62 @@ static const __constant uint AES0_C[256] = #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) -inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k) -{ - x = ~x; - k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)]; - x.s0 ^= k.s0; - k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)]; - x.s1 ^= k.s1; - k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)]; - x.s2 ^= k.s2; - k.s3 ^= AES0[BYTE(x.s3, 0)] ^ 
AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)]; - return k; -} - uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key) { key.s0 ^= AES0[BYTE(X.s0, 0)]; - key.s1 ^= AES0[BYTE(X.s1, 0)]; - key.s2 ^= AES0[BYTE(X.s2, 0)]; - key.s3 ^= AES0[BYTE(X.s3, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; key.s0 ^= AES2[BYTE(X.s2, 2)]; - key.s1 ^= AES2[BYTE(X.s3, 2)]; - key.s2 ^= AES2[BYTE(X.s0, 2)]; - key.s3 ^= AES2[BYTE(X.s1, 2)]; + key.s1 ^= AES2[BYTE(X.s3, 2)]; + key.s2 ^= AES2[BYTE(X.s0, 2)]; + key.s3 ^= AES2[BYTE(X.s1, 2)]; key.s0 ^= AES1[BYTE(X.s1, 1)]; - key.s1 ^= AES1[BYTE(X.s2, 1)]; - key.s2 ^= AES1[BYTE(X.s3, 1)]; - key.s3 ^= AES1[BYTE(X.s0, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; key.s0 ^= AES3[BYTE(X.s3, 3)]; - key.s1 ^= AES3[BYTE(X.s0, 3)]; - key.s2 ^= AES3[BYTE(X.s1, 3)]; - key.s3 ^= AES3[BYTE(X.s2, 3)]; + key.s1 ^= AES3[BYTE(X.s0, 3)]; + key.s2 ^= AES3[BYTE(X.s1, 3)]; + key.s3 ^= AES3[BYTE(X.s2, 3)]; + + return key; +} + +uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key) +{ + key.s0 ^= AES0[BYTE(X.s0, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; + + key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u); + key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u); + key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u); + key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u); + + key.s0 ^= AES1[BYTE(X.s1, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; + + return key; +} + +uint4 AES_Round2_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 X, uint4 key) +{ + key.s0 ^= AES0[BYTE(X.s0, 0)] ^ rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u) ^ AES1[BYTE(X.s1, 1)]; + X.s0 ^= key.s0; + key.s1 ^= AES0[BYTE(X.s1, 0)] ^ rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u) ^ AES1[BYTE(X.s2, 1)]; + X.s1 ^= key.s1; + key.s2 ^= AES0[BYTE(X.s2, 0)] ^ rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u) ^ AES1[BYTE(X.s3, 1)]; + X.s2 ^= key.s2; + key.s3 ^= AES0[BYTE(X.s3, 0)] ^ rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u) ^ AES1[BYTE(X.s0, 1)]; - return key; + return key; } #endif diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index c5b331c87..ba4cebb7b 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -134,6 +134,13 @@ class autoAdjust ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + // true for all cryptonight_heavy derivates since we check the user and dev pool + bool useCryptonight_heavy = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy; + // set strided index to default ctx.stridedIndex = 1; @@ -144,19 +151,36 @@ class autoAdjust // use 
chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` if(useCryptonight_v8) ctx.stridedIndex = 2; + else if(useCryptonight_heavy) + ctx.stridedIndex = 3; // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; - // keep 128MiB memory free (value is randomly chosen) - size_t availableMem = ctx.freeMem - minFreeMem; + // keep 128MiB memory free (value is randomly chosen) from the max available memory + const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; + + size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); + + uint32_t numThreads = 1u; + if(ctx.isAMD) + { + numThreads = 2; + size_t memDoubleThread = maxAvailableFreeMem / numThreads; + memPerThread = std::min(memPerThread, memDoubleThread); + } + // 224byte extra memory is used per thread for meta data size_t perThread = hashMemSize + 224u; - size_t maxIntensity = availableMem / perThread; + size_t maxIntensity = memPerThread / perThread; size_t possibleIntensity = std::min( maxThreads , maxIntensity ); // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; + // in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units + if(numThreads == 2) + intensity = (possibleIntensity / 8) * 8; + //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity if (intensity == 0) { @@ -166,18 +190,22 @@ class autoAdjust } if (intensity != 0) { - conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n"; - conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n"; - // set 8 threads per block (this is a good value for the most gpus) - conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true\n" + - " },\n"; + for(uint32_t thd = 0; thd < numThreads; ++thd) + { + conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n"; + conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" + + std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; + // set 8 threads per block (this is a good value for the most gpus) + conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " },\n"; + } } else { - printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str()); + printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / 
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 421e0ed4b..26a8ef48a 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -7,6 +7,8 @@ R"===(// generated by XMRSTAK_VERSION
 * worksize - Number of local GPU threads (nothing to do with CPU threads)
 * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
 * strided_index - switch memory pattern used for the scratch pad memory
+ *          3 = chunked memory, chunk size based on the 'worksize'
+ *              required: intensity must be a multiple of worksize
 *          2 = chunked memory, chunk size is controlled by 'mem_chunk'
 *              required: intensity must be a multiple of worksize
 *          1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
@@ -20,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
 *             to use a intensity which is not the multiple of the worksize.
 *             If you set false and the intensity is not multiple of the worksize the miner can crash:
 *             in this case set the intensity to a multiple of the worksize or activate comp_mode.
+ * interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
+ *              This option only has an effect if two compute threads are using the same GPU device: valid range [0;100]
+ *              0  = disable thread interleaving
+ *              40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
 * "gpu_threads_conf" :
 * [
-*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
+*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+*	  "interleave" : 40
+*	},
 * ],
 * If you do not wish to mine with your AMD GPU(s) then use:
 * "gpu_threads_conf" :
@@ -34,6 +42,16 @@ R"===(// generated by XMRSTAK_VERSION
 GPUCONFIG
 ],
 
+/*
+ * number of rounds per intensity that are performed to find the best intensity setting
+ *
+ * WARNING: experimental option
+ *
+ * 0 = disable auto tuning
+ * 10 or higher = recommended value if you don't already know the best intensity
+ */
+"auto_tune" : 0,
+
 /*
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
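Editor's note: a rough model of the new `interleave` option documented above (assumption: simplified; the real logic lives in `interleaveAdjustDelay` and works from measured kernel runtimes). With two threads on one GPU, a thread delays its next enqueue until the configured percentage of the sibling's average hash-round runtime has elapsed:

```
#include <cassert>
#include <cstdint>

uint64_t interleave_delay_ms(uint64_t avgRoundRuntimeMs,
                             uint64_t msSinceSiblingStart,
                             int interleavePercent /* valid range [0;100] */)
{
    if (interleavePercent <= 0)
        return 0; // 0 = disable thread interleaving
    const uint64_t target = avgRoundRuntimeMs * interleavePercent / 100;
    return target > msSinceSiblingStart ? target - msSinceSiblingStart : 0;
}

int main()
{
    // 200 ms round, interleave 40: a thread that starts 50 ms after its
    // sibling waits another 30 ms so the pair stays 40% out of phase
    assert(interleave_delay_ms(200, 50, 40) == 30);
    assert(interleave_delay_ms(200, 90, 40) == 0); // already past the mark
    return 0;
}
```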
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 152f8add4..d3dc00d01 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -65,6 +65,19 @@ configVal oConfigValues[] = {
 
 constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
+
+enum optionalConfigEnum { iAutoTune };
+
+struct optionalConfigVal {
+	optionalConfigEnum iName;
+	const char* sName;
+	Type iType;
+};
+
+optionalConfigVal oOptionalConfigValues[] = {
+	{ iAutoTune, "auto_tune", kNumberType }
+};
+
 inline bool checkType(Type have, Type want)
 {
 	if(want == have)
@@ -106,7 +119,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +128,31 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
 	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
+	interleave = GetObjectMember(oThdConf, "interleave");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
 		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
+	// interleave is optional
+	if(interleave != nullptr)
+	{
+		if(!interleave->IsInt())
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
+			return false;
+		}
+		else if(interleave->GetInt() < 0 || interleave->GetInt() > 100)
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
+			return false;
+		}
+		else
+		{
+			cfg.interleave = interleave->GetInt();
+		}
+	}
+
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
 		return false;
 
@@ -137,9 +170,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.stridedIndex = (int)stridedIndex->GetInt64();
 
-	if(cfg.stridedIndex > 2)
+	if(cfg.stridedIndex > 3)
 	{
-		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 4");
 		return false;
 	}
 
@@ -179,6 +212,20 @@ size_t jconf::GetPlatformIdx()
 	return prv->configValues[iPlatformIdx]->GetUint64();
 }
 
+size_t jconf::GetAutoTune()
+{
+	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
+	if( value != nullptr && value->IsUint64())
+	{
+		return value->GetUint64();
+	}
+	else
+	{
+		printer::inst()->print_msg(L0, "WARNING: OpenCL optional option 'auto_tune' not available");
+	}
+	return 0;
+}
+
 size_t jconf::GetThreadCount()
 {
 	return prv->configValues[aGpuThreadsConf]->Size();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index b852c5940..51a0c79ac 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
 		size_t w_size;
 		long long cpu_aff;
 		int stridedIndex;
+		int interleave = 40;
 		int memChunk;
 		int unroll;
 		bool compMode;
@@ -35,6 +36,7 @@ class jconf
 
 	size_t GetThreadCount();
 	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
 private:
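Editor's note: a plain-C++ stand-in (assumption: the real code reads a rapidjson `Value`) for the optional-field handling added to `GetThreadConfig` above. A missing `interleave` keeps the default of 40; a present but out-of-range value rejects the whole thread config:

```
#include <cstdio>

struct ThreadCfg { int interleave = 40; };

bool apply_interleave(ThreadCfg& cfg, const int* maybeValue)
{
    if (maybeValue == nullptr)
        return true; // optional: default stays in effect
    if (*maybeValue < 0 || *maybeValue > 100)
    {
        std::fprintf(stderr, "ERROR: interleave must be in range [0;100]\n");
        return false; // reject the whole thread configuration
    }
    cfg.interleave = *maybeValue;
    return true;
}

int main()
{
    ThreadCfg cfg;
    const int tooBig = 150;
    if (!apply_interleave(cfg, &tooBig))
        std::printf("config rejected, default would have been %d\n", cfg.interleave);
    return 0;
}
```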
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5e70f25a6..b0f4e6ecd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -58,6 +58,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 	iTimestamp = 0;
 	pGpuCtx = ctx;
 	this->affinity = cfg.cpu_aff;
+	autoTune = jconf::inst()->GetAutoTune();
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -100,6 +101,7 @@ bool minethd::init_gpus()
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
 		vGpuData[i].unroll = cfg.unroll;
+		vGpuData[i].interleave = cfg.interleave;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -186,6 +188,19 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
+	pGpuCtx->maxRawIntensity = pGpuCtx->rawIntensity;
+
+	if(autoTune != 0)
+	{
+		pGpuCtx->rawIntensity = pGpuCtx->computeUnits * pGpuCtx->workSize;
+		pGpuCtx->rawIntensity = std::min(pGpuCtx->maxRawIntensity, pGpuCtx->rawIntensity);
+	}
+	// parameters needed for auto tuning
+	uint32_t cntTestRounds = 0;
+	uint64_t accRuntime = 0;
+	double bestHashrate = 0.0;
+	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
+
 	while (bQuit == 0)
 	{
 		if (oWork.bStall)
@@ -220,7 +235,6 @@ void minethd::work_main()
 			version = new_version;
 		}
 
-		uint32_t h_per_round = pGpuCtx->rawIntensity;
 		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
@@ -236,12 +250,15 @@ void minethd::work_main()
 			//Allocate a new nonce every 16 rounds
 			if((round_ctr++ & 0xF) == 0)
 			{
-				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, pGpuCtx->rawIntensity * 16);
 				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
 
+			// if auto tuning is running we will not adjust the interleave interval
+			const bool adjustInterleave = autoTune == 0;
+			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
@@ -269,6 +286,58 @@ void minethd::work_main()
 			uint64_t iStamp = get_timestamp_ms();
 			iHashCount.store(iCount, std::memory_order_relaxed);
 			iTimestamp.store(iStamp, std::memory_order_relaxed);
+
+			accRuntime += updateTimings(pGpuCtx, t0);
+
+			// tune intensity
+			if(autoTune != 0)
+			{
+				if(cntTestRounds++ == autoTune)
+				{
+					double avgHashrate = static_cast<double>(cntTestRounds * pGpuCtx->rawIntensity) / (static_cast<double>(accRuntime) / 1000.0);
+					if(avgHashrate > bestHashrate)
+					{
+						bestHashrate = avgHashrate;
+						bestIntensity = pGpuCtx->rawIntensity;
+					}
+
+					// always increase in workSize steps to avoid problems with the compatibility mode
+					pGpuCtx->rawIntensity += pGpuCtx->workSize;
+					// trigger a query for new nonces because the number of nonces previously allocated depends on the rawIntensity
+					round_ctr = 0x10;
+
+					if(pGpuCtx->rawIntensity > pGpuCtx->maxRawIntensity)
+					{
+						// lock intensity to the best value
+						autoTune = 0;
+						pGpuCtx->rawIntensity = bestIntensity;
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							bestIntensity
+						);
+					}
+					else
+					{
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							pGpuCtx->rawIntensity,
+							bestIntensity
+						);
+					}
+					// update gpu with new intensity
+					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
+				}
+				// use 3 rounds to warm up with the new intensity
+				else if(cntTestRounds == autoTune + 3)
+				{
+					// reset values for the next test period
+					cntTestRounds = 0;
+					accRuntime = 0;
+				}
+			}
+
 			std::this_thread::yield();
 		}
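Editor's note: a condensed model of the auto-tune sweep added to `work_main` above (assumption: simplified standalone code with a fake timing model; the real loop gets its runtimes from `updateTimings`). It measures `autoTune` rounds per candidate intensity, compares average hashrates, and steps the intensity up in `workSize` units until the configured maximum is reached:

```
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t workSize = 8, autoTune = 10, maxRawIntensity = 1000;
    uint32_t intensity = 8 * 14; // start near computeUnits * workSize
    uint32_t bestIntensity = maxRawIntensity;
    double bestHashrate = 0.0;

    for (; intensity <= maxRawIntensity; intensity += workSize)
    {
        // stand-in for `autoTune` measured rounds; in the miner this
        // accumulated runtime comes from the interleave timing helpers
        const uint64_t accRuntimeMs = 1000 + intensity / 4; // fake timing model
        const double hashrate =
            double(autoTune) * intensity / (double(accRuntimeMs) / 1000.0);
        if (hashrate > bestHashrate)
        {
            bestHashrate = hashrate;
            bestIntensity = intensity;
        }
    }
    std::printf("lock intensity at %u\n", bestIntensity);
    return 0;
}
```

Note the interaction with interleaving in the patch: while the sweep is running, the interleave interval is deliberately not adjusted (`adjustInterleave = autoTune == 0`), so the timing samples stay comparable between candidate intensities.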
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 32e66ec87..74ab5fb60 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -39,6 +39,7 @@ class minethd : public iBackend
 
 	std::thread oWorkThd;
 	int64_t affinity;
+	uint32_t autoTune;
 
 	bool bQuit;
 	bool bNoPrefetch;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2b1741764..06cbe8740 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -182,7 +182,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(size_t i=0; i < 16; i++)
 		{
@@ -326,11 +326,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
@@ -377,7 +377,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
@@ -716,7 +716,7 @@ inline void set_float_rounding_mode()
 		((int64_t*)ptr0)[0] = u ^ q; \
 		idx0 = d ^ q; \
 	} \
-	else if(ALGO == cryptonight_haven) \
+	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
 	{ \
 		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 		int64_t u  = ((int64_t*)ptr0)[0]; \
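Editor's note: the hunks above repeat one condition in every heavy-style code path, and the new algorithm joins it each time. A sketch of that predicate factored out (assumption: a restatement for clarity; the patch itself keeps the condition inline): cryptonight_superfast takes the same explode/implode mixing and division tweak as the heavy family, even though it keeps a 2 MiB scratchpad.

```
enum demo_algo { heavy, haven, bittube2, superfast, monero_v8 };

constexpr bool uses_heavy_mix(demo_algo a)
{
    return a == heavy || a == haven || a == bittube2 || a == superfast;
}

static_assert(uses_heavy_mix(superfast), "superfast follows the heavy code paths");
static_assert(!uses_heavy_mix(monero_v8), "v8 keeps the regular paths");

int main() { return 0; }
```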
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 55879110a..20203a3c5 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -395,6 +395,13 @@ bool minethd::self_test()
 		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 	}
+	else if(algo == cryptonight_superfast)
+	{
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
+		hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx);
+		bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
+	}
 
 	if(!bResult)
 		printer::inst()->print_msg(L0,
@@ -520,6 +527,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_monero_v8:
 		algv = 10;
 		break;
+	case cryptonight_superfast:
+		algv = 11;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -579,7 +589,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash::template hash,
 		Cryptonight_hash::template hash,
 		Cryptonight_hash::template hash,
-		Cryptonight_hash::template hash
+		Cryptonight_hash::template hash,
+
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash
 	};
 
 	std::bitset<2> digit;
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 6b1afa928..e905caa9f 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,7 +16,8 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
-	cryptonight_monero_v8 = 11
+	cryptonight_monero_v8 = 11,
+	cryptonight_superfast = 12
 };
 
 // define aeon settings
@@ -34,6 +35,8 @@ constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
+constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000;
+
 template<xmrstak_algo ALGO>
 inline constexpr size_t cn_select_memory() { return 0; }
 
@@ -70,6 +73,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIG
 template<>
 inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; }
+
 inline size_t cn_select_memory(xmrstak_algo algo)
 {
 	switch(algo)
@@ -79,6 +85,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MEMORY;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -129,6 +136,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIG
 template<>
 inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; }
+
 inline size_t cn_select_mask(xmrstak_algo algo)
 {
 	switch(algo)
@@ -138,6 +148,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MASK;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -188,6 +199,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIG
 template<>
 inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; }
+
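Editor's note: a trimmed restatement of the `cn_select_*` pattern above (assumption: demo names; the classic iteration count of 0x80000 matches the existing `CRYPTONIGHT_ITER`). The specialization for the new algorithm pairs the regular 2 MiB scratchpad with the reduced 0x20000 iteration count, and everything resolves at compile time:

```
#include <cstdint>

enum xmrstak_algo_demo { demo_cryptonight = 1, demo_cryptonight_superfast = 12 };

constexpr uint32_t CN_ITER = 0x80000;
constexpr uint32_t CN_SUPERFAST_ITER = 0x20000;

template <xmrstak_algo_demo ALGO> constexpr uint32_t select_iter() { return 0; }
template <> constexpr uint32_t select_iter<demo_cryptonight>() { return CN_ITER; }
template <> constexpr uint32_t select_iter<demo_cryptonight_superfast>() { return CN_SUPERFAST_ITER; }

static_assert(select_iter<demo_cryptonight_superfast>() == CN_ITER / 4,
              "superfast runs a quarter of the classic iterations");

int main() { return 0; }
```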
 inline size_t cn_select_iter(xmrstak_algo algo)
 {
 	switch(algo)
@@ -207,6 +221,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_HEAVY_ITER;
 	case cryptonight_masari:
 		return CRYPTONIGHT_MASARI_ITER;
+	case cryptonight_superfast:
+		return CRYPTONIGHT_SUPERFAST_ITER;
 	default:
 		return 0;
 	}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index e478600e3..199025635 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index fa7e09364..87c1befa8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -9,6 +9,7 @@
 #include "xmrstak/jconf.hpp"
 
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
 
 #ifdef _WIN32
@@ -121,6 +122,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }
 
+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +273,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );
 
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +346,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
 
 			u64 cx_aes = ax0 ^ u64(
-				t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-				t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+				t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+				t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 			);
 
 			if(ALGO == cryptonight_monero_v8)
@@ -523,7 +529,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		a = (d_ctx_a + thread * 4)[sub];
 		idx0 = shuffle<4>(sPtr,sub, a, 0);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 			if(partidx != 0)
 			{
@@ -647,18 +653,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 			{
 				int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 				int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-				int64_t q = n / (d | 0x5);
+				int64_t q = fast_div_heavy(n, (d | 0x5));
 
 				if(sub&1)
 					storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
 
 				idx0 = d ^ q;
 			}
-			else if(ALGO == cryptonight_haven)
+			else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 			{
 				int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 				int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-				int64_t q = n / (d | 0x5);
+				int64_t q = fast_div_heavy(n, (d | 0x5));
 
 				if(sub&1)
 					storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
@@ -672,7 +678,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 	}
@@ -718,7 +724,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 
 	cn_aes_pseudo_round_mut( sharedMemory, text, key );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 #pragma unroll
 		for ( int j = 0; j < 4; ++j )
@@ -756,7 +762,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
 			ctx->d_ctx_key1 ));
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
@@ -818,7 +824,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
@@ -840,9 +846,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo) return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash,
 		cryptonight_core_gpu_hash,
@@ -875,7 +881,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 		cryptonight_core_gpu_hash,
 		cryptonight_core_gpu_hash,
 
-		cryptonight_core_gpu_hash
+		cryptonight_core_gpu_hash,
+
+		cryptonight_core_gpu_hash,
+		cryptonight_core_gpu_hash
 	};
 
 	std::bitset<1> digit;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 433e175dd..45afec9ac 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -114,7 +114,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -160,7 +160,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
 	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 
 		for(int i=0; i < 16; i++)
@@ -184,7 +184,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -201,7 +201,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		uint32_t key[40];
 
@@ -298,7 +298,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	if( cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 	{
 		// extent ctx_b to hold the state of idx0
@@ -349,6 +350,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
@@ -396,6 +402,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
@@ -676,7 +690,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if( cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 		perThread += 50 * 4; // state double buffer
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
new file mode 100644
index 000000000..555ccbef2
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstdint>
+
+
+__device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
+{
+
+	uint64_t a = abs(_a);
+	int b = abs(_b);
+
+	float rcp = __frcp_rn(__int2float_rn(b));
+	float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23));
+
+	uint64_t q1 = __float2ull_rz(__int2float_rn(((int*)&a)[1]) * rcp2);
+	a -= q1 * static_cast<uint64_t>(b);
+
+	uint64_t tmp = a >> 12;
+	float q2f = __int2float_rn(((int*)&tmp)[0]) * rcp;
+	q2f = __uint_as_float(__float_as_uint(q2f) + (12U << 23));
+	int64_t q2 = __float2ll_rn(q2f);
+	int a2 = ((int*)&a)[0] - ((int*)&q2)[0] * b;
+
+	int q3 = __float2int_rn(__int2float_rn(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;
+
+	const uint64_t q = q1 + q2 + q3;
+	return ((((int*)&_a)[1] ^ _b) < 0) ? -q : q;
+}
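Editor's note: a reference model of what the new `fast_div_heavy` above computes (assumption: illustration only, not from the patch). The kernel does a truncated signed 64/32-bit division on absolute values via float reciprocals, then restores the sign from the XOR of the operand signs, the same rule it applies with `((((int*)&_a)[1] ^ _b) < 0)`:

```
#include <cstdint>
#include <cstdlib>
#include <cassert>

static int64_t div_heavy_reference(int64_t a, int32_t b)
{
    const uint64_t q = (uint64_t)std::llabs(a) / (uint32_t)std::abs(b);
    const bool negative = ((a >> 63) ^ (int64_t)b) < 0; // signs differ
    return negative ? -(int64_t)q : (int64_t)q;
}

int main()
{
    assert(div_heavy_reference(100, -7)  == -14);
    assert(div_heavy_reference(-100, -7) ==  14);
    assert(div_heavy_reference(-1, 5)    ==   0); // truncation toward zero
    return 0;
}
```

The caller always passes `(d | 0x5)` as the divisor, so the divide-by-zero case never arises in the heavy loop.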
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 796b7adda..0d54f1436 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -7,8 +7,7 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
 	const float a_lo = __uint2float_rn(a & 0xFF);
 
-	float r;
-	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	float r = __frcp_rn(a_hi);
 
 	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
 	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
@@ -18,21 +17,22 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+	const uint32_t a1 = ((uint32_t*)&a)[1];
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
 
-	uint32_t q[2];
-	q[0] = ((uint32_t*)&k)[1];
+	const uint32_t q = ((uint32_t*)&k)[1];
+	int64_t tmp = a - ((uint64_t)(q) * b);
+	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
+
+	const int overshoot = ((int*)(&tmp))[1] >> 31;
+	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
+	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
-	int64_t tmp = a - (uint64_t)(q[0]) * b;
-	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+	uint64_t result;
+	((uint32_t*)&result)[0] = q + overshoot - undershoot;
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
 
-	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
-	const bool undershoot = tmp >= b;
-
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
-
-	return *((uint64_t*)(q));
+	return result;
 }
 
 __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
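Editor's note: the rewrite above replaces the boolean overshoot/undershoot checks with sign-bit masks. A host-side illustration of the branchless +/-1 correction (assumption: not part of the patch; the shift widths differ because this operates on a full 64-bit remainder): after an approximate quotient `q` of `a/b`, the remainder's sign decides whether `q` is one too high (overshoot) or one too low (undershoot), and arithmetic right shifts turn those sign bits into all-ones masks so no branch is needed:

```
#include <cstdint>
#include <cassert>

static uint64_t correct_quotient(uint64_t a, uint32_t b, uint64_t q)
{
    int64_t rem = (int64_t)(a - q * b); // for a +/-1 error, rem is in [-b, 2b)
    const int64_t overshoot  = rem >> 63;                      // -1 if rem < 0
    const int64_t undershoot = ((int64_t)(b - 1) - rem) >> 63; // -1 if rem >= b
    return q + overshoot - undershoot;
}

int main()
{
    // feed deliberately off-by-one estimates and check they are repaired
    assert(correct_quotient(50, 7, 8) == 7); // overshoot
    assert(correct_quotient(50, 7, 6) == 7); // undershoot
    assert(correct_quotient(50, 7, 7) == 7); // already exact
    return 0;
}
```

The kernel applies the same masks to the remainder (`rem + (overshoot & b) - (undershoot & b)`) and packs quotient and remainder into the low and high words of the 64-bit result.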
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index ca2fa9585..2a2dc8dbc 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -98,15 +98,17 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
-	{ "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "graft", {cryptonight_monero_v8, cryptonight_monero, 11u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
-	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
-	{ "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
-	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
+	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
 	{ "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index d20ba082f..406c535d2 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -706,6 +706,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	case cryptonight_masari:
 		algo_name = "cryptonight_masari";
 		break;
+	case cryptonight_superfast:
+		algo_name = "cryptonight_superfast";
+		break;
 	default:
 		algo_name = "unknown";
 		break;
"cryptonight_masari"; break; + case cryptonight_superfast: + algo_name = "cryptonight_superfast"; + break; default: algo_name = "unknown"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index e86e2a537..58762de56 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -24,6 +24,7 @@ POOLCONF], * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) + * freehaven * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense @@ -41,6 +42,7 @@ POOLCONF], * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 * # 4MiB scratchpad memory diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 47d676350..09bfe8d8f 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -3,7 +3,7 @@ //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. #define GIT_ARCHIVE 1 #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH) -#define GIT_COMMIT_HASH 871371622 +#define GIT_COMMIT_HASH 7d6cd31b1 #endif #ifndef GIT_COMMIT_HASH @@ -19,7 +19,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.6.0" +#define XMR_STAK_VERSION "2.7.1" #if defined(_WIN32) #define OS_TYPE "win"