From c0a67e0a23b796861fd733e2beb8be517a8f23b1 Mon Sep 17 00:00:00 2001 From: Indeed Miners <32953696+IndeedMiners@users.noreply.github.com> Date: Sun, 6 Jan 2019 16:49:08 +0100 Subject: [PATCH] 2.7.1 --- doc/compile_Windows.md | 9 +- doc/img/interleave.png | Bin 0 -> 12549 bytes doc/tuning.md | 55 ++++- xmrstak/backend/amd/amd_gpu/gpu.cpp | 218 +++++++++++++----- xmrstak/backend/amd/amd_gpu/gpu.hpp | 22 +- .../backend/amd/amd_gpu/opencl/cryptonight.cl | 157 +++++++------ .../amd/amd_gpu/opencl/fast_div_heavy.cl | 12 +- .../amd/amd_gpu/opencl/fast_int_math_v2.cl | 93 ++++---- .../backend/amd/amd_gpu/opencl/wolf-aes.cl | 72 +++--- xmrstak/backend/amd/autoAdjust.hpp | 52 ++++- xmrstak/backend/amd/config.tpl | 22 +- xmrstak/backend/amd/jconf.cpp | 53 ++++- xmrstak/backend/amd/jconf.hpp | 2 + xmrstak/backend/amd/minethd.cpp | 73 +++++- xmrstak/backend/amd/minethd.hpp | 1 + .../backend/cpu/crypto/cryptonight_aesni.h | 10 +- xmrstak/backend/cpu/minethd.cpp | 17 +- xmrstak/backend/cryptonight.hpp | 18 +- xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp | 6 + xmrstak/backend/nvidia/nvcc_code/cuda_core.cu | 39 ++-- .../backend/nvidia/nvcc_code/cuda_extra.cu | 27 ++- .../nvidia/nvcc_code/cuda_fast_div_heavy.hpp | 29 +++ .../nvcc_code/cuda_fast_int_math_v2.hpp | 28 +-- xmrstak/jconf.cpp | 14 +- xmrstak/net/jpsock.cpp | 3 + xmrstak/pools.tpl | 2 + xmrstak/version.cpp | 4 +- 27 files changed, 737 insertions(+), 301 deletions(-) create mode 100644 doc/img/interleave.png create mode 100644 xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp diff --git a/doc/compile_Windows.md b/doc/compile_Windows.md index 1b5787879..8fe4dcf53 100644 --- a/doc/compile_Windows.md +++ b/doc/compile_Windows.md @@ -30,15 +30,16 @@ - CUDA/Runtime - Driver components -### AMD DRIVER/APP SDK 3.0 (only needed for AMD GPUs) +### AMD DRIVER/OCL-SDK (only needed for AMD GPUs) - Download & install the AMD driver: https://www.amd.com/en/support **ATTENTION** Many Windows drivers 18.5+ create invalid shares. If you have an issue with `invalid shares` please downgrade your driver. -- Download and install the latest version from http://amd-dev.wpengine.netdna-cdn.com/app-sdk/installers/APPSDKInstaller/3.0.130.135-GA/full/AMD-APP-SDKInstaller-v3.0.130.135-GA-windows-F-x64.exe - (do not wonder why it is a link to a netdna-cdn.com but AMD has removed the SDK downloads, see https://community.amd.com/thread/222855) +- Download and install the latest version of the OCL-SDK from https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases + +Do not follow old guides saying that you need the AMD APP SDK. AMD has removed the APP SDK and now ships the OCL-SDK_light. ### Dependencies OpenSSL/Hwloc and Microhttpd - For CUDA 8*: @@ -115,4 +116,4 @@ If you have an issue with `invalid shares` please downgrade your driver. copy C:\xmr-stak-dep\openssl\bin\* . ``` -- Miner is by default compiled for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD APP SDK is installed) and CPUs. +- The miner is compiled by default for NVIDIA GPUs (if CUDA is installed), AMD GPUs (if the AMD OCL-SDK_light is installed) and CPUs. 
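Before tuning anything, it can be worth verifying that an OpenCL platform is visible at all after installing the driver and the OCL-SDK. Below is a minimal, hypothetical host-side probe (a standalone snippet, not part of this patch; the `clinfo` tool mentioned in doc/tuning.md reports the same information in more detail):

```cpp
#include <CL/cl.h>
#include <cstdio>

// Hypothetical standalone probe: lists the OpenCL platforms that the
// AMD backend of the miner would be able to see.
int main()
{
    cl_uint numPlatforms = 0;
    if(clGetPlatformIDs(0, nullptr, &numPlatforms) != CL_SUCCESS || numPlatforms == 0)
    {
        std::printf("No OpenCL platform found - check the driver/OCL-SDK installation.\n");
        return 1;
    }

    if(numPlatforms > 16)
        numPlatforms = 16;
    cl_platform_id platforms[16];
    clGetPlatformIDs(numPlatforms, platforms, nullptr);

    for(cl_uint i = 0; i < numPlatforms; ++i)
    {
        char name[256] = {0};
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(name), name, nullptr);
        // the printed index corresponds to the "platform_index" config option
        std::printf("platform %u: %s\n", i, name);
    }
    return 0;
}
```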
diff --git a/doc/img/interleave.png b/doc/img/interleave.png new file mode 100644 index 0000000000000000000000000000000000000000..a4b59b282e4c24606d2b9a026fc8e6233a65e01b GIT binary patch literal 12549 [binary image data for the new figure doc/img/interleave.png omitted] literal 0 HcmV?d00001 diff --git a/doc/tuning.md b/doc/tuning.md index 2673d68d9..6d07d4ddc 100644 --- a/doc/tuning.md +++ 
b/doc/tuning.md @@ -10,6 +10,7 @@ * [Choose `intensity` and `worksize`](#choose-intensity-and-worksize) * [Add more GPUs](#add-more-gpus) * [Two Threads per GPU](two-threads-per-gpu) + * [Interleave Tuning](#interleave-tuning) * [disable comp_mode](#disable-comp_mode) * [change the scratchpad memory pattern](change-the-scratchpad-memory-pattern) * [Increase Memory Pool](#increase-memory-pool) @@ -83,13 +84,13 @@ If you are unsure of either GPU or platform index value, you can use `clinfo` to ``` "gpu_threads_conf" : [ - { - "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, - { - "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 1, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, ], @@ -107,19 +108,49 @@ Therefore adjust your intensity by hand. ``` "gpu_threads_conf" : [ - { - "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, - { - "index" : 0, "intensity" : 768, "worksize" : 8, "affine_to_cpu" : false, - "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true + { "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false, + "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true, + "interleave" : 40 }, ], "platform_index" : 0, ``` +### Interleave Tuning + +Interleave controls when a worker thread starts to calculate a bunch of hashes +if two worker threads are used to utilize one GPU. +This option has no effect if only one worker thread is used per GPU. + +![Interleave](img/interleave.png) + +Interleave defines how long a thread needs to wait before it starts the next hash calculation, relative to the last started worker thread. +Choosing an interleave value larger than 50% makes no sense because then the GPU will not be utilized well enough. +In most cases the default of 40 is a good value, but on some systems, e.g. the Linux ROCm 1.9.1 driver with RX5XX GPUs, you need to adjust the value. +If you get many interleave messages in a row (over one minute) you should adjust the value. + +``` +OpenCL Interleave 0|1: 642/2400.50 ms - 30.1 +OpenCL Interleave 0|0: 355/2265.05 ms - 30.2 +OpenCL Interleave 0|1: 221/2215.65 ms - 30.2 +``` + +description: +``` +<gpu id>|<thread id on the gpu>: <last delay>/<average kernel run time> ms - <current interleave value in percent> +``` +`last delay` should go slowly to 0. +If it goes down and then jumps to a very large value multiple times within a minute you should reduce the intensity by 5. +The `interleave` value will automatically go up and down within a range of +-5% to adjust to kernel run-time fluctuations. +Automatic adjustment is disabled as long as `auto-tuning` is active and will be started after it has finished. +If `last delay` goes down to 10 ms, and the messages stop and are repeated only from time to time with delays up to 15 ms, you already have a good value. 
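For illustration, here is a minimal standalone sketch of the delay rule described above (a hypothetical helper, not part of this patch; the actual logic lives in `interleaveAdjustDelay()` in `xmrstak/backend/amd/amd_gpu/gpu.cpp` further down in this diff). A worker sleeps until the configured percentage of the average kernel run time has passed since the other worker on the same GPU started:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

// Hypothetical standalone version of the interleave delay rule:
// sleep until `interleavePercent` percent of the average kernel run time
// has passed since the other worker thread on the same GPU started.
int64_t interleaveDelayMs(double avgKernelRuntimeMs, double interleavePercent,
                          int64_t msSinceLastThreadStart)
{
    const double optimalTimeOffset = avgKernelRuntimeMs * interleavePercent / 100.0;
    const double delay = optimalTimeOffset - static_cast<double>(msSinceLastThreadStart);
    return std::max<int64_t>(0, static_cast<int64_t>(delay));
}

int main()
{
    // matches the second log line above: avg 2265.05 ms at 30.2% interleave;
    // if the other thread started 329 ms ago, this thread sleeps ~355 ms
    std::cout << interleaveDelayMs(2265.05, 30.2, 329) << " ms delay\n";
}
```

The printed `last delay` is roughly this value; it shrinks toward 0 as the two threads settle into the configured offset.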
+ ### disable comp_mode `comp_mode` means compatibility mode and removes some checks in compute kernel those takes care that the miner can be used on a wide range of AMD/OpenCL GPU devices. diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 6e1c70b05..408cad97a 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -18,6 +18,7 @@ #include "xmrstak/picosha2/picosha2.hpp" #include "xmrstak/params.hpp" #include "xmrstak/version.hpp" +#include "xmrstak/net/msgstruct.hpp" #include #include @@ -34,6 +35,7 @@ #include #include #include +#include #if defined _MSC_VER #include @@ -43,7 +45,6 @@ #endif - #ifdef _WIN32 #include #include @@ -302,6 +303,12 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ return ERR_OCL_API; } + if((ret = clGetDeviceInfo(ctx->DeviceID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx->computeUnits), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_COMPUTE_UNITS for device %u.", err_to_str(ret), (uint32_t)ctx->deviceIdx); + return ERR_OCL_API; + } + ctx->InputBuffer = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, 88, NULL, &ret); if(ret != CL_SUCCESS) { @@ -410,14 +417,17 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ strided_index = 0; } + // if intensity is a multiple of worksize than comp mode is not needed + int needCompMode = ctx->compMode && ctx->rawIntensity % ctx->workSize != 0 ? 1 : 0; + std::string options; options += " -DITERATIONS=" + std::to_string(hashIterations); - options += " -DMASK=" + std::to_string(threadMemMask); - options += " -DWORKSIZE=" + std::to_string(ctx->workSize); + options += " -DMASK=" + std::to_string(threadMemMask) + "U"; + options += " -DWORKSIZE=" + std::to_string(ctx->workSize) + "U"; options += " -DSTRIDED_INDEX=" + std::to_string(strided_index); - options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp); - options += " -DCOMP_MODE=" + std::to_string(ctx->compMode ? 
1u : 0u); - options += " -DMEMORY=" + std::to_string(hashMemSize); + options += " -DMEM_CHUNK_EXPONENT=" + std::to_string(mem_chunk_exp) + "U"; + options += " -DCOMP_MODE=" + std::to_string(needCompMode); + options += " -DMEMORY=" + std::to_string(hashMemSize) + "LU"; options += " -DALGO=" + std::to_string(miner_algo[ii]); options += " -DCN_UNROLL=" + std::to_string(ctx->unroll); /* AMD driver output is something like: `1445.5 (VM)` @@ -699,9 +709,9 @@ std::vector getAMDDevices(int index) { GpuContext ctx; std::vector devNameVec(1024); - size_t maxMem; - if( devVendor.find("NVIDIA Corporation") != std::string::npos) - ctx.isNVIDIA = true; + + ctx.isNVIDIA = isNVIDIADevice; + ctx.isAMD = isAMDDevice; if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(int), &(ctx.computeUnits), NULL)) != CL_SUCCESS) { @@ -709,7 +719,7 @@ std::vector getAMDDevices(int index) continue; } - if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(maxMem), NULL)) != CL_SUCCESS) + if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &(ctx.maxMemPerAlloc), NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DEVICE_MAX_MEM_ALLOC_SIZE for device %u.", err_to_str(clStatus), k); continue; @@ -722,8 +732,8 @@ std::vector getAMDDevices(int index) } // the allocation for NVIDIA OpenCL is not limited to 1/4 of the GPU memory per allocation - if(ctx.isNVIDIA) - maxMem = ctx.freeMem; + if(isNVIDIADevice) + ctx.maxMemPerAlloc = ctx.freeMem; if((clStatus = clGetDeviceInfo(device_list[k], CL_DEVICE_NAME, devNameVec.size(), devNameVec.data(), NULL)) != CL_SUCCESS) { @@ -731,11 +741,20 @@ std::vector getAMDDevices(int index) continue; } + std::vector openCLDriverVer(1024); + if((clStatus = clGetDeviceInfo(device_list[k], CL_DRIVER_VERSION, openCLDriverVer.size(), openCLDriverVer.data(), NULL)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"WARNING: %s when calling clGetDeviceInfo to get CL_DRIVER_VERSION for device %u.", err_to_str(clStatus), k); + continue; + } + + bool isHSAOpenCL = std::string(openCLDriverVer.data()).find("HSA") != std::string::npos; + // if environment variable GPU_SINGLE_ALLOC_PERCENT is not set we can not allocate the full memory ctx.deviceIdx = k; - ctx.freeMem = std::min(ctx.freeMem, maxMem); ctx.name = std::string(devNameVec.data()); ctx.DeviceID = device_list[k]; + ctx.interleave = 40; printer::inst()->print_msg(L0,"Found OpenCL GPU %s.",ctx.name.c_str()); ctxVec.push_back(ctx); } @@ -937,10 +956,29 @@ size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx) // create a directory for the OpenCL compile cache create_directory(get_home() + "/.openclcache"); + std::vector> interleaveData(num_gpus, nullptr); + for(int i = 0; i < num_gpus; ++i) { + const size_t devIdx = ctx[i].deviceIdx; + if(interleaveData.size() <= devIdx) + { + interleaveData.resize(devIdx + 1u, nullptr); + } + if(!interleaveData[devIdx]) + { + interleaveData[devIdx].reset(new InterleaveData{}); + interleaveData[devIdx]->lastRunTimeStamp = get_timestamp_ms(); + + } + ctx[i].idWorkerOnDevice=interleaveData[devIdx]->numThreadsOnGPU; + ++interleaveData[devIdx]->numThreadsOnGPU; + ctx[i].interleaveData = interleaveData[devIdx]; + ctx[i].interleaveData->adjustThreshold = static_cast(ctx[i].interleave)/100.0; + ctx[i].interleaveData->startAdjustThreshold = ctx[i].interleaveData->adjustThreshold; + const std::string backendName = 
xmrstak::params::inst().openCLVendor; - if(ctx[i].stridedIndex == 2 && (ctx[i].rawIntensity % ctx[i].workSize) != 0) + if( (ctx[i].stridedIndex == 2 || ctx[i].stridedIndex == 3) && (ctx[i].rawIntensity % ctx[i].workSize) != 0) { size_t reduced_intensity = (ctx[i].rawIntensity / ctx[i].workSize) * ctx[i].workSize; ctx[i].rawIntensity = reduced_intensity; @@ -1116,11 +1154,108 @@ size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t tar printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 3); return ERR_OCL_API; } + + if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) + { + printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); + return(ERR_OCL_API); + } } return ERR_SUCCESS; } +uint64_t updateTimings(GpuContext* ctx, const uint64_t t) +{ + // averagingBias = 1.0 - only the last delta time is taken into account + // averagingBias = 0.5 - the last delta time has the same weight as all the previous ones combined + // averagingBias = 0.1 - the last delta time has 10% weight of all the previous ones combined + const double averagingBias = 0.1; + + int64_t t2 = get_timestamp_ms(); + uint64_t runtime = (t2 - t); + { + + std::lock_guard<std::mutex> g(ctx->interleaveData->mutex); + // 20000 means that something went wrong and we reset the average + if(ctx->interleaveData->avgKernelRuntime == 0.0 || ctx->interleaveData->avgKernelRuntime > 20000.0) + ctx->interleaveData->avgKernelRuntime = runtime; + else + ctx->interleaveData->avgKernelRuntime = ctx->interleaveData->avgKernelRuntime * (1.0 - averagingBias) + (runtime) * averagingBias; + } + return runtime; +} + +uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) +{ + uint64_t t0 = get_timestamp_ms(); + + if(ctx->interleaveData->numThreadsOnGPU > 1 && ctx->interleaveData->adjustThreshold > 0.0) + { + t0 = get_timestamp_ms(); + std::unique_lock<std::mutex> g(ctx->interleaveData->mutex); + + int64_t delay = 0; + double dt = 0.0; + + if(t0 > ctx->interleaveData->lastRunTimeStamp) + dt = static_cast<double>(t0 - ctx->interleaveData->lastRunTimeStamp); + + const double avgRuntime = ctx->interleaveData->avgKernelRuntime; + const double optimalTimeOffset = avgRuntime * ctx->interleaveData->adjustThreshold; + + // threshold where the auto adjustment is disabled + constexpr uint32_t maxDelay = 10; + constexpr double maxAutoAdjust = 0.05; + + if((dt > 0) && (dt < optimalTimeOffset)) + { + delay = static_cast<int64_t>((optimalTimeOffset - dt)); + + if(enableAutoAdjustment) + { + if(ctx->lastDelay == delay && delay > maxDelay) + ctx->interleaveData->adjustThreshold -= 0.001; + // if the delay doubled then increase the adjustThreshold + else if(delay > 1 && ctx->lastDelay * 2 < delay) + ctx->interleaveData->adjustThreshold += 0.001; + } + ctx->lastDelay = delay; + + // this is std::clamp which is available in c++17 + ctx->interleaveData->adjustThreshold = std::max(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold - maxAutoAdjust); + ctx->interleaveData->adjustThreshold = std::min(ctx->interleaveData->adjustThreshold, ctx->interleaveData->startAdjustThreshold + maxAutoAdjust); + + // avoid that the auto adjustment disables interleaving + ctx->interleaveData->adjustThreshold = std::max( + ctx->interleaveData->adjustThreshold, + 0.001 + ); + } + delay = std::max(int64_t(0), delay); + + ctx->interleaveData->lastRunTimeStamp = t0 + 
delay; + + g.unlock(); + if(delay > 0) + { + // do not notify the user anymore if we reach a good delay + if(delay > maxDelay) + printer::inst()->print_msg(L1,"OpenCL Interleave %u|%u: %u/%.2lf ms - %.1lf", + ctx->deviceIdx, + ctx->idWorkerOnDevice, + static_cast(delay), + avgRuntime, + ctx->interleaveData->adjustThreshold * 100. + ); + + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + } + } + + return t0; +} + size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) { // switch to the kernel storage @@ -1154,12 +1289,10 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) if((ret = clEnqueueWriteBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_FALSE, sizeof(cl_uint) * 0xFF, sizeof(cl_uint), &zero, 0, NULL, NULL)) != CL_SUCCESS) { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueWriteBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - clFinish(ctx->CommandQueues); - size_t Nonce[2] = {ctx->Nonce, 1}, gthreads[2] = { g_thd, 8 }, lthreads[2] = { 8, 8 }; if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][0], 2, Nonce, gthreads, lthreads, 0, NULL, NULL)) != CL_SUCCESS) { @@ -1181,64 +1314,23 @@ size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo) return ERR_OCL_API; } - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[2], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[3], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 1, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[4], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 2, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->ExtraBuffers[5], CL_FALSE, sizeof(cl_uint) * g_intensity, sizeof(cl_uint), BranchNonces + 3, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); - return ERR_OCL_API; - } - - clFinish(ctx->CommandQueues); - for(int i = 0; i < 4; ++i) { - if(BranchNonces[i]) + size_t tmpNonce = ctx->Nonce; + if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, &g_thd, &w_size, 0, NULL, NULL)) != CL_SUCCESS) { - // Threads - cl_uint numThreads = BranchNonces[i]; - if((clSetKernelArg(ctx->Kernels[kernel_storage][i + 3], 4, sizeof(cl_uint), &numThreads)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clSetKernelArg for kernel %d, argument %d.", err_to_str(ret), i + 3, 4); - return(ERR_OCL_API); - } - - // round up to next multiple of w_size - BranchNonces[i] = ((BranchNonces[i] + w_size - 1u) / w_size) * w_size; - // number of global threads must be a multiple of the work group size (w_size) - 
assert(BranchNonces[i]%w_size == 0); - size_t tmpNonce = ctx->Nonce; - if((ret = clEnqueueNDRangeKernel(ctx->CommandQueues, ctx->Kernels[kernel_storage][i + 3], 1, &tmpNonce, BranchNonces + i, &w_size, 0, NULL, NULL)) != CL_SUCCESS) - { - printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); - return ERR_OCL_API; - } + printer::inst()->print_msg(L1,"Error %s when calling clEnqueueNDRangeKernel for kernel %d.", err_to_str(ret), i + 3); + return ERR_OCL_API; } } + // this call is blocking therefore the access to the results without cl_finish is fine if((ret = clEnqueueReadBuffer(ctx->CommandQueues, ctx->OutputBuffer, CL_TRUE, 0, sizeof(cl_uint) * 0x100, HashOutput, 0, NULL, NULL)) != CL_SUCCESS) { printer::inst()->print_msg(L1,"Error %s when calling clEnqueueReadBuffer to fetch results.", err_to_str(ret)); return ERR_OCL_API; } - clFinish(ctx->CommandQueues); auto & numHashValues = HashOutput[0xFF]; // avoid out of memory read, we have only storage for 0xFF results if(numHashValues > 0xFF) diff --git a/xmrstak/backend/amd/amd_gpu/gpu.hpp b/xmrstak/backend/amd/amd_gpu/gpu.hpp index 63c5029d7..80fcbefde 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.hpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.hpp @@ -12,23 +12,36 @@ #include #include #include +#include +#include #define ERR_SUCCESS (0) #define ERR_OCL_API (2) #define ERR_STUPID_PARAMS (1) +struct InterleaveData +{ + std::mutex mutex; + double adjustThreshold = 0.4; + double startAdjustThreshold = 0.4; + double avgKernelRuntime = 0.0; + uint64_t lastRunTimeStamp = 0; + uint32_t numThreadsOnGPU = 0; +}; struct GpuContext { /*Input vars*/ size_t deviceIdx; size_t rawIntensity; + size_t maxRawIntensity; size_t workSize; int stridedIndex; int memChunk; int unroll = 0; bool isNVIDIA = false; + bool isAMD = false; int compMode; /*Output vars*/ @@ -40,8 +53,13 @@ struct GpuContext cl_program Program[2]; cl_kernel Kernels[2][8]; size_t freeMem; + size_t maxMemPerAlloc; int computeUnits; std::string name; + std::shared_ptr interleaveData; + uint32_t idWorkerOnDevice = 0u; + int interleave = 40; + uint64_t lastDelay = 0; uint32_t Nonce; @@ -54,5 +72,5 @@ std::vector getAMDDevices(int index); size_t InitOpenCL(GpuContext* ctx, size_t num_gpus, size_t platform_idx); size_t XMRSetJob(GpuContext* ctx, uint8_t* input, size_t input_len, uint64_t target, xmrstak_algo miner_algo); size_t XMRRunJob(GpuContext* ctx, cl_uint* HashOutput, xmrstak_algo miner_algo); - - +uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment = true); +uint64_t updateTimings(GpuContext* ctx, const uint64_t t); diff --git a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl index 81c0d5ff9..6a3def72c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/cryptonight.cl @@ -365,16 +365,16 @@ R"===( #if(STRIDED_INDEX==0) # define IDX(x) (x) #elif(STRIDED_INDEX==1) -# define IDX(x) ((x) * (Threads)) +# define IDX(x) (mul24(((uint)(x)), Threads)) #elif(STRIDED_INDEX==2) # define IDX(x) (((x) % MEM_CHUNK) + ((x) / MEM_CHUNK) * WORKSIZE * MEM_CHUNK) +#elif(STRIDED_INDEX==3) +# define IDX(x) ((x) * WORKSIZE) #endif inline uint getIdx() { -#if(STRIDED_INDEX==0 || STRIDED_INDEX==1 || STRIDED_INDEX==2) return get_global_id(0) - get_global_offset(0); -#endif } #define mix_and_propagate(xin) (xin)[(get_local_id(1)) % 8][get_local_id(0)] ^ (xin)[(get_local_id(1) + 1) % 8][get_local_id(0)] @@ -401,7 +401,7 @@ __kernel 
void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, AES2[i] = rotate(tmp, 16U); AES3[i] = rotate(tmp, 24U); } - + __local ulong State_buf[8 * 25]; barrier(CLK_LOCAL_MEM_FENCE); @@ -416,16 +416,23 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif if (get_local_id(1) == 0) { __local ulong* State = State_buf + get_local_id(0) * 25; - +// NVIDIA +#ifdef __NV_CL_C_VERSION + for(uint i = 0; i < 8; ++i) + State[i] = input[i]; +#else ((__local ulong8 *)State)[0] = vload8(0, input); +#endif State[8] = input[8]; State[9] = input[9]; State[10] = input[10]; @@ -474,12 +481,11 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, } mem_fence(CLK_LOCAL_MEM_FENCE); - -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) + +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin[8][8]; { - /* Also left over threads perform this loop. * The left over thread results will be ignored @@ -530,7 +536,7 @@ __kernel void JOIN(cn0,ALGO)(__global ulong *input, __global uint4 *Scratchpad, )===" R"===( - + // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) # define SCRATCHPAD_CHUNK(N) (*(__local uint4*)((__local uchar*)(scratchpad_line) + (idxS ^ (N << 4)))) @@ -562,11 +568,14 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states ulong b[2]; uint4 b_x[1]; #endif - __local uint AES0[256], AES1[256], AES2[256], AES3[256]; + __local uint AES0[256], AES1[256]; // cryptonight_monero_v8 #if(ALGO==11) +# if defined(__clang__) && !defined(__NV_CL_C_VERSION) __local uint RCP[256]; +# endif + uint2 division_result; uint sqrt_result; #endif @@ -577,10 +586,8 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint tmp = AES0_C[i]; AES0[i] = tmp; AES1[i] = rotate(tmp, 8U); - AES2[i] = rotate(tmp, 16U); - AES3[i] = rotate(tmp, 24U); // cryptonight_monero_v8 -#if(ALGO==11) +#if(ALGO==11 && (defined(__clang__) && !defined(__NV_CL_C_VERSION))) RCP[i] = RCP_C[i]; #endif } @@ -600,9 +607,11 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += get_group_id(0) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * get_local_id(0); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif a[0] = states[0] ^ states[4]; @@ -630,7 +639,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states tweak1_2 ^= as_uint2(states[24]); #endif } - + mem_fence(CLK_LOCAL_MEM_FENCE); #if(COMP_MODE==1) @@ -638,7 +647,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states if(gIdx < Threads) #endif { - ulong idx0 = a[0] & MASK; + uint idx0 = as_uint2(a[0]).s0 & MASK; #pragma unroll CN_UNROLL for(int i = 0; i < ITERATIONS; ++i) @@ -646,26 +655,26 @@ __kernel void JOIN(cn1,ALGO) 
(__global uint4 *Scratchpad, __global ulong *states ulong c[2]; // cryptonight_monero_v8 && NVIDIA #if(ALGO==11 && defined(__NV_CL_C_VERSION)) - ulong idxS = idx0 & 0x30; + uint idxS = idx0 & 0x30U; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; #endif ((uint4 *)c)[0] = SCRATCHPAD_CHUNK(0); // cryptonight_bittube2 #if(ALGO == 10) - ((uint4 *)c)[0] = AES_Round_bittube2(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2_bittube2(AES0, AES1, ~((uint4 *)c)[0], ((uint4 *)a)[0]); #else - ((uint4 *)c)[0] = AES_Round(AES0, AES1, AES2, AES3, ((uint4 *)c)[0], ((uint4 *)a)[0]); + ((uint4 *)c)[0] = AES_Round2(AES0, AES1, ((uint4 *)c)[0], ((uint4 *)a)[0]); #endif // cryptonight_monero_v8 #if(ALGO==11) { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); - ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); - ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); - SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); - SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)); + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); } #endif @@ -682,23 +691,23 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # endif b_x[0].s2 ^= ((table >> index) & 0x30U) << 24; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; // cryptonight_monero_v8 #elif(ALGO==11) SCRATCHPAD_CHUNK(0) = b_x[0] ^ ((uint4 *)c)[0]; # ifdef __NV_CL_C_VERSION // flush shuffled data SCRATCHPAD_CHUNK_GLOBAL = *scratchpad_line; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; idxS = idx0 & 0x30; *scratchpad_line = SCRATCHPAD_CHUNK_GLOBAL; # else - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; # endif #else b_x[0] ^= ((uint4 *)c)[0]; SCRATCHPAD_CHUNK(0) = b_x[0]; - idx0 = c[0] & MASK; + idx0 = as_uint2(c[0]).s0 & MASK; #endif uint4 tmp; tmp = SCRATCHPAD_CHUNK(0); @@ -713,28 +722,32 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states const uint d = (((uint *)c)[0] + (sqrt_result << 1)) | 0x80000001UL; // Quotient may be as large as (2^64 - 1)/(2^31 + 1) = 8589934588 = 2^33 - 4 // We drop the highest bit to fit both quotient and remainder in 32 bits + +# if defined(__clang__) && !defined(__NV_CL_C_VERSION) division_result = fast_div_v2(RCP, c[1], d); +# else + division_result = fast_div_v2(c[1], d); +# endif + // Use division_result as an input for the square root to prevent parallel implementation in hardware sqrt_result = fast_sqrt_v2(c[0] + as_ulong(division_result)); -#endif + ulong2 result_mul; result_mul.s0 = mul_hi(c[0], as_ulong2(tmp).s0); result_mul.s1 = c[0] * as_ulong2(tmp).s0; -// cryptonight_monero_v8 -#if(ALGO==11) - { - ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; - ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); - result_mul ^= chunk2; - ulong2 chunk3 = as_ulong2(SCRATCHPAD_CHUNK(3)); - SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); - SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); - SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); - } -#endif - a[1] += result_mul.s1; + ulong2 chunk1 = as_ulong2(SCRATCHPAD_CHUNK(1)) ^ result_mul; + ulong2 chunk2 = as_ulong2(SCRATCHPAD_CHUNK(2)); + result_mul ^= chunk2; + ulong2 chunk3 = 
as_ulong2(SCRATCHPAD_CHUNK(3)); + SCRATCHPAD_CHUNK(1) = as_uint4(chunk3 + ((ulong2 *)(b_x + 1))[0]); + SCRATCHPAD_CHUNK(2) = as_uint4(chunk1 + ((ulong2 *)b_x)[0]); + SCRATCHPAD_CHUNK(3) = as_uint4(chunk2 + ((ulong2 *)a)[0]); a[0] += result_mul.s0; - + a[1] += result_mul.s1; +#else + a[1] += c[0] * as_ulong2(tmp).s0; + a[0] += mul_hi(c[0], as_ulong2(tmp).s0); +#endif // cryptonight_monero || cryptonight_aeon || cryptonight_ipbc || cryptonight_stellite || cryptonight_masari || cryptonight_bittube2 #if(ALGO == 3 || ALGO == 5 || ALGO == 6 || ALGO == 7 || ALGO == 8 || ALGO == 10) @@ -742,7 +755,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states # if(ALGO == 6 || ALGO == 10) uint2 ipbc_tmp = tweak1_2 ^ ((uint2 *)&(a[0]))[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; - SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; + SCRATCHPAD_CHUNK(0) = ((uint4 *)a)[0]; ((uint2 *)&(a[1]))[0] ^= ipbc_tmp; # else ((uint2 *)&(a[1]))[0] ^= tweak1_2; @@ -755,7 +768,7 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif ((uint4 *)a)[0] ^= tmp; - + // cryptonight_monero_v8 #if (ALGO == 11) # if defined(__NV_CL_C_VERSION) @@ -765,22 +778,22 @@ __kernel void JOIN(cn1,ALGO) (__global uint4 *Scratchpad, __global ulong *states b_x[1] = b_x[0]; #endif b_x[0] = ((uint4 *)c)[0]; - idx0 = a[0] & MASK; + idx0 = as_uint2(a[0]).s0 & MASK; // cryptonight_heavy || cryptonight_bittube2 #if (ALGO == 4 || ALGO == 10) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; - long q = fast_div_heavy(n, d | 0x5); + long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = (d ^ q) & MASK; -// cryptonight_haven -#elif (ALGO == 9) + idx0 = (d ^ as_int2(q).s0) & MASK; +// cryptonight_haven || cryptonight_superfast +#elif (ALGO == 9 || ALGO == 12) long n = *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))); int d = ((__global int*)(Scratchpad + (IDX((idx0) >> 4))))[2]; long q = fast_div_heavy(n, d | 0x5); *((__global long*)(Scratchpad + (IDX((idx0) >> 4)))) = n ^ q; - idx0 = ((~d) ^ q) & MASK; + idx0 = ((~d) ^ as_int2(q).s0) & MASK; #endif } @@ -810,12 +823,12 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4 xin1[8][8]; __local uint4 xin2[8][8]; #endif - + #if(COMP_MODE==1) // do not use early return here if(gIdx < Threads) @@ -825,9 +838,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #if(STRIDED_INDEX==0) Scratchpad += gIdx * (MEMORY >> 4); #elif(STRIDED_INDEX==1) - Scratchpad += gIdx; + Scratchpad += gIdx; #elif(STRIDED_INDEX==2) Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + MEM_CHUNK * (gIdx % WORKSIZE); +#elif(STRIDED_INDEX==3) + Scratchpad += (gIdx / WORKSIZE) * (MEMORY >> 4) * WORKSIZE + (gIdx % WORKSIZE); #endif #if defined(__Tahiti__) || defined(__Pitcairn__) @@ -847,8 +862,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states barrier(CLK_LOCAL_MEM_FENCE); -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || 
cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) __local uint4* xin1_store = &xin1[get_local_id(1)][get_local_id(0)]; __local uint4* xin1_load = &xin1[(get_local_id(1) + 1) % 8][get_local_id(0)]; __local uint4* xin2_store = &xin2[get_local_id(1)][get_local_id(0)]; @@ -861,11 +876,11 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states if (gIdx < Threads) #endif { -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) #pragma unroll 2 for(int i = 0, i1 = get_local_id(1); i < (MEMORY >> 7); ++i, i1 = (i1 + 16) % (MEMORY >> 4)) { - text ^= Scratchpad[IDX(i1)]; + text ^= Scratchpad[IDX((uint)i1)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= *xin2_load; @@ -875,7 +890,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states *xin1_store = text; - text ^= Scratchpad[IDX(i1 + 8)]; + text ^= Scratchpad[IDX((uint)i1 + 8u)]; barrier(CLK_LOCAL_MEM_FENCE); text ^= *xin1_load; @@ -892,7 +907,7 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #else #pragma unroll 2 for (int i = 0; i < (MEMORY >> 7); ++i) { - text ^= Scratchpad[IDX((i << 3) + get_local_id(1))]; + text ^= Scratchpad[IDX((uint)((i << 3) + get_local_id(1)))]; #pragma unroll 10 for(int j = 0; j < 10; ++j) @@ -901,8 +916,8 @@ __kernel void JOIN(cn2,ALGO) (__global uint4 *Scratchpad, __global ulong *states #endif } -// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 -#if (ALGO == 4 || ALGO == 9 || ALGO == 10) +// cryptonight_heavy || cryptonight_haven || cryptonight_bittube2 || cryptonight_superfast +#if (ALGO == 4 || ALGO == 9 || ALGO == 10 || ALGO == 12) /* Also left over threads performe this loop. * The left over thread results will be ignored */ @@ -971,7 +986,7 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u const ulong idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1019,8 +1034,8 @@ __kernel void Skein(__global ulong *states, __global uint *BranchBuf, __global u ulong outIdx = atomic_inc(output + 0xFF); if(outIdx < 0xFF) output[outIdx] = BranchBuf[idx] + (uint)get_global_offset(0); - } } + } mem_fence(CLK_GLOBAL_MEM_FENCE); } @@ -1052,7 +1067,7 @@ __kernel void JH(__global ulong *states, __global uint *BranchBuf, __global uint const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1106,7 +1121,7 @@ __kernel void Blake(__global ulong *states, __global uint *BranchBuf, __global u const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1182,7 +1197,7 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global const uint idx = get_global_id(0) - get_global_offset(0); // do not use early return here - if(idx < Threads) + if(idx < BranchBuf[Threads]) { states += 25 * BranchBuf[idx]; @@ -1238,4 +1253,4 @@ __kernel void Groestl(__global ulong *states, __global uint *BranchBuf, __global } } -)===" \ No newline at end of file +)===" diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl index 21268fd78..161f2f55d 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl +++ 
b/xmrstak/backend/amd/amd_gpu/opencl/fast_div_heavy.cl @@ -6,23 +6,19 @@ inline long fast_div_heavy(long _a, int _b) { long a = abs(_a); int b = abs(_b); - float rcp = native_recip(convert_float_rte(b)); float rcp2 = as_float(as_uint(rcp) + (32U << 23)); - - ulong q1 = convert_ulong_rte(convert_float_rte(as_int2(a).s1) * rcp2); + ulong q1 = convert_ulong(convert_float_rte(as_int2(a).s1) * rcp2); a -= q1 * as_uint(b); - - long q2 = convert_long_rte(convert_float_rtn(a) * rcp); + float q2f = convert_float_rte(as_int2(a >> 12).s0) * rcp; + q2f = as_float(as_uint(q2f) + (12U << 23)); + long q2 = convert_long_rte(q2f); int a2 = as_int2(a).s0 - as_int2(q2).s0 * b; - int q3 = convert_int_rte(convert_float_rte(a2) * rcp); q3 += (a2 - q3 * b) >> 31; - const long q = q1 + q2 + q3; return ((as_int2(_a).s1 ^ _b) < 0) ? -q : q; } #endif )===" - \ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl index 2c1b13865..c170387b4 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/fast_int_math_v2.cl @@ -42,6 +42,9 @@ static const __constant uint RCP_C[256] = 0x38c62ffu,0x41a841ebu,0x286478bu,0x41244166u,0x1823b84u,0x40a140e2u,0x803883u,0x401C4060u, }; +// Rocm produce invalid results if get_reciprocal without lookup table is used +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) + inline uint get_reciprocal(const __local uchar *RCP, uint a) { const uint index1 = (a & 0x7F000000U) >> 21; @@ -66,63 +69,61 @@ inline uint get_reciprocal(const __local uchar *RCP, uint a) return as_uint2(k).s1 + (b ? r : 0); } -inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) -{ - const uint r = get_reciprocal((const __local uchar *)RCP, b); - const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; - - ulong q; - ((uint*)&q)[0] = as_uint2(k).s1; - -#if defined(cl_amd_device_attribute_query) && (OPENCL_DRIVER_MAJOR == 14) - /* The AMD driver 14.XX is not able to compile `(k < a)` - * https://github.com/fireice-uk/xmr-stak/issues/1922 - * This is a workaround for the broken compiler. - */ - ulong whyAMDwhy; - ((uint*)&whyAMDwhy)[0] = as_uint2(k).s0; - ((uint*)&whyAMDwhy)[1] = as_uint2(k).s1; - ((uint*)&q)[1] = (whyAMDwhy < a) ? 1U : 0U; #else - ((uint*)&q)[1] = (k < a) ? 1U : 0U; -#endif - - const long tmp = a - q * b; - const bool overshoot = (tmp < 0); - const bool undershoot = (tmp >= b); - - return (uint2)( - as_uint2(q).s0 + (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U), - as_uint2(tmp).s0 + (overshoot ? b : 0U) - (undershoot ? 
b : 0U) - ); -} -inline uint fast_sqrt_v2(const ulong n1) +inline uint get_reciprocal(uint a) { - float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + const float a_hi = as_float((a >> 8) + ((126U + 31U) << 23)); + const float a_lo = convert_float_rte(a & 0xFF); + const float r = native_recip(a_hi); + const float r_scaled = as_float(as_uint(r) + (64U << 23)); + const float h = fma(a_lo, r, fma(a_hi, r, -1.0f)); + return (as_uint(r) << 9) - convert_int_rte(h * r_scaled); +} - float x1 = native_rsqrt(x); - x = native_sqrt(x); +#endif - // The following line does x1 *= 4294967296.0f; - x1 = as_float(as_uint(x1) + (32U << 23)); +#if defined(__clang__) && !defined(__NV_CL_C_VERSION) - const uint x0 = as_uint(x) - (158U << 23); - const long delta0 = n1 - (((long)(x0) * x0) << 18); - const float delta = convert_float_rte(as_int2(delta0).s1) * x1; +inline uint2 fast_div_v2(const __local uint *RCP, ulong a, uint b) +{ + const uint r = get_reciprocal((const __local uchar *)RCP, b); - uint result = (x0 << 10) + convert_int_rte(delta); - const uint s = result >> 1; - const uint b = result & 1; +#else - const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; - if ((long)(x2 + b) > 0) --result; - if ((long)(x2 + 0x100000000UL + s) < 0) ++result; +inline uint2 fast_div_v2(ulong a, uint b) +{ + const uint r = get_reciprocal(b); - return result; +#endif + + const ulong k = mul_hi(as_uint2(a).s0, r) + ((ulong)(r) * as_uint2(a).s1) + a; + const uint q = as_uint2(k).s1; + long tmp = a - ((ulong)(q) * b); + ((int*)&tmp)[1] -= (as_uint2(k).s1 < as_uint2(a).s1) ? b : 0; + const int overshoot = ((int*)&tmp)[1] >> 31; + const int undershoot = as_int2(as_uint(b - 1) - tmp).s1 >> 31; + return (uint2)(q + overshoot - undershoot, as_uint2(tmp).s0 + (as_uint(overshoot) & b) - (as_uint(undershoot) & b)); +} +inline uint fast_sqrt_v2(const ulong n1) +{ + float x = as_float((as_uint2(n1).s1 >> 9) + ((64U + 127U) << 23)); + float x1 = native_rsqrt(x); + x = native_sqrt(x); + // The following line does x1 *= 4294967296.0f; + x1 = as_float(as_uint(x1) + (32U << 23)); + const uint x0 = as_uint(x) - (158U << 23); + const long delta0 = n1 - (as_ulong((uint2)(mul24(x0, x0), mul_hi(x0, x0))) << 18); + const float delta = convert_float_rte(as_int2(delta0).s1) * x1; + uint result = (x0 << 10) + convert_int_rte(delta); + const uint s = result >> 1; + const uint b = result & 1; + const ulong x2 = (ulong)(s) * (s + b) + ((ulong)(result) << 32) - n1; + if ((long)(x2 + as_int(b - 1)) >= 0) --result; + if ((long)(x2 + 0x100000000UL + s) < 0) ++result; + return result; } #endif )===" - \ No newline at end of file diff --git a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl index 50e861e23..b99b62d5c 100644 --- a/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl +++ b/xmrstak/backend/amd/amd_gpu/opencl/wolf-aes.cl @@ -74,42 +74,62 @@ static const __constant uint AES0_C[256] = #define BYTE(x, y) (amd_bfe((x), (y) << 3U, 8U)) -inline uint4 AES_Round_bittube2(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, uint4 x, uint4 k) -{ - x = ~x; - k.s0 ^= AES0[BYTE(x.s0, 0)] ^ AES1[BYTE(x.s1, 1)] ^ AES2[BYTE(x.s2, 2)] ^ AES3[BYTE(x.s3, 3)]; - x.s0 ^= k.s0; - k.s1 ^= AES0[BYTE(x.s1, 0)] ^ AES1[BYTE(x.s2, 1)] ^ AES2[BYTE(x.s3, 2)] ^ AES3[BYTE(x.s0, 3)]; - x.s1 ^= k.s1; - k.s2 ^= AES0[BYTE(x.s2, 0)] ^ AES1[BYTE(x.s3, 1)] ^ AES2[BYTE(x.s0, 2)] ^ AES3[BYTE(x.s1, 3)]; - x.s2 ^= k.s2; - k.s3 ^= AES0[BYTE(x.s3, 0)] ^ 
AES1[BYTE(x.s0, 1)] ^ AES2[BYTE(x.s1, 2)] ^ AES3[BYTE(x.s2, 3)]; - return k; -} - uint4 AES_Round(const __local uint *AES0, const __local uint *AES1, const __local uint *AES2, const __local uint *AES3, const uint4 X, uint4 key) { key.s0 ^= AES0[BYTE(X.s0, 0)]; - key.s1 ^= AES0[BYTE(X.s1, 0)]; - key.s2 ^= AES0[BYTE(X.s2, 0)]; - key.s3 ^= AES0[BYTE(X.s3, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; key.s0 ^= AES2[BYTE(X.s2, 2)]; - key.s1 ^= AES2[BYTE(X.s3, 2)]; - key.s2 ^= AES2[BYTE(X.s0, 2)]; - key.s3 ^= AES2[BYTE(X.s1, 2)]; + key.s1 ^= AES2[BYTE(X.s3, 2)]; + key.s2 ^= AES2[BYTE(X.s0, 2)]; + key.s3 ^= AES2[BYTE(X.s1, 2)]; key.s0 ^= AES1[BYTE(X.s1, 1)]; - key.s1 ^= AES1[BYTE(X.s2, 1)]; - key.s2 ^= AES1[BYTE(X.s3, 1)]; - key.s3 ^= AES1[BYTE(X.s0, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; key.s0 ^= AES3[BYTE(X.s3, 3)]; - key.s1 ^= AES3[BYTE(X.s0, 3)]; - key.s2 ^= AES3[BYTE(X.s1, 3)]; - key.s3 ^= AES3[BYTE(X.s2, 3)]; + key.s1 ^= AES3[BYTE(X.s0, 3)]; + key.s2 ^= AES3[BYTE(X.s1, 3)]; + key.s3 ^= AES3[BYTE(X.s2, 3)]; + + return key; +} + +uint4 AES_Round2(const __local uint *AES0, const __local uint *AES1, const uint4 X, uint4 key) +{ + key.s0 ^= AES0[BYTE(X.s0, 0)]; + key.s1 ^= AES0[BYTE(X.s1, 0)]; + key.s2 ^= AES0[BYTE(X.s2, 0)]; + key.s3 ^= AES0[BYTE(X.s3, 0)]; + + key.s0 ^= rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u); + key.s1 ^= rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u); + key.s2 ^= rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u); + key.s3 ^= rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u); + + key.s0 ^= AES1[BYTE(X.s1, 1)]; + key.s1 ^= AES1[BYTE(X.s2, 1)]; + key.s2 ^= AES1[BYTE(X.s3, 1)]; + key.s3 ^= AES1[BYTE(X.s0, 1)]; + + return key; +} + +uint4 AES_Round2_bittube2(const __local uint *AES0, const __local uint *AES1, uint4 X, uint4 key) +{ + key.s0 ^= AES0[BYTE(X.s0, 0)] ^ rotate(AES0[BYTE(X.s2, 2)] ^ AES1[BYTE(X.s3, 3)], 16u) ^ AES1[BYTE(X.s1, 1)]; + X.s0 ^= key.s0; + key.s1 ^= AES0[BYTE(X.s1, 0)] ^ rotate(AES0[BYTE(X.s3, 2)] ^ AES1[BYTE(X.s0, 3)], 16u) ^ AES1[BYTE(X.s2, 1)]; + X.s1 ^= key.s1; + key.s2 ^= AES0[BYTE(X.s2, 0)] ^ rotate(AES0[BYTE(X.s0, 2)] ^ AES1[BYTE(X.s1, 3)], 16u) ^ AES1[BYTE(X.s3, 1)]; + X.s2 ^= key.s2; + key.s3 ^= AES0[BYTE(X.s3, 0)] ^ rotate(AES0[BYTE(X.s1, 2)] ^ AES1[BYTE(X.s2, 3)], 16u) ^ AES1[BYTE(X.s0, 1)]; - return key; + return key; } #endif diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index c5b331c87..ba4cebb7b 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -134,6 +134,13 @@ class autoAdjust ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_monero_v8 || ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_monero_v8; + // true for all cryptonight_heavy derivates since we check the user and dev pool + bool useCryptonight_heavy = + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgoRoot() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgo() == cryptonight_heavy || + ::jconf::inst()->GetCurrentCoinSelection().GetDescription(0).GetMiningAlgoRoot() == cryptonight_heavy; + // set strided index to default ctx.stridedIndex = 1; @@ -144,19 +151,36 @@ class autoAdjust // use 
chunked (4x16byte) scratchpad for all backends. Default `mem_chunk` is `2` if(useCryptonight_v8) ctx.stridedIndex = 2; + else if(useCryptonight_heavy) + ctx.stridedIndex = 3; // increase all intensity limits by two for aeon if(::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_lite) maxThreads *= 2u; - // keep 128MiB memory free (value is randomly chosen) - size_t availableMem = ctx.freeMem - minFreeMem; + // keep 128MiB memory free (value is randomly chosen) from the max available memory + const size_t maxAvailableFreeMem = ctx.freeMem - minFreeMem; + + size_t memPerThread = std::min(ctx.maxMemPerAlloc, maxAvailableFreeMem); + + uint32_t numThreads = 1u; + if(ctx.isAMD) + { + numThreads = 2; + size_t memDoubleThread = maxAvailableFreeMem / numThreads; + memPerThread = std::min(memPerThread, memDoubleThread); + } + // 224byte extra memory is used per thread for meta data size_t perThread = hashMemSize + 224u; - size_t maxIntensity = availableMem / perThread; + size_t maxIntensity = memPerThread / perThread; size_t possibleIntensity = std::min( maxThreads , maxIntensity ); // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; + // in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units + if(numThreads == 2) + intensity = (possibleIntensity / 8) * 8; + //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity if (intensity == 0) { @@ -166,18 +190,22 @@ class autoAdjust } if (intensity != 0) { - conf += std::string(" // gpu: ") + ctx.name + " memory:" + std::to_string(availableMem / byteToMiB) + "\n"; - conf += std::string(" // compute units: ") + std::to_string(ctx.computeUnits) + "\n"; - // set 8 threads per block (this is a good value for the most gpus) - conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + - " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" - " \"unroll\" : 8, \"comp_mode\" : true\n" + - " },\n"; + for(uint32_t thd = 0; thd < numThreads; ++thd) + { + conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n"; + conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" + + std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; + // set 8 threads per block (this is a good value for the most gpus) + conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" + " \"unroll\" : 8, \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" + + " },\n"; + } } else { - printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(availableMem / byteToMiB).c_str()); + printer::inst()->print_msg(L0, "WARNING: Ignore gpu %s, %s MiB free memory is not enough to suggest settings.", ctx.name.c_str(), std::to_string(memPerThread / 
diff --git a/xmrstak/backend/amd/config.tpl b/xmrstak/backend/amd/config.tpl
index 421e0ed4b..26a8ef48a 100644
--- a/xmrstak/backend/amd/config.tpl
+++ b/xmrstak/backend/amd/config.tpl
@@ -7,6 +7,8 @@ R"===(// generated by XMRSTAK_VERSION
 * worksize - Number of local GPU threads (nothing to do with CPU threads)
 * affine_to_cpu - This will affine the thread to a CPU. This can make a GPU miner play along nicer with a CPU miner.
 * strided_index - switch memory pattern used for the scratch pad memory
+ *          3 = chunked memory, chunk size based on the 'worksize'
+ *              required: intensity must be a multiple of worksize
 *          2 = chunked memory, chunk size is controlled by 'mem_chunk'
 *              required: intensity must be a multiple of worksize
 *          1 or true = use 16byte contiguous memory per thread, the next memory block has offset of intensity blocks
@@ -20,10 +22,16 @@ R"===(// generated by XMRSTAK_VERSION
 *             to use a intensity which is not the multiple of the worksize.
 *             If you set false and the intensity is not multiple of the worksize the miner can crash:
 *             in this case set the intensity to a multiple of the worksize or activate comp_mode.
+ * interleave - Controls the starting point in time between two threads on the same GPU device relative to the last started thread.
+ *              This option only has an effect if two compute threads are using the same GPU device: valid range [0;100]
+ *              0  = disable thread interleaving
+ *              40 = each working thread waits until 40% of the hash calculation of the previously started thread is finished
 * "gpu_threads_conf" :
 * [
-*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
-*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true },
+*	{ "index" : 0, "intensity" : 1000, "worksize" : 8, "affine_to_cpu" : false,
+*	  "strided_index" : true, "mem_chunk" : 2, "unroll" : 8, "comp_mode" : true,
+*	  "interleave" : 40
+*	},
 * ],
 * If you do not wish to mine with your AMD GPU(s) then use:
 * "gpu_threads_conf" :
@@ -34,6 +42,16 @@ R"===(// generated by XMRSTAK_VERSION
 GPUCONFIG
 ],
 
+/*
+ * number of rounds per intensity that are performed to find the best intensity setting
+ *
+ * WARNING: experimental option
+ *
+ * 0 = disable auto tuning
+ * 10 or higher = recommended value if you don't already know the best intensity
+ */
+"auto_tune" : 0,
+
 /*
  * Platform index. This will be 0 unless you have different OpenCL platform - eg. AMD and Intel.
  */
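Editor's note: a rough model of the new `interleave` option documented above (assumption: simplified; the real logic lives in `interleaveAdjustDelay` and works from measured kernel runtimes). With two threads on one GPU, a thread delays its next enqueue until the configured percentage of the sibling's average hash-round runtime has elapsed:

```
#include <cassert>
#include <cstdint>

uint64_t interleave_delay_ms(uint64_t avgRoundRuntimeMs,
                             uint64_t msSinceSiblingStart,
                             int interleavePercent /* valid range [0;100] */)
{
    if (interleavePercent <= 0)
        return 0; // 0 = disable thread interleaving
    const uint64_t target = avgRoundRuntimeMs * interleavePercent / 100;
    return target > msSinceSiblingStart ? target - msSinceSiblingStart : 0;
}

int main()
{
    // 200 ms round, interleave 40: a thread that starts 50 ms after its
    // sibling waits another 30 ms so the pair stays 40% out of phase
    assert(interleave_delay_ms(200, 50, 40) == 30);
    assert(interleave_delay_ms(200, 90, 40) == 0); // already past the mark
    return 0;
}
```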
diff --git a/xmrstak/backend/amd/jconf.cpp b/xmrstak/backend/amd/jconf.cpp
index 152f8add4..d3dc00d01 100644
--- a/xmrstak/backend/amd/jconf.cpp
+++ b/xmrstak/backend/amd/jconf.cpp
@@ -65,6 +65,19 @@ configVal oConfigValues[] = {
 
 constexpr size_t iConfigCnt = (sizeof(oConfigValues)/sizeof(oConfigValues[0]));
 
+
+enum optionalConfigEnum { iAutoTune };
+
+struct optionalConfigVal {
+	optionalConfigEnum iName;
+	const char* sName;
+	Type iType;
+};
+
+optionalConfigVal oOptionalConfigValues[] = {
+	{ iAutoTune, "auto_tune", kNumberType }
+};
+
 inline bool checkType(Type have, Type want)
 {
 	if(want == have)
@@ -106,7 +119,7 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	if(!oThdConf.IsObject())
 		return false;
 
-	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode;
+	const Value *idx, *intensity, *w_size, *aff, *stridedIndex, *memChunk, *unroll, *compMode, *interleave;
 	idx = GetObjectMember(oThdConf, "index");
 	intensity = GetObjectMember(oThdConf, "intensity");
 	w_size = GetObjectMember(oThdConf, "worksize");
@@ -115,11 +128,31 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	memChunk = GetObjectMember(oThdConf, "mem_chunk");
 	unroll = GetObjectMember(oThdConf, "unroll");
 	compMode = GetObjectMember(oThdConf, "comp_mode");
+	interleave = GetObjectMember(oThdConf, "interleave");
 
 	if(idx == nullptr || intensity == nullptr || w_size == nullptr || aff == nullptr || memChunk == nullptr ||
 		stridedIndex == nullptr || unroll == nullptr || compMode == nullptr)
 		return false;
 
+	// interleave is optional
+	if(interleave != nullptr)
+	{
+		if(!interleave->IsInt())
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be a number");
+			return false;
+		}
+		else if(interleave->GetInt() < 0 || interleave->GetInt() > 100)
+		{
+			printer::inst()->print_msg(L0, "ERROR: interleave must be in range [0;100]");
+			return false;
+		}
+		else
+		{
+			cfg.interleave = interleave->GetInt();
+		}
+	}
+
 	if(!idx->IsUint64() || !intensity->IsUint64() || !w_size->IsUint64())
 		return false;
 
@@ -137,9 +170,9 @@ bool jconf::GetThreadConfig(size_t id, thd_cfg &cfg)
 	else
 		cfg.stridedIndex = (int)stridedIndex->GetInt64();
 
-	if(cfg.stridedIndex > 2)
+	if(cfg.stridedIndex > 3)
 	{
-		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 2");
+		printer::inst()->print_msg(L0, "ERROR: strided_index must be smaller than 4");
 		return false;
 	}
 
@@ -179,6 +212,20 @@ size_t jconf::GetPlatformIdx()
 	return prv->configValues[iPlatformIdx]->GetUint64();
 }
 
+size_t jconf::GetAutoTune()
+{
+	const Value* value = GetObjectMember(prv->jsonDoc, oOptionalConfigValues[iAutoTune].sName);
+	if( value != nullptr && value->IsUint64())
+	{
+		return value->GetUint64();
+	}
+	else
+	{
+		printer::inst()->print_msg(L0, "WARNING: OpenCL optional option 'auto_tune' not available");
+	}
+	return 0;
+}
+
 size_t jconf::GetThreadCount()
 {
 	return prv->configValues[aGpuThreadsConf]->Size();
diff --git a/xmrstak/backend/amd/jconf.hpp b/xmrstak/backend/amd/jconf.hpp
index b852c5940..51a0c79ac 100644
--- a/xmrstak/backend/amd/jconf.hpp
+++ b/xmrstak/backend/amd/jconf.hpp
@@ -27,6 +27,7 @@ class jconf
 		size_t w_size;
 		long long cpu_aff;
 		int stridedIndex;
+		int interleave = 40;
 		int memChunk;
 		int unroll;
 		bool compMode;
@@ -35,6 +36,7 @@ class jconf
 
 	size_t GetThreadCount();
 	bool GetThreadConfig(size_t id, thd_cfg &cfg);
+	size_t GetAutoTune();
 	size_t GetPlatformIdx();
 
 private:
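Editor's note: a plain-C++ stand-in (assumption: the real code reads a rapidjson `Value`) for the optional-field handling added to `GetThreadConfig` above. A missing `interleave` keeps the default of 40; a present but out-of-range value rejects the whole thread config:

```
#include <cstdio>

struct ThreadCfg { int interleave = 40; };

bool apply_interleave(ThreadCfg& cfg, const int* maybeValue)
{
    if (maybeValue == nullptr)
        return true; // optional: default stays in effect
    if (*maybeValue < 0 || *maybeValue > 100)
    {
        std::fprintf(stderr, "ERROR: interleave must be in range [0;100]\n");
        return false; // reject the whole thread configuration
    }
    cfg.interleave = *maybeValue;
    return true;
}

int main()
{
    ThreadCfg cfg;
    const int tooBig = 150;
    if (!apply_interleave(cfg, &tooBig))
        std::printf("config rejected, default would have been %d\n", cfg.interleave);
    return 0;
}
```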
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 5e70f25a6..b0f4e6ecd 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -58,6 +58,7 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 	iTimestamp = 0;
 	pGpuCtx = ctx;
 	this->affinity = cfg.cpu_aff;
+	autoTune = jconf::inst()->GetAutoTune();
 
 	std::unique_lock<std::mutex> lck(thd_aff_set);
 	std::future<void> order_guard = order_fix.get_future();
@@ -100,6 +101,7 @@ bool minethd::init_gpus()
 		vGpuData[i].memChunk = cfg.memChunk;
 		vGpuData[i].compMode = cfg.compMode;
 		vGpuData[i].unroll = cfg.unroll;
+		vGpuData[i].interleave = cfg.interleave;
 	}
 
 	return InitOpenCL(vGpuData.data(), n, jconf::inst()->GetPlatformIdx()) == ERR_SUCCESS;
@@ -186,6 +188,19 @@ void minethd::work_main()
 	uint8_t version = 0;
 	size_t lastPoolId = 0;
 
+	pGpuCtx->maxRawIntensity = pGpuCtx->rawIntensity;
+
+	if(autoTune != 0)
+	{
+		pGpuCtx->rawIntensity = pGpuCtx->computeUnits * pGpuCtx->workSize;
+		pGpuCtx->rawIntensity = std::min(pGpuCtx->maxRawIntensity, pGpuCtx->rawIntensity);
+	}
+	// parameters needed for auto tuning
+	uint32_t cntTestRounds = 0;
+	uint64_t accRuntime = 0;
+	double bestHashrate = 0.0;
+	uint32_t bestIntensity = pGpuCtx->maxRawIntensity;
+
 	while (bQuit == 0)
 	{
 		if (oWork.bStall)
@@ -220,7 +235,6 @@ void minethd::work_main()
 			version = new_version;
 		}
 
-		uint32_t h_per_round = pGpuCtx->rawIntensity;
 		size_t round_ctr = 0;
 
 		assert(sizeof(job_result::sJobID) == sizeof(pool_job::sJobID));
@@ -236,12 +250,15 @@ void minethd::work_main()
 			//Allocate a new nonce every 16 rounds
 			if((round_ctr++ & 0xF) == 0)
 			{
-				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, h_per_round * 16);
+				globalStates::inst().calc_start_nonce(pGpuCtx->Nonce, oWork.bNiceHash, pGpuCtx->rawIntensity * 16);
 				// check if the job is still valid, there is a small possibility that the job is switched
 				if(globalStates::inst().iGlobalJobNo.load(std::memory_order_relaxed) != iJobNo)
 					break;
 			}
 
+			// if auto tuning is running we will not adjust the interleave interval
+			const bool adjustInterleave = autoTune == 0;
+			uint64_t t0 = interleaveAdjustDelay(pGpuCtx, adjustInterleave);
+
 			cl_uint results[0x100];
 			memset(results,0,sizeof(cl_uint)*(0x100));
 
@@ -269,6 +286,58 @@ void minethd::work_main()
 			uint64_t iStamp = get_timestamp_ms();
 			iHashCount.store(iCount, std::memory_order_relaxed);
 			iTimestamp.store(iStamp, std::memory_order_relaxed);
+
+			accRuntime += updateTimings(pGpuCtx, t0);
+
+			// tune intensity
+			if(autoTune != 0)
+			{
+				if(cntTestRounds++ == autoTune)
+				{
+					double avgHashrate = static_cast<double>(cntTestRounds * pGpuCtx->rawIntensity) / (static_cast<double>(accRuntime) / 1000.0);
+					if(avgHashrate > bestHashrate)
+					{
+						bestHashrate = avgHashrate;
+						bestIntensity = pGpuCtx->rawIntensity;
+					}
+
+					// always increase in workSize steps to avoid problems with the compatibility mode
+					pGpuCtx->rawIntensity += pGpuCtx->workSize;
+					// trigger a query for new nonces because the number of nonces previously allocated depends on the rawIntensity
+					round_ctr = 0x10;
+
+					if(pGpuCtx->rawIntensity > pGpuCtx->maxRawIntensity)
+					{
+						// lock intensity to the best value
+						autoTune = 0;
+						pGpuCtx->rawIntensity = bestIntensity;
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: lock intensity at %u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							bestIntensity
+						);
+					}
+					else
+					{
+						printer::inst()->print_msg(L1,"OpenCL %u|%u: auto-tune validate intensity %u|%u",
+							pGpuCtx->deviceIdx,
+							pGpuCtx->idWorkerOnDevice,
+							pGpuCtx->rawIntensity,
+							bestIntensity
+						);
+					}
+					// update gpu with new intensity
+					XMRSetJob(pGpuCtx, oWork.bWorkBlob, oWork.iWorkSize, target, miner_algo);
+				}
+				// use 3 rounds to warm up with the new intensity
+				else if(cntTestRounds == autoTune + 3)
+				{
+					// reset values for the next test period
+					cntTestRounds = 0;
+					accRuntime = 0;
+				}
+			}
+
 			std::this_thread::yield();
 		}
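Editor's note: a condensed model of the auto-tune sweep added to `work_main` above (assumption: simplified standalone code with a fake timing model; the real loop gets its runtimes from `updateTimings`). It measures `autoTune` rounds per candidate intensity, compares average hashrates, and steps the intensity up in `workSize` units until the configured maximum is reached:

```
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t workSize = 8, autoTune = 10, maxRawIntensity = 1000;
    uint32_t intensity = 8 * 14; // start near computeUnits * workSize
    uint32_t bestIntensity = maxRawIntensity;
    double bestHashrate = 0.0;

    for (; intensity <= maxRawIntensity; intensity += workSize)
    {
        // stand-in for `autoTune` measured rounds; in the miner this
        // accumulated runtime comes from the interleave timing helpers
        const uint64_t accRuntimeMs = 1000 + intensity / 4; // fake timing model
        const double hashrate =
            double(autoTune) * intensity / (double(accRuntimeMs) / 1000.0);
        if (hashrate > bestHashrate)
        {
            bestHashrate = hashrate;
            bestIntensity = intensity;
        }
    }
    std::printf("lock intensity at %u\n", bestIntensity);
    return 0;
}
```

Note the interaction with interleaving in the patch: while the sweep is running, the interleave interval is deliberately not adjusted (`adjustInterleave = autoTune == 0`), so the timing samples stay comparable between candidate intensities.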
diff --git a/xmrstak/backend/amd/minethd.hpp b/xmrstak/backend/amd/minethd.hpp
index 32e66ec87..74ab5fb60 100644
--- a/xmrstak/backend/amd/minethd.hpp
+++ b/xmrstak/backend/amd/minethd.hpp
@@ -39,6 +39,7 @@ class minethd : public iBackend
 
 	std::thread oWorkThd;
 	int64_t affinity;
+	uint32_t autoTune;
 
 	bool bQuit;
 	bool bNoPrefetch;
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 2b1741764..06cbe8740 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -182,7 +182,7 @@ void cn_explode_scratchpad(const __m128i* input, __m128i* output)
 	xin6 = _mm_load_si128(input + 10);
 	xin7 = _mm_load_si128(input + 11);
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for(size_t i=0; i < 16; i++)
 		{
@@ -326,11 +326,11 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
 		{
@@ -377,7 +377,7 @@ void cn_implode_scratchpad(const __m128i* input, __m128i* output)
 			aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
 		}
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
 	}
 
@@ -716,7 +716,7 @@ inline void set_float_rounding_mode()
 		((int64_t*)ptr0)[0] = u ^ q; \
 		idx0 = d ^ q; \
 	} \
-	else if(ALGO == cryptonight_haven) \
+	else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast) \
 	{ \
 		ptr0 = (__m128i *)&l0[idx0 & MASK]; \
 		int64_t u  = ((int64_t*)ptr0)[0]; \
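Editor's note: the hunks above repeat one condition in every heavy-style code path, and the new algorithm joins it each time. A sketch of that predicate factored out (assumption: a restatement for clarity; the patch itself keeps the condition inline): cryptonight_superfast takes the same explode/implode mixing and division tweak as the heavy family, even though it keeps a 2 MiB scratchpad.

```
enum demo_algo { heavy, haven, bittube2, superfast, monero_v8 };

constexpr bool uses_heavy_mix(demo_algo a)
{
    return a == heavy || a == haven || a == bittube2 || a == superfast;
}

static_assert(uses_heavy_mix(superfast), "superfast follows the heavy code paths");
static_assert(!uses_heavy_mix(monero_v8), "v8 keeps the regular paths");

int main() { return 0; }
```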
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 55879110a..20203a3c5 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -395,6 +395,13 @@ bool minethd::self_test()
 		hashf("\x85\x19\xe0\x39\x17\x2b\x0d\x70\xe5\xca\x7b\x33\x83\xd6\xb3\x16\x73\x15\xa4\x22\x74\x7b\x73\xf0\x19\xcf\x95\x28\xf0\xfd\xe3\x41\xfd\x0f\x2a\x63\x03\x0b\xa6\x45\x05\x25\xcf\x6d\xe3\x18\x37\x66\x9a\xf6\xf1\xdf\x81\x31\xfa\xf5\x0a\xaa\xb8\xd3\xa7\x40\x55\x89", 64, out, ctx);
 		bResult = bResult && memcmp(out, "\x90\xdc\x65\x53\x8d\xb0\x00\xea\xa2\x52\xcd\xd4\x1c\x17\x7a\x64\xfe\xff\x95\x36\xe7\x71\x68\x35\xd4\xcf\x5c\x73\x56\xb1\x2f\xcd", 32) == 0;
 	}
+	else if(algo == cryptonight_superfast)
+	{
+		hashf = func_selector(::jconf::inst()->HaveHardwareAes(), false, xmrstak_algo::cryptonight_superfast);
+		hashf("\x03\x05\xa0\xdb\xd6\xbf\x05\xcf\x16\xe5\x03\xf3\xa6\x6f\x78\x00\x7c\xbf\x34\x14\x43\x32\xec\xbf\xc2\x2e\xd9\x5c\x87\x00\x38\x3b\x30\x9a\xce\x19\x23\xa0\x96\x4b\x00\x00\x00\x08\xba\x93\x9a\x62\x72\x4c\x0d\x75\x81\xfc\xe5\x76\x1e\x9d\x8a\x0e\x6a\x1c\x3f\x92\x4f\xdd\x84\x93\xd1\x11\x56\x49\xc0\x5e\xb6\x01", 76, out, ctx);
+		bResult = bResult && memcmp(out, "\x40\x86\x5a\xa8\x87\x41\xec\x1d\xcc\xbd\x2b\xc6\xff\x36\xb9\x4d\x54\x71\x58\xdb\x94\x69\x8e\x3c\xa0\x3d\xe4\x81\x9a\x65\x9f\xef", 32) == 0;
+	}
 
 	if(!bResult)
 		printer::inst()->print_msg(L0,
@@ -520,6 +527,9 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 	case cryptonight_monero_v8:
 		algv = 10;
 		break;
+	case cryptonight_superfast:
+		algv = 11;
+		break;
 	default:
 		algv = 2;
 		break;
@@ -579,7 +589,12 @@ minethd::cn_hash_fun minethd::func_multi_selector(bool bHaveAes, bool bNoPrefetc
 		Cryptonight_hash::template hash,
 		Cryptonight_hash::template hash,
 		Cryptonight_hash::template hash,
-		Cryptonight_hash::template hash
+		Cryptonight_hash::template hash,
+
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash,
+		Cryptonight_hash::template hash
 	};
 
 	std::bitset<2> digit;
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 6b1afa928..e905caa9f 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -16,7 +16,8 @@ enum xmrstak_algo
 	cryptonight_masari = 8, //equal to cryptonight_monero but with less iterations, used by masari
 	cryptonight_haven = 9, // equal to cryptonight_heavy with a small tweak
 	cryptonight_bittube2 = 10, // derived from cryptonight_heavy with own aes-round implementation and minor other tweaks
-	cryptonight_monero_v8 = 11
+	cryptonight_monero_v8 = 11,
+	cryptonight_superfast = 12
 };
 
 // define aeon settings
@@ -34,6 +35,8 @@ constexpr uint32_t CRYPTONIGHT_HEAVY_ITER = 0x40000;
 
 constexpr uint32_t CRYPTONIGHT_MASARI_ITER = 0x40000;
 
+constexpr uint32_t CRYPTONIGHT_SUPERFAST_ITER = 0x20000;
+
 template<xmrstak_algo ALGO>
 inline constexpr size_t cn_select_memory() { return 0; }
 
@@ -70,6 +73,9 @@ inline constexpr size_t cn_select_memory() { return CRYPTONIG
 template<>
 inline constexpr size_t cn_select_memory() { return CRYPTONIGHT_HEAVY_MEMORY; }
 
+template<>
+inline constexpr size_t cn_select_memory<cryptonight_superfast>() { return CRYPTONIGHT_MEMORY; }
+
 inline size_t cn_select_memory(xmrstak_algo algo)
 {
 	switch(algo)
@@ -79,6 +85,7 @@ inline size_t cn_select_memory(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MEMORY;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -129,6 +136,9 @@ inline constexpr uint32_t cn_select_mask() { return CRYPTONIG
 template<>
 inline constexpr uint32_t cn_select_mask() { return CRYPTONIGHT_HEAVY_MASK; }
 
+template<>
+inline constexpr uint32_t cn_select_mask<cryptonight_superfast>() { return CRYPTONIGHT_MASK; }
+
 inline size_t cn_select_mask(xmrstak_algo algo)
 {
 	switch(algo)
@@ -138,6 +148,7 @@ inline size_t cn_select_mask(xmrstak_algo algo)
 	case cryptonight_monero_v8:
 	case cryptonight_masari:
 	case cryptonight:
+	case cryptonight_superfast:
 		return CRYPTONIGHT_MASK;
 	case cryptonight_ipbc:
 	case cryptonight_aeon:
@@ -188,6 +199,9 @@ inline constexpr uint32_t cn_select_iter() { return CRYPTONIG
 template<>
 inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; }
 
+template<>
+inline constexpr uint32_t cn_select_iter<cryptonight_superfast>() { return CRYPTONIGHT_SUPERFAST_ITER; }
+
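Editor's note: a trimmed restatement of the `cn_select_*` pattern above (assumption: demo names; the classic iteration count of 0x80000 matches the existing `CRYPTONIGHT_ITER`). The specialization for the new algorithm pairs the regular 2 MiB scratchpad with the reduced 0x20000 iteration count, and everything resolves at compile time:

```
#include <cstdint>

enum xmrstak_algo_demo { demo_cryptonight = 1, demo_cryptonight_superfast = 12 };

constexpr uint32_t CN_ITER = 0x80000;
constexpr uint32_t CN_SUPERFAST_ITER = 0x20000;

template <xmrstak_algo_demo ALGO> constexpr uint32_t select_iter() { return 0; }
template <> constexpr uint32_t select_iter<demo_cryptonight>() { return CN_ITER; }
template <> constexpr uint32_t select_iter<demo_cryptonight_superfast>() { return CN_SUPERFAST_ITER; }

static_assert(select_iter<demo_cryptonight_superfast>() == CN_ITER / 4,
              "superfast runs a quarter of the classic iterations");

int main() { return 0; }
```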
 inline size_t cn_select_iter(xmrstak_algo algo)
 {
 	switch(algo)
@@ -207,6 +221,8 @@ inline size_t cn_select_iter(xmrstak_algo algo)
 		return CRYPTONIGHT_HEAVY_ITER;
 	case cryptonight_masari:
 		return CRYPTONIGHT_MASARI_ITER;
+	case cryptonight_superfast:
+		return CRYPTONIGHT_SUPERFAST_ITER;
 	default:
 		return 0;
 	}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
index e478600e3..199025635 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_aes.hpp
@@ -303,3 +303,9 @@ __device__ __forceinline__ static void cn_aes_gpu_init(uint32_t *sharedMemory)
 	for(int i = threadIdx.x; i < 1024; i += blockDim.x)
 		sharedMemory[i] = d_t_fn[i];
 }
+
+__device__ __forceinline__ static void cn_aes_gpu_init_half(uint32_t *sharedMemory)
+{
+	for(int i = threadIdx.x; i < 512; i += blockDim.x)
+		sharedMemory[i] = d_t_fn[i];
+}
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
index fa7e09364..87c1befa8 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_core.cu
@@ -9,6 +9,7 @@
 #include "xmrstak/jconf.hpp"
 
 #include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp"
+#include "xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp"
 
 #ifdef _WIN32
@@ -121,6 +122,11 @@ __device__ __forceinline__ void storeGlobal64( T* addr, T const & val )
 #endif
 }
 
+__device__ __forceinline__ uint32_t rotate16( const uint32_t n )
+{
+	return (n >> 16u) | (n << 16u);
+}
+
 template<xmrstak_algo ALGO>
 __global__ void cryptonight_core_gpu_phase1( int threads, int bfactor, int partidx, uint32_t * __restrict__ long_state, uint32_t * __restrict__ ctx_state2, uint32_t * __restrict__ ctx_key1 )
 {
@@ -267,9 +273,9 @@ __launch_bounds__( XMR_STAK_THREADS * 2 )
 __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b, uint32_t * d_ctx_state, uint32_t startNonce, uint32_t * __restrict__ d_input )
 {
-	__shared__ uint32_t sharedMemory[1024];
+	__shared__ uint32_t sharedMemory[512];
 
-	cn_aes_gpu_init( sharedMemory );
+	cn_aes_gpu_init_half( sharedMemory );
 
 #if( __CUDA_ARCH__ < 300 )
 	extern __shared__ uint64_t externShared[];
@@ -340,8 +346,8 @@ __global__ void cryptonight_core_gpu_phase2_double( int threads, int bfactor, in
 			const u64 cx2 = myChunks[ idx1 + ((sub + 1) & 1) ];
 
 			u64 cx_aes = ax0 ^ u64(
-				t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ t_fn2( (cx2.x >> 16) & 0xff ) ^ t_fn3( (cx2.y >> 24 ) ),
-				t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ t_fn2( (cx2.y >> 16) & 0xff ) ^ t_fn3( (cx.x >> 24 ) )
+				t_fn0( cx.x & 0xff ) ^ t_fn1( (cx.y >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.x >> 16) & 0xff ) ^ t_fn1( (cx2.y >> 24 ) )),
+				t_fn0( cx.y & 0xff ) ^ t_fn1( (cx2.x >> 8) & 0xff ) ^ rotate16(t_fn0( (cx2.y >> 16) & 0xff ) ^ t_fn1( (cx.x >> 24 ) ))
 			);
 
 			if(ALGO == cryptonight_monero_v8)
@@ -523,7 +529,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 		a = (d_ctx_a + thread * 4)[sub];
 		idx0 = shuffle<4>(sPtr,sub, a, 0);
 
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 		{
 			if(partidx != 0)
 			{
@@ -647,18 +653,18 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 			{
 				int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 				int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-				int64_t q = n / (d | 0x5);
+				int64_t q = fast_div_heavy(n, (d | 0x5));
 
 				if(sub&1)
 					storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
 
 				idx0 = d ^ q;
 			}
-			else if(ALGO == cryptonight_haven)
+			else if(ALGO == cryptonight_haven || ALGO == cryptonight_superfast)
 			{
 				int64_t n = loadGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3));
 				int32_t d = loadGlobal32( (uint32_t*)(( (uint64_t *) long_state ) + (( idx0 & MASK) >> 3) + 1u ));
-				int64_t q = n / (d | 0x5);
+				int64_t q = fast_div_heavy(n, (d | 0x5));
 
 				if(sub&1)
 					storeGlobal64( ( (uint64_t *) long_state ) + (( idx0 & MASK ) >> 3), n ^ q );
@@ -672,7 +678,7 @@ __global__ void cryptonight_core_gpu_phase2_quad( int threads, int bfactor, int
 	{
 		(d_ctx_a + thread * 4)[sub] = a;
 		(d_ctx_b + thread * 4)[sub] = d[1];
-		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+		if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 			if(sub&1)
 				*(d_ctx_b + threads * 4 + thread) = idx0;
 	}
@@ -718,7 +724,7 @@ __global__ void cryptonight_core_gpu_phase3( int threads, int bfactor, int parti
 
 	cn_aes_pseudo_round_mut( sharedMemory, text, key );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 #pragma unroll
 		for ( int j = 0; j < 4; ++j )
@@ -756,7 +762,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_core_gpu_phase1<ALGO><<< grid, block8 >>>( ctx->device_blocks*ctx->device_threads,
 			bfactorOneThree, i,
 			ctx->d_long_state,
-			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 ? ctx->d_ctx_state2 : ctx->d_ctx_state),
+			(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast ? ctx->d_ctx_state2 : ctx->d_ctx_state),
 			ctx->d_ctx_key1 ));
 
 		if ( partcount > 1 && ctx->device_bsleep > 0) compat_usleep( ctx->device_bsleep );
@@ -818,7 +824,7 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 
 	int roundsPhase3 = partcountOneThree;
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast )
 	{
 		// cryptonight_heavy used two full rounds over the scratchpad memory
 		roundsPhase3 *= 2;
@@ -840,9 +846,9 @@ void cryptonight_core_gpu_hash(nvid_ctx* ctx, uint32_t nonce)
 void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t startNonce)
 {
 	typedef void (*cuda_hash_fn)(nvid_ctx* ctx, uint32_t nonce);
-	
+
 	if(miner_algo == invalid_algo) return;
-	
+
 	static const cuda_hash_fn func_table[] = {
 		cryptonight_core_gpu_hash,
 		cryptonight_core_gpu_hash,
@@ -875,7 +881,10 @@ void cryptonight_core_cpu_hash(nvid_ctx* ctx, xmrstak_algo miner_algo, uint32_t
 		cryptonight_core_gpu_hash,
 		cryptonight_core_gpu_hash,
 
-		cryptonight_core_gpu_hash
+		cryptonight_core_gpu_hash,
+
+		cryptonight_core_gpu_hash,
+		cryptonight_core_gpu_hash
 	};
 
 	std::bitset<1> digit;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index 433e175dd..45afec9ac 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -114,7 +114,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	int thread = ( blockDim.x * blockIdx.x + threadIdx.x );
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -160,7 +160,7 @@ __global__ void cryptonight_extra_gpu_prepare( int threads, uint32_t * __restric
 	memcpy( d_ctx_key2 + thread * 40, ctx_key2, 40 * 4 );
 	memcpy( d_ctx_state + thread * 50, ctx_state, 50 * 4 );
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 
 		for(int i=0; i < 16; i++)
@@ -184,7 +184,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 
 	__shared__ uint32_t sharedMemory[1024];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		cn_aes_gpu_init( sharedMemory );
 		__syncthreads( );
@@ -201,7 +201,7 @@ __global__ void cryptonight_extra_gpu_final( int threads, uint64_t target, uint3
 	for ( i = 0; i < 50; i++ )
 		state[i] = ctx_state[i];
 
-	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2)
+	if(ALGO == cryptonight_heavy || ALGO == cryptonight_haven || ALGO == cryptonight_bittube2 || ALGO == cryptonight_superfast)
 	{
 		uint32_t key[40];
 
@@ -298,7 +298,8 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	if( cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 	{
 		// extent ctx_b to hold the state of idx0
@@ -349,6 +350,11 @@ extern "C" void cryptonight_extra_cpu_prepare(nvid_ctx* ctx, uint32_t startNonce
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
 			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<cryptonight_superfast><<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
+			ctx->d_ctx_state,ctx->d_ctx_state2, ctx->d_ctx_a, ctx->d_ctx_b, ctx->d_ctx_key1, ctx->d_ctx_key2 ));
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_KERNEL(ctx->device_id, cryptonight_extra_gpu_prepare<<<grid, block >>>( wsize, ctx->d_input, ctx->inputlen, startNonce,
@@ -396,6 +402,14 @@ extern "C" void cryptonight_extra_cpu_final(nvid_ctx* ctx, uint32_t startNonce,
 			cryptonight_extra_gpu_final<<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
 		);
 	}
+	else if(miner_algo == cryptonight_superfast)
+	{
+		CUDA_CHECK_MSG_KERNEL(
+			ctx->device_id,
+			"\n**suggestion: Try to increase the value of the attribute 'bfactor' in the NVIDIA config file.**",
+			cryptonight_extra_gpu_final<cryptonight_superfast><<<grid, block >>>( wsize, target, ctx->d_result_count, ctx->d_result_nonce, ctx->d_ctx_state,ctx->d_ctx_key2 )
+		);
+	}
 	else if(miner_algo == cryptonight_bittube2)
 	{
 		CUDA_CHECK_MSG_KERNEL(
@@ -676,7 +690,8 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 	if( cryptonight_heavy == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
 		cryptonight_haven == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
-		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
+		cryptonight_bittube2 == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() ||
+		cryptonight_superfast == ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo()
 	)
 		perThread += 50 * 4; // state double buffer
 
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
new file mode 100644
index 000000000..555ccbef2
--- /dev/null
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_div_heavy.hpp
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstdint>
+
+
+__device__ __forceinline__ int64_t fast_div_heavy(int64_t _a, int _b)
+{
+
+	uint64_t a = abs(_a);
+	int b = abs(_b);
+
+	float rcp = __frcp_rn(__int2float_rn(b));
+	float rcp2 = __uint_as_float(__float_as_uint(rcp) + (32U << 23));
+
+	uint64_t q1 = __float2ull_rz(__int2float_rn(((int*)&a)[1]) * rcp2);
+	a -= q1 * static_cast<uint64_t>(b);
+
+	uint64_t tmp = a >> 12;
+	float q2f = __int2float_rn(((int*)&tmp)[0]) * rcp;
+	q2f = __uint_as_float(__float_as_uint(q2f) + (12U << 23));
+	int64_t q2 = __float2ll_rn(q2f);
+	int a2 = ((int*)&a)[0] - ((int*)&q2)[0] * b;
+
+	int q3 = __float2int_rn(__int2float_rn(a2) * rcp);
+	q3 += (a2 - q3 * b) >> 31;
+
+	const uint64_t q = q1 + q2 + q3;
+	return ((((int*)&_a)[1] ^ _b) < 0) ? -q : q;
+}
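Editor's note: a reference model of what the new `fast_div_heavy` above computes (assumption: illustration only, not from the patch). The kernel does a truncated signed 64/32-bit division on absolute values via float reciprocals, then restores the sign from the XOR of the operand signs, the same rule it applies with `((((int*)&_a)[1] ^ _b) < 0)`:

```
#include <cstdint>
#include <cstdlib>
#include <cassert>

static int64_t div_heavy_reference(int64_t a, int32_t b)
{
    const uint64_t q = (uint64_t)std::llabs(a) / (uint32_t)std::abs(b);
    const bool negative = ((a >> 63) ^ (int64_t)b) < 0; // signs differ
    return negative ? -(int64_t)q : (int64_t)q;
}

int main()
{
    assert(div_heavy_reference(100, -7)  == -14);
    assert(div_heavy_reference(-100, -7) ==  14);
    assert(div_heavy_reference(-1, 5)    ==   0); // truncation toward zero
    return 0;
}
```

The caller always passes `(d | 0x5)` as the divisor, so the divide-by-zero case never arises in the heavy loop.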
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
index 796b7adda..0d54f1436 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_fast_int_math_v2.hpp
@@ -7,8 +7,7 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 	const float a_hi = __uint_as_float((a >> 8) + ((126U + 31U) << 23));
 	const float a_lo = __uint2float_rn(a & 0xFF);
 
-	float r;
-	asm("rcp.approx.f32 %0, %1;" : "=f"(r) : "f"(a_hi));
+	float r = __frcp_rn(a_hi);
 
 	const float r_scaled = __uint_as_float(__float_as_uint(r) + (64U << 23));
 	const float h = __fmaf_rn(a_lo, r, __fmaf_rn(a_hi, r, -1.0f));
@@ -18,21 +17,22 @@ __device__ __forceinline__ uint32_t get_reciprocal(uint32_t a)
 __device__ __forceinline__ uint64_t fast_div_v2(uint64_t a, uint32_t b)
 {
 	const uint32_t r = get_reciprocal(b);
-	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * ((uint32_t*)&a)[1]) + a;
+	const uint32_t a1 = ((uint32_t*)&a)[1];
+	const uint64_t k = __umulhi(((uint32_t*)&a)[0], r) + ((uint64_t)(r) * a1) + a;
 
-	uint32_t q[2];
-	q[0] = ((uint32_t*)&k)[1];
+	const uint32_t q = ((uint32_t*)&k)[1];
+	int64_t tmp = a - ((uint64_t)(q) * b);
+	((int32_t*)(&tmp))[1] -= q < a1 ? b : 0;
+
+	const int overshoot = ((int*)(&tmp))[1] >> 31;
+	const int64_t tmp_u = (uint32_t)(b - 1) - tmp;
+	const int undershoot = ((int*)&tmp_u)[1] >> 31;
 
-	int64_t tmp = a - (uint64_t)(q[0]) * b;
-	((int32_t*)(&tmp))[1] -= (k < a) ? b : 0;
+	uint64_t result;
+	((uint32_t*)&result)[0] = q + overshoot - undershoot;
+	((uint32_t*)&result)[1] = ((uint32_t*)(&tmp))[0] + ((uint32_t)(overshoot) & b) - ((uint32_t)(undershoot) & b);
 
-	const bool overshoot = ((int32_t*)(&tmp))[1] < 0;
-	const bool undershoot = tmp >= b;
-
-	q[0] += (undershoot ? 1U : 0U) - (overshoot ? 1U : 0U);
-	q[1] = ((uint32_t*)(&tmp))[0] + (overshoot ? b : 0U) - (undershoot ? b : 0U);
-
-	return *((uint64_t*)(q));
+	return result;
 }
 
 __device__ __forceinline__ uint32_t fast_sqrt_v2(const uint64_t n1)
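Editor's note: the rewrite above replaces the boolean overshoot/undershoot checks with sign-bit masks. A host-side illustration of the branchless +/-1 correction (assumption: not part of the patch; the shift widths differ because this operates on a full 64-bit remainder): after an approximate quotient `q` of `a/b`, the remainder's sign decides whether `q` is one too high (overshoot) or one too low (undershoot), and arithmetic right shifts turn those sign bits into all-ones masks so no branch is needed:

```
#include <cstdint>
#include <cassert>

static uint64_t correct_quotient(uint64_t a, uint32_t b, uint64_t q)
{
    int64_t rem = (int64_t)(a - q * b); // for a +/-1 error, rem is in [-b, 2b)
    const int64_t overshoot  = rem >> 63;                      // -1 if rem < 0
    const int64_t undershoot = ((int64_t)(b - 1) - rem) >> 63; // -1 if rem >= b
    return q + overshoot - undershoot;
}

int main()
{
    // feed deliberately off-by-one estimates and check they are repaired
    assert(correct_quotient(50, 7, 8) == 7); // overshoot
    assert(correct_quotient(50, 7, 6) == 7); // undershoot
    assert(correct_quotient(50, 7, 7) == 7); // already exact
    return 0;
}
```

The kernel applies the same masks to the remainder (`rem + (overshoot & b) - (undershoot & b)`) and packs quotient and remainder into the low and high words of the 64-bit result.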
diff --git a/xmrstak/jconf.cpp b/xmrstak/jconf.cpp
index ca2fa9585..2a2dc8dbc 100644
--- a/xmrstak/jconf.cpp
+++ b/xmrstak/jconf.cpp
@@ -98,15 +98,17 @@ xmrstak::coin_selection coins[] = {
 	{ "cryptonight_lite", {cryptonight_aeon, cryptonight_lite, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
 	{ "cryptonight_lite_v7_xor", {cryptonight_aeon, cryptonight_ipbc, 255u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr },
-	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
-	{ "cryptonight_v8", {cryptonight_monero, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "cryptonight_superfast", {cryptonight_heavy, cryptonight_superfast, 255u},{cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "cryptonight_v7", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
+	{ "cryptonight_v8", {cryptonight_monero_v8, cryptonight_monero_v8, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "cryptonight_v7_stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
-	{ "graft", {cryptonight_monero_v8, cryptonight_monero, 11u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "freehaven", {cryptonight_heavy, cryptonight_superfast, 255u}, {cryptonight_heavy, cryptonight_superfast, 0u}, nullptr },
+	{ "graft", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "haven", {cryptonight_heavy, cryptonight_haven, 255u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
-	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "intense", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "masari", {cryptonight_monero_v8, cryptonight_masari, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u},nullptr },
-	{ "monero", {cryptonight_monero_v8, cryptonight_monero, 8u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, "pool.usxmrpool.com:3333" },
-	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero, 8u}, nullptr },
+	{ "monero", {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, "pool.usxmrpool.com:3333" },
+	{ "qrl", {cryptonight_monero_v8, cryptonight_monero, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "ryo", {cryptonight_heavy, cryptonight_heavy, 0u}, {cryptonight_heavy, cryptonight_heavy, 0u}, nullptr },
 	{ "stellite", {cryptonight_monero_v8, cryptonight_stellite, 255u}, {cryptonight_monero_v8, cryptonight_monero_v8, 0u}, nullptr },
 	{ "turtlecoin", {cryptonight_aeon, cryptonight_aeon, 0u}, {cryptonight_aeon, cryptonight_aeon, 0u}, nullptr }
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index d20ba082f..406c535d2 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -706,6 +706,9 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	case cryptonight_masari:
 		algo_name = "cryptonight_masari";
 		break;
+	case cryptonight_superfast:
+		algo_name = "cryptonight_superfast";
+		break;
 	default:
 		algo_name = "unknown";
 		break;
"cryptonight_masari"; break; + case cryptonight_superfast: + algo_name = "cryptonight_superfast"; + break; default: algo_name = "unknown"; break; diff --git a/xmrstak/pools.tpl b/xmrstak/pools.tpl index e86e2a537..58762de56 100644 --- a/xmrstak/pools.tpl +++ b/xmrstak/pools.tpl @@ -24,6 +24,7 @@ POOLCONF], * aeon7 (use this for Aeon's new PoW) * bbscoin (automatic switch with block version 3 to cryptonight_v7) * bittube (uses cryptonight_bittube2 algorithm) + * freehaven * graft * haven (automatic switch with block version 3 to cryptonight_haven) * intense @@ -41,6 +42,7 @@ POOLCONF], * cryptonight_lite_v7_xor (algorithm used by ipbc) * # 2MiB scratchpad memory * cryptonight + * cryptonight_superfast * cryptonight_v7 * cryptonight_v8 * # 4MiB scratchpad memory diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 47d676350..09bfe8d8f 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -3,7 +3,7 @@ //! git will put "#define GIT_ARCHIVE 1" on the next line inside archives. #define GIT_ARCHIVE 1 #if defined(GIT_ARCHIVE) && !defined(GIT_COMMIT_HASH) -#define GIT_COMMIT_HASH 871371622 +#define GIT_COMMIT_HASH 7d6cd31b1 #endif #ifndef GIT_COMMIT_HASH @@ -19,7 +19,7 @@ #endif #define XMR_STAK_NAME "xmr-stak" -#define XMR_STAK_VERSION "2.6.0" +#define XMR_STAK_VERSION "2.7.1" #if defined(_WIN32) #define OS_TYPE "win"