From 140260262ed0cc6e9cda98e643c238783f10ed98 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 21 Mar 2024 19:55:58 -0700 Subject: [PATCH] xxh --- default.pgo | Bin 1775 -> 2865 bytes go.mod | 1 + go.sum | 2 ++ main.go | 33 +++++++++++++++------------------ 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/default.pgo b/default.pgo index d17ab390d873622221ba7b7f7fa0338d28beedd9..2566ae6aaacbe42c350f05a319fe68e61df0a783 100644 GIT binary patch literal 2865 zcmV-13(oW(iwFP!00004|EyPQa2(Zg&T4n1)~p^qD@$rgmNYt+rO{&VT4Py$f?wbV z$Y5*?<|#(2@oJ;pnXQ?XANjQ@Trr`X*hK0Oaoinqe$2Bgl@BO!P>&&>;AgW*)`9Oee%NdZ~y(-dMN|}ERjMG z#L)V4|MJc8h6lv&M+-0fXNVC5sPOcghmeIKfTsd13?=xN0Sv?<5Wp~nl~VO2|0uyS zDpN|;1A%A+kJe)0@`-o1K!6dH(1qt*2SMCf!om>3ORj?;0)bMhK2{QoKnTmJTq#vQ z<2FG$;s-rF9yA)k5Ec&qZVi+`kjW6DtN-@N7_u;g@k&sD3Zq1l5>dYmGK4U_{3qwf zkcFTWS7RV1K`B;Hg;J?@G8szge?EF|j9ZuCD=wlktfDHVT0O`?l+joJ_mS`pc)pT_ zp&Z|?7eN4#P?6QMm9YqvW0azbtiD;#WB@vF?7xJKgu7bAVF_!fMv>L4%}j=J+W(%r zAHk!oVn2e@XqqCc2V0p85~j#Cob%NL|7RD(U~PRikJH z29WDO3RYC|gjdnt18%SoeilGpkTNX%-Aipy1w7%^wEKG=CIprE5OVu!EF3tpS_)QI z1{nd;QO~{#zvXoe;pZGp1gfxs8kAD?>j^$l6j`Q2o{b-~1PiB+mV*r3XOv!h-o;dn zdt7~e>H4h3Mru?_)mvJn0Dr{=#@>8x&0N>|0Tv10QY{9}m{mz~M)G&f+ z^xVGUsfj8q_p$=hm<+WvIu%|i7JmQubf^`f>gb}kUxinAD3@|Hf)Oly{OIAT9LUZb}N+O1UZ5v>u3f>_vfunJlQB8B#i@vMX=kC;GyO!Ldks`4fLbdJm zGq0W@A;w7q|xhZYbHbJpbLJXb_l>OZ7J9x+Od;f^1J#;-i{Ss+N5BoV5HK& z`iv$k*sFrd5W47upWF=-liMYd8>3N=Nfmz2v*MS?1p_gW+?%NI)TJ>#Yh^4G%9OF0 znw1u{G#c>|4L6A!VHSP(M{cW5PF=(UMO09~bj)s{GbLbDwxx>E(f18gB32Ly7+LUTF%bKB^ z_MGN(It0^@u@KZ^iiM#L|AWtLi7ygUW~gm332L#O+LdZ`ODqC)IFn{7vii@hj6kEm z9{pbdp&q|%Wnm!v8xTn9v4c7kS-sR6ivVFKbt zexR4hFrQBNQJ&#WD!g4OIA26rKqsa~@yr(paV(@meleGcV!FkWf(u147ttYK=&4Ej zi^Tq7I_ekOm8jdfuvkcR2|e!jkpVq@tA-`qx`qYGY#>+&E<+RU7yn7O2&&_fOvU*rSmoM=r zNC+BnCJRFo{sn)5NYIEeiYYg#?Xd_n;Vhb^$m)Ln09i@bo8&$MD7TImIrLMv8k za~Z*{bl9Uey-4q`cmqUWI?krqN?iT#!1xt+D?_-AcKcl2>FaU14H{TItfJT6a@)C? z&v1SvFA_!ne~Yuj2mDa^t}5z~2sc&?&zRdnc8_J9dXS)Y@`c6G9kP}8<`CE(ADGaOS}d5hD7ySa1PB;BI=$HL)b*S zo^fB|t$2^P9a^!Qx|K?G9h2c+`s%zpj1@fKYEeOrG^JYI8(=crM@K&9Z4ibw{P#NH zy$$EmTt!yDsbw;3rk5|f`%U-{^~g=`rx(xg0Uj2^7Xwk*gDb&3o9?}D^UC}AQO3H{=#~QUOdeSg5IWrkWGlVB#!r};{?+A(4FKlsqYqLCk=m(RPb zz5gd2c&1h)rvn$z0!3Dj^W^l?KR(N&g`pE)Z4&#PxR4eqvU-G=43E;Tx7>XRM_WYl zbh>)MT~hH~MVP3#h!!cb`i?RoAE9vRBmIx+Jd*>o*PAMV;#A>(6UPV1Kw}gFSSY+R zhEyngeF)XX3j*42Xl0I`&1DQn3n#4+Gp_}cxe+a5>ZauwNy{8`v}jgOn{hR^&9a9L zyDM&wm|9C>lVjLUBAd<_P9kSpznSPBu#*XKG-YX3;)gwA#w65Vl4i+3kdR>HB93C~T!+2QmL>h`v@$>GV~a@;Ypx|1=CoVM&n zyjs_EYFc^YS8$rKlEeKR7)M%JM4DZ%+r!%88$p?~tt9tVQ}pHJtsIYH3}!X-7n4KA z;7G>Ms){Q)-5&1GPJocqGZ}5^jX=h;&+TM zEO@vpzS_uZD}DljDgKvjog?69^;~Zcuh+CSBx$*PB*1dEvfverSVk z4&N9f4$#)k;eKn2h~}%-i{^_xWeH`k&E_%_7mFF0&;h2g&B^P@8{{oaId6ZsWG&ke ztC~NC(q`VUO+Ay)^Hw&UjH|J=W`3^L6<=v(c_S=02RClhb6VexWPXbCM1Duka5y4= zh-}k`jhsdqE0u6ERw}N>`m9vSu)E^7*|ueCxBN0;i(dGTlwtEgeZPk8F^%o{wdSC) zU27`pWzrqsq*JDz(W1poJ}Eh`k=*zipTORluL~!y+xhJH8E%)f4Lxs+=O<$&hqdv( zbMD?rP*0~uqy15%c;J)NJUwNA?K__aBZHWS?gcJk)| P00960z}5_F7Zv~j*oKUN literal 1775 zcmV~-w$Jvg@56In_U6sko|EkMR#6lQBubIe zAT=T?L_#2R?D;l#%XvT8T{nbZrbS69MM1fosL+YDNg)!Y2xz1rwLcKODg@}Cil(7L z3TY||qCkNng%(k?@2-7!Q7UNq=WhMZ%rnpX%ro!)2Os>;8*jhz%+*>!1OZb85z;6g zy#Mmm-&a1(zdzS~@)eO02=wU5W6Q`gAmC+zWgvy;1QeP@5HLd-Ijg)7{YYVsa&lIA zOsE#|r4j7@@)sxDATWXyeZLQIWeuR3#~gN_{^2G_F@iKbawhJQ#-$X?fQWCzUD5~y za#ndL)hvRDdCJRK<@E?yr0cf?E@B$H$Iq1^4bcTMbgG~B;)ec!-9Mb%1{sEsrS9^N zmyn5&!5j+B0%Wj21-YnXt3^y<_vO_*WI;8)_bzsAsj6{^Tt<^I$fog%Lppf~#(U5V(#L$2m~ z&;yL1L=PvxIs8|Y6mwqdU*cG0dbN-Bn<%Z1mB?6T5>)7&=buZH5AU*Z1GkhKeyqnLXfK{r>gUWU$fkdxe|0b_4;)VEbMI53bd006YIipJd_~+w1 zK7iLsECVHcp~jmG;0TS#HRWolS%eZ2kz7-rt1$@%>Fm$H!)wZTyv~!$I7*{(O?hXM zNiamGFURo;t~PkQf@3r$*OYS&Cc!X0aQ;WU9`UC!C&Y0YmnW3(3RT1mcCY=ZR>)OQ zstyhfGlCI1e=;7FaFa0;tBf#$8a@6lLr>JKanXrZAB(Zlcp@Hpv_EURKp--NQToPn zQJM%@{1mPWk8)Rz(dwxv9PRooZ7{|p7^h!79}6MjUt*yooTN#)ru?zaB$%L!UykEd zyh2>4D%Pnk*OX_8Nia#rFU9dPuGCRr64dGJ%kj}5UW>II#0E9wnsTwh2paUn%J;ao zL-_Xw%fK-H066jx$|TD*!^|{*OZgdhQ-Ph za@&=u=8m1a#;PD-w|i;{soTAk#&W}s(UZpR7rbj$1LMaOX5z-+nB zT%cEN!*W!mdBpV>O@B)DJC0u04h5zkXtvch1I_c@&ui1OeoM0r%TecDy=)tnqxv03 zJ=}3x(|UymJi`yneQnnW^+|r9G<%>tbdTkjcO4DQKp*luPH5StY6M4})jc`gs_W{sHt%YI-_jBU&5Mg?4By_fTMhJm^H#XfnN?e^ ztt}dM5c(ZYvz*X2JXL8PFuc#qyK1mtc&6I1oN#*2&i?Oa$8y5GpPf?gHbecc4P@qB z&0Cz;YL1xa=KoYCI2z2%8U7K=iMU8T^uV-@plzC-e&_9=y0-*% z+|Ya?zn3j4>?SqEDsDJQwSbDzOim(QISV8D|Q5SR0*0F!sS}^B2ZBsAz zPCUb3oUzwYXc_Ic{^8r9U<7vGlzkel1+%sINy|3N~`CeUqQ+mEdR)DWE3*^>Xr|+8xI>JpG>c z(lM#kzLb0queW2$u{;RP;I?wODv9p$=P0#wR R{{{d6|NjO+eeSdj007uUg+2fP diff --git a/go.mod b/go.mod index f45e8f0..786b706 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.22.0 toolchain go1.22.1 require ( + github.com/cespare/xxhash/v2 v2.2.0 github.com/kamstrup/intmap v0.2.0 go.coldcutz.net/go-stuff v0.0.0-20240222020121-e7bc41ea880c golang.org/x/exp v0.0.0-20240318143956-a85f2c67cd81 diff --git a/go.sum b/go.sum index 8bc5f3b..2fc03b1 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +1,5 @@ +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= diff --git a/main.go b/main.go index d117482..098c46b 100644 --- a/main.go +++ b/main.go @@ -12,6 +12,7 @@ import ( "sync" "syscall" + "github.com/cespare/xxhash/v2" "github.com/kamstrup/intmap" "go.coldcutz.net/go-stuff/utils" "golang.org/x/exp/maps" @@ -105,6 +106,7 @@ type stats struct { // 7.346 s ± 0.144 s - swiss map // 4.530 s ± 0.077 s - intmap plus remove interning indirection // 4.134 s ± 0.118 s - guess based split on semi +// 4.418 s ± 0.129 s - use a real hash function to make it more legit. slower :( // // graveyard: // - iterating in reverse order in splitOnSemi @@ -151,10 +153,10 @@ func run(log *slog.Logger) error { nextStart = chunks[ci].end + 1 } - resultses := make([]*intmap.Map[uint16, *stats], numWorkers) + resultses := make([]*intmap.Map[uint64, *stats], numWorkers) for i := range numWorkers { - res := intmap.New[uint16, *stats](10_000) + res := intmap.New[uint64, *stats](10_000) resultses[i] = res chunk := chunks[i] @@ -208,7 +210,7 @@ func NewWorker() *worker { return &worker{} } -func (w *worker) run(chunk []byte, res *intmap.Map[uint16, *stats]) error { +func (w *worker) run(chunk []byte, res *intmap.Map[uint64, *stats]) error { // our chunk is guaranteed to be made of full lines only lineStart := 0 for i := 0; i < len(chunk); i++ { @@ -234,7 +236,7 @@ func (w *worker) run(chunk []byte, res *intmap.Map[uint16, *stats]) error { return nil } -func (w *worker) parseLineBytes(line []byte) ([]byte, uint16, float32, error) { +func (w *worker) parseLineBytes(line []byte) ([]byte, uint64, float32, error) { stationBs, tempStr := w.splitOnSemi(line) stationHash := stationHash(stationBs) @@ -255,13 +257,8 @@ func (w *worker) splitOnSemi(bs []byte) ([]byte, []byte) { panic("no semicolon found") } -// this is a bit sus because it's not resistant to anagrams but uhhhh it's ok :) -func stationHash(name []byte) uint16 { - hash := uint16(0) - for _, b := range name { - hash += uint16(b) - } - return hash +func stationHash(name []byte) uint64 { + return xxhash.Sum64(name) } func parseFloat(bs []byte) float32 { @@ -284,7 +281,7 @@ func parseFloat(bs []byte) float32 { return sign * (float32(ip) + float32(fracPart)/10) } -func printRes(res *intmap.Map[uint16, *stats]) { +func printRes(res *intmap.Map[uint64, *stats]) { // {Abha=-23.0/18.0/59.2, Abidjan=-16.2/26.0/67.3, Abéché=-10.0/29.4/69.0, Accra=-10.1/26.4/66.4, Addis Ababa=-23.7/16.0/67.0, Adelaide=-27.8/17.3/58.5, ...} namesTohashes := getStationsToHashes(res) names := maps.Keys(namesTohashes) @@ -298,10 +295,10 @@ func printRes(res *intmap.Map[uint16, *stats]) { fmt.Printf("}\n") } -func mergeResults(resultses []*intmap.Map[uint16, *stats]) *intmap.Map[uint16, *stats] { - res := intmap.New[uint16, *stats](resultses[0].Len()) +func mergeResults(resultses []*intmap.Map[uint64, *stats]) *intmap.Map[uint64, *stats] { + res := intmap.New[uint64, *stats](resultses[0].Len()) for _, r := range resultses { - r.ForEach(func(k uint16, v *stats) { + r.ForEach(func(k uint64, v *stats) { s, ok := res.Get(k) if !ok { s = v @@ -317,9 +314,9 @@ func mergeResults(resultses []*intmap.Map[uint16, *stats]) *intmap.Map[uint16, * return res } -func getStationsToHashes(m *intmap.Map[uint16, *stats]) map[string]uint16 { - names := make(map[string]uint16, m.Len()) - m.ForEach(func(k uint16, s *stats) { +func getStationsToHashes(m *intmap.Map[uint64, *stats]) map[string]uint64 { + names := make(map[string]uint64, m.Len()) + m.ForEach(func(k uint64, s *stats) { names[s.station] = k }) return names