Skip to content

Commit

Permalink
xxh
Browse files Browse the repository at this point in the history
  • Loading branch information
asg0451 committed Mar 22, 2024
1 parent 7ce6251 commit 1402602
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 18 deletions.
Binary file modified default.pgo
Binary file not shown.
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.22.0
toolchain go1.22.1

require (
github.com/cespare/xxhash/v2 v2.2.0
github.com/kamstrup/intmap v0.2.0
go.coldcutz.net/go-stuff v0.0.0-20240222020121-e7bc41ea880c
golang.org/x/exp v0.0.0-20240318143956-a85f2c67cd81
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
Expand Down
33 changes: 15 additions & 18 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"sync"
"syscall"

"github.com/cespare/xxhash/v2"
"github.com/kamstrup/intmap"
"go.coldcutz.net/go-stuff/utils"
"golang.org/x/exp/maps"
Expand Down Expand Up @@ -105,6 +106,7 @@ type stats struct {
// 7.346 s ± 0.144 s - swiss map
// 4.530 s ± 0.077 s - intmap plus remove interning indirection
// 4.134 s ± 0.118 s - guess based split on semi
// 4.418 s ± 0.129 s - use a real hash function (xxhash) instead of the byte-sum hash to make it more legit. slower :(
//
// graveyard:
// - iterating in reverse order in splitOnSemi
Expand Down Expand Up @@ -151,10 +153,10 @@ func run(log *slog.Logger) error {
nextStart = chunks[ci].end + 1
}

resultses := make([]*intmap.Map[uint16, *stats], numWorkers)
resultses := make([]*intmap.Map[uint64, *stats], numWorkers)

for i := range numWorkers {
res := intmap.New[uint16, *stats](10_000)
res := intmap.New[uint64, *stats](10_000)
resultses[i] = res
chunk := chunks[i]

Expand Down Expand Up @@ -208,7 +210,7 @@ func NewWorker() *worker {
return &worker{}
}

func (w *worker) run(chunk []byte, res *intmap.Map[uint16, *stats]) error {
func (w *worker) run(chunk []byte, res *intmap.Map[uint64, *stats]) error {
// our chunk is guaranteed to be made of full lines only
lineStart := 0
for i := 0; i < len(chunk); i++ {
Expand All @@ -234,7 +236,7 @@ func (w *worker) run(chunk []byte, res *intmap.Map[uint16, *stats]) error {
return nil
}

func (w *worker) parseLineBytes(line []byte) ([]byte, uint16, float32, error) {
func (w *worker) parseLineBytes(line []byte) ([]byte, uint64, float32, error) {
stationBs, tempStr := w.splitOnSemi(line)

stationHash := stationHash(stationBs)
Expand All @@ -255,13 +257,8 @@ func (w *worker) splitOnSemi(bs []byte) ([]byte, []byte) {
panic("no semicolon found")
}

// this is a bit sus because it's not resistant to anagrams but uhhhh it's ok :)
func stationHash(name []byte) uint16 {
hash := uint16(0)
for _, b := range name {
hash += uint16(b)
}
return hash
// stationHash returns the 64-bit xxHash digest of the raw station-name bytes,
// as computed by xxhash.Sum64.
func stationHash(name []byte) uint64 {
	digest := xxhash.Sum64(name)
	return digest
}

func parseFloat(bs []byte) float32 {
Expand All @@ -284,7 +281,7 @@ func parseFloat(bs []byte) float32 {

return sign * (float32(ip) + float32(fracPart)/10)
}
func printRes(res *intmap.Map[uint16, *stats]) {
func printRes(res *intmap.Map[uint64, *stats]) {
// {Abha=-23.0/18.0/59.2, Abidjan=-16.2/26.0/67.3, Abéché=-10.0/29.4/69.0, Accra=-10.1/26.4/66.4, Addis Ababa=-23.7/16.0/67.0, Adelaide=-27.8/17.3/58.5, ...}
namesTohashes := getStationsToHashes(res)
names := maps.Keys(namesTohashes)
Expand All @@ -298,10 +295,10 @@ func printRes(res *intmap.Map[uint16, *stats]) {
fmt.Printf("}\n")
}

func mergeResults(resultses []*intmap.Map[uint16, *stats]) *intmap.Map[uint16, *stats] {
res := intmap.New[uint16, *stats](resultses[0].Len())
func mergeResults(resultses []*intmap.Map[uint64, *stats]) *intmap.Map[uint64, *stats] {
res := intmap.New[uint64, *stats](resultses[0].Len())
for _, r := range resultses {
r.ForEach(func(k uint16, v *stats) {
r.ForEach(func(k uint64, v *stats) {
s, ok := res.Get(k)
if !ok {
s = v
Expand All @@ -317,9 +314,9 @@ func mergeResults(resultses []*intmap.Map[uint16, *stats]) *intmap.Map[uint16, *
return res
}

func getStationsToHashes(m *intmap.Map[uint16, *stats]) map[string]uint16 {
names := make(map[string]uint16, m.Len())
m.ForEach(func(k uint16, s *stats) {
func getStationsToHashes(m *intmap.Map[uint64, *stats]) map[string]uint64 {
names := make(map[string]uint64, m.Len())
m.ForEach(func(k uint64, s *stats) {
names[s.station] = k
})
return names
Expand Down

0 comments on commit 1402602

Please sign in to comment.