diff --git a/file/hasher/common_test.go b/file/hasher/common_test.go
new file mode 100644
index 0000000000..bad3556420
--- /dev/null
+++ b/file/hasher/common_test.go
@@ -0,0 +1,66 @@
+package hasher
+
+import (
+ "github.com/ethersphere/swarm/testutil"
+)
+
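+// the test parameters mirror the BMT defaults: 32-byte sections (the
+// Keccak256 digest size) and 128 branches, giving 4096-byte chunks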
+const (
+ sectionSize = 32
+ branches = 128
+ chunkSize = 4096
+)
+
+var (
+ dataLengths = []int{31, // 0
+ 32, // 1
+ 33, // 2
+ 63, // 3
+ 64, // 4
+ 65, // 5
+ chunkSize, // 6
+ chunkSize + 31, // 7
+ chunkSize + 32, // 8
+ chunkSize + 63, // 9
+ chunkSize + 64, // 10
+ chunkSize * 2, // 11
+ chunkSize*2 + 32, // 12
+ chunkSize * 128, // 13
+ chunkSize*128 + 31, // 14
+ chunkSize*128 + 32, // 15
+ chunkSize*128 + 64, // 16
+ chunkSize * 129, // 17
+ chunkSize * 130, // 18
+ chunkSize * 128 * 128, // 19
+ chunkSize*128*128 + 32, // 20
+ }
+ expected = []string{
+ "ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
+ "0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
+ "3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
+ "95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
+ "490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
+ "541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
+ "c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
+ "91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
+ "73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
+ "db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
+ "ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
+ "29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
+ "61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
+ "3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
+ "e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
+ "485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
+ "624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
+ "b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
+ "59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
+ "522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
+ "ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
+ }
+
+ start = 0
+ end = len(dataLengths)
+)
+
+func init() {
+ testutil.Init()
+}
diff --git a/file/hasher/hasher.go b/file/hasher/hasher.go
index 9478fb79b8..5cebba192f 100644
--- a/file/hasher/hasher.go
+++ b/file/hasher/hasher.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"context"
diff --git a/file/hasher/hasher_test.go b/file/hasher/hasher_test.go
index babb981ef3..91ca296d81 100644
--- a/file/hasher/hasher_test.go
+++ b/file/hasher/hasher_test.go
@@ -14,7 +14,7 @@
 // You should have received a copy of the GNU Lesser General Public License
 // along with the Swarm library. If not, see <http://www.gnu.org/licenses/>.
 
-package file
+package hasher
 
 import (
 	"bytes"
diff --git a/file/hasher/param.go b/file/hasher/param.go
new file mode 100644
index 0000000000..6de12f1065
--- /dev/null
+++ b/file/hasher/param.go
@@ -0,0 +1,56 @@
+package hasher
+
+import (
+ "context"
+ "sync"
+
+ "github.com/ethersphere/swarm/file"
+)
+
+// treeParams defines the boundaries of the hashing job and holds the hash factory function for the job
+// setting Debug omits all automatic behavior (for now this means job processing won't auto-start)
+type treeParams struct {
+ SectionSize int
+ Branches int
+ ChunkSize int
+ Spans []int
+ Debug bool
+ hashFunc file.SectionWriterFunc
+ writerPool sync.Pool
+ ctx context.Context
+}
+
+func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {
+
+ h := hashFunc(context.Background())
+ p := &treeParams{
+ SectionSize: h.SectionSize(),
+ Branches: h.Branches(),
+ ChunkSize: h.SectionSize() * h.Branches(),
+ hashFunc: hashFunc,
+ }
+ h.Reset()
+ p.writerPool.New = func() interface{} {
+ hf := p.hashFunc(p.ctx)
+ return hf
+ }
+ p.Spans = generateSpanSizes(p.Branches, 9)
+ return p
+}
+
+func (p *treeParams) SetContext(ctx context.Context) {
+ p.ctx = ctx
+}
+
+func (p *treeParams) GetContext() context.Context {
+ return p.ctx
+}
+
+func (p *treeParams) PutWriter(w file.SectionWriter) {
+ w.Reset()
+ p.writerPool.Put(w)
+}
+
+func (p *treeParams) GetWriter() file.SectionWriter {
+ return p.writerPool.Get().(file.SectionWriter)
+}
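+
+// illustrative usage sketch (mirrors the construction in the tests): a
+// treeParams is built from a SectionWriter factory, after which writers are
+// leased from and returned to the pool:
+//
+//	hashFunc := func(_ context.Context) file.SectionWriter {
+//		return bmt.New(bmt.NewTreePool(sha3.NewLegacyKeccak256, 128, bmt.PoolSize))
+//	}
+//	params := newTreeParams(hashFunc)
+//	w := params.GetWriter()
+//	defer params.PutWriter(w)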
diff --git a/file/hasher/reference.go b/file/hasher/reference.go
new file mode 100644
index 0000000000..0ceb570ee8
--- /dev/null
+++ b/file/hasher/reference.go
@@ -0,0 +1,145 @@
+package hasher
+
+import (
+ "github.com/ethersphere/swarm/file"
+)
+
+// ReferenceHasher is the source-of-truth implementation of the swarm file hashing algorithm
+type ReferenceHasher struct {
+ params *treeParams
+ cursors []int // section write position, indexed per level
+ length int // number of bytes written to the data level of the hasher
+ buffer []byte // keeps data and hashes, indexed by cursors
+ counts []int // number of sums performed, indexed per level
+ hasher file.SectionWriter // underlying hasher
+}
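+
+// note: the buffer is shared between levels; the pending (not yet summed) data
+// for level n occupies buffer[cursors[n+1]:cursors[n]], so cursors[n+1] doubles
+// as the base write offset of level n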
+
+// NewReferenceHasher constructs and returns a new ReferenceHasher
+// This implementation is limited to a tree of 9 levels, where level 0 is the data level
+// With 32 section size and 128 branches (i.e. unencrypted, non erasure-coded content) this means
+// a capacity of 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes
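+// (that is, ChunkSize * Branches^8 = 4096 * 128^8 = 2^68 bytes)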
+func NewReferenceHasher(params *treeParams) *ReferenceHasher {
+ // TODO: remove when bmt interface is amended
+ h := params.GetWriter()
+ return &ReferenceHasher{
+ params: params,
+ cursors: make([]int, 9),
+ counts: make([]int, 9),
+ buffer: make([]byte, params.ChunkSize*9),
+ hasher: h,
+ }
+}
+
+// Hash computes and returns the root hash of arbitrary data
+func (r *ReferenceHasher) Hash(data []byte) []byte {
+ l := r.params.ChunkSize
+ for i := 0; i < len(data); i += r.params.ChunkSize {
+ if len(data)-i < r.params.ChunkSize {
+ l = len(data) - i
+ }
+ r.update(0, data[i:i+l])
+ }
+
+ // if we didn't end on a chunk boundary we need to hash remaining chunks first
+ r.hashUnfinished()
+
+ // sum the remaining levels, moving the reference of any chunk that dangles outside the balanced part of the tree
+ r.moveDanglingChunk()
+
+ return r.digest()
+}
+
+// write to the data buffer on the specified level
+// calls sum if chunk boundary is reached and recursively calls this function for the next level with the acquired bmt hash
+// adjusts cursors accordingly
+func (r *ReferenceHasher) update(lvl int, data []byte) {
+ if lvl == 0 {
+ r.length += len(data)
+ }
+ copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
+ r.cursors[lvl] += len(data)
+ if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
+ ref := r.sum(lvl)
+ r.update(lvl+1, ref)
+ r.cursors[lvl] = r.cursors[lvl+1]
+ }
+}
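+
+// illustrative trace of update: writing the first full chunk advances
+// cursors[0] to ChunkSize and triggers a sum; the 32-byte reference is then
+// written to level 1 via update(1, ref), after which cursors[0] rewinds to
+// cursors[1] so that level 0 data is buffered just above the level 1 hashes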
+
+// calculates and returns the bmt sum of the last written data on the level
+func (r *ReferenceHasher) sum(lvl int) []byte {
+ r.counts[lvl]++
+ spanSize := r.params.Spans[lvl] * r.params.ChunkSize
+ span := (r.length-1)%spanSize + 1
+
+ sizeToSum := r.cursors[lvl] - r.cursors[lvl+1]
+
+ r.hasher.Reset()
+ r.hasher.SetSpan(span)
+ r.hasher.Write(r.buffer[r.cursors[lvl+1] : r.cursors[lvl+1]+sizeToSum])
+ ref := r.hasher.Sum(nil)
+ return ref
+}
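+
+// worked example of the span arithmetic above: with ChunkSize 4096 and
+// Spans[1] = 128, a level 1 sum after 8192 written bytes gets
+// span = (8192-1)%(128*4096)+1 = 8192, while each level 0 sum saw span 4096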
+
+// digest returns the root hash, which hashUnfinished and moveDanglingChunk
+// have left in the first section of the buffer
+// it must only be called after all data has been written
+func (r *ReferenceHasher) digest() []byte {
+
+ // the first section of the buffer will hold the root hash
+ return r.buffer[:r.params.SectionSize]
+}
+
+// sums any partially filled chunk remaining on the data level and writes its reference to level 1
+func (r *ReferenceHasher) hashUnfinished() {
+ if r.length%r.params.ChunkSize != 0 {
+ ref := r.sum(0)
+ copy(r.buffer[r.cursors[1]:], ref)
+ r.cursors[1] += len(ref)
+ r.cursors[0] = r.cursors[1]
+ }
+}
+
+// in case a single chunk reference dangles outside an otherwise balanced tree, this method
+// moves that reference up to be concatenated with the single reference at the highest level of the tree.
+//
+// Let F be full chunks (disregarding branching factor) and S be single references
+// in the following scenario:
+//
+// S
+// F F
+// F F F
+// F F F F S
+//
+// The result will be:
+//
+// SS
+// F F
+// F F F
+// F F F F
+//
+// After which the SS will be hashed to obtain the final root hash
+func (r *ReferenceHasher) moveDanglingChunk() {
+
+ // calculate the total number of levels needed to represent the data (including the data level)
+ targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)
+
+ // sum every intermediate level and write to the level above it
+ for i := 1; i < targetLevel; i++ {
+
+ // and if there is a single reference outside a balanced tree on this level
+ // don't hash it again but pass it on to the next level
+ if r.counts[i] > 0 {
+ // TODO: simplify if possible
+ if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
+ r.cursors[i+1] = r.cursors[i]
+ r.cursors[i] = r.cursors[i-1]
+ continue
+ }
+ }
+
+ ref := r.sum(i)
+ copy(r.buffer[r.cursors[i+1]:], ref)
+ r.cursors[i+1] += len(ref)
+ r.cursors[i] = r.cursors[i+1]
+ }
+}
diff --git a/file/hasher/reference_test.go b/file/hasher/reference_test.go
new file mode 100644
index 0000000000..d4deef5c0b
--- /dev/null
+++ b/file/hasher/reference_test.go
@@ -0,0 +1,140 @@
+package hasher
+
+import (
+ "context"
+ "fmt"
+ "strconv"
+ "strings"
+ "testing"
+
+ "github.com/ethereum/go-ethereum/common/hexutil"
+ "github.com/ethersphere/swarm/bmt"
+ "github.com/ethersphere/swarm/file"
+ "github.com/ethersphere/swarm/log"
+ "github.com/ethersphere/swarm/testutil"
+ "golang.org/x/crypto/sha3"
+)
+
+// TestManualDanglingChunk is a test script explicitly hashing and writing every individual level in the dangling chunk edge case
+// we use a balanced tree with data size of chunkSize*branches, plus one additional chunk of data
+// this case is chosen because it produces the wrong result in the pyramid hasher at the time of writing (master commit hash 4928d989ebd0854d993c10c194e61a5a5455e4f9)
+func TestManualDanglingChunk(t *testing.T) {
+ pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+ h := bmt.New(pool)
+
+ // to execute the job we need buffers with the following capacities:
+ // level 0: chunkSize*branches+chunkSize
+ // level 1: chunkSize
+ // level 2: sectionSize * 2
+ var levels [][]byte
+ levels = append(levels, nil)
+ levels = append(levels, make([]byte, chunkSize))
+ levels = append(levels, make([]byte, sectionSize*2))
+
+ // hash the balanced tree portion of the data level and write to level 1
+ _, levels[0] = testutil.SerialData(chunkSize*branches+chunkSize, 255, 0)
+ for i := 0; i < chunkSize*branches; i += chunkSize {
+ h.Reset()
+ h.SetSpan(chunkSize)
+ h.Write(levels[0][i : i+chunkSize])
+ copy(levels[1][i/branches:], h.Sum(nil))
+ }
+ refHex := hexutil.Encode(levels[1][:sectionSize])
+ correctRefHex := "0xc10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef"
+ if refHex != correctRefHex {
+ t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+ }
+
+ // write the dangling chunk
+ // hash it and write the reference on the second section of level 2
+ h.Reset()
+ h.SetSpan(chunkSize)
+ h.Write(levels[0][chunkSize*branches:])
+ copy(levels[2][sectionSize:], h.Sum(nil))
+ refHex = hexutil.Encode(levels[2][sectionSize:])
+ correctRefHex = "0x81b31d9a7f6c377523e8769db021091df23edd9fd7bd6bcdf11a22f518db6006"
+ if refHex != correctRefHex {
+ t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
+ }
+
+ // hash the chunk on level 1 and write into the first section of level 2
+ h.Reset()
+ h.SetSpan(chunkSize * branches)
+ h.Write(levels[1])
+ copy(levels[2], h.Sum(nil))
+ refHex = hexutil.Encode(levels[2][:sectionSize])
+ correctRefHex = "0x3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09"
+ if refHex != correctRefHex {
+ t.Fatalf("manual dangling balanced tree; expected %s, got %s", correctRefHex, refHex)
+ }
+
+ // hash the two sections on level 2 to obtain the root hash
+ h.Reset()
+ h.SetSpan(chunkSize*branches + chunkSize)
+ h.Write(levels[2])
+ ref := h.Sum(nil)
+ refHex = hexutil.Encode(ref)
+ correctRefHex = "0xb8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199"
+ if refHex != correctRefHex {
+ t.Fatalf("manual dangling root; expected %s, got %s", correctRefHex, refHex)
+ }
+}
+
+// TestReferenceHasherVector executes the reference hasher on serial input data cycling through byte values 0-254
+// of lengths defined in common_test.go
+//
+// the "expected" array in common_test.go is generated by this implementation, and test failure due to
+// result mismatch is nothing else than an indication that something has changed in the reference filehasher
+// or the underlying hashing algorithm
+func TestReferenceHasherVector(t *testing.T) {
+
+ hashFunc := func(_ context.Context) file.SectionWriter {
+ pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+ return bmt.New(pool)
+ }
+ params := newTreeParams(hashFunc)
+ var mismatch int
+ for i := start; i < end; i++ {
+ dataLength := dataLengths[i]
+ log.Info("start", "i", i, "len", dataLength)
+ rh := NewReferenceHasher(params)
+ _, data := testutil.SerialData(dataLength, 255, 0)
+ refHash := rh.Hash(data)
+ eq := true
+ if expected[i] != fmt.Sprintf("%x", refHash) {
+ mismatch++
+ eq = false
+ }
+ t.Logf("[%7d+%4d]\t%v\tref: %x\texpect: %s", dataLength/chunkSize, dataLength%chunkSize, eq, refHash, expected[i])
+ }
+ if mismatch > 0 {
+ t.Fatalf("mismatches: %d/%d", mismatch, end-start)
+ }
+}
+
+// BenchmarkReferenceHasher establishes a baseline for a fully synchronous file hashing operation
+// it will be vastly inefficient
+func BenchmarkReferenceHasher(b *testing.B) {
+ for i := start; i < end; i++ {
+ b.Run(fmt.Sprintf("%d", dataLengths[i]), benchmarkReferenceHasher)
+ }
+}
+
+func benchmarkReferenceHasher(b *testing.B) {
+ benchParams := strings.Split(b.Name(), "/")
+ dataLength, err := strconv.ParseInt(benchParams[1], 10, 64)
+ if err != nil {
+ b.Fatal(err)
+ }
+ hashFunc := func(_ context.Context) file.SectionWriter {
+ pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
+ return bmt.New(pool)
+ }
+ params := newTreeParams(hashFunc)
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ _, data := testutil.SerialData(int(dataLength), 255, 0)
+ fh := NewReferenceHasher(params)
+ fh.Hash(data)
+ }
+}
diff --git a/file/hasher/util.go b/file/hasher/util.go
new file mode 100644
index 0000000000..141fd1d114
--- /dev/null
+++ b/file/hasher/util.go
@@ -0,0 +1,31 @@
+package hasher
+
+import (
+ "math"
+)
+
+// TODO: level 0 should be SectionSize() not Branches()
+// generates a lookup table of the maximum span length per level, in chunks represented by one SectionSize() of data
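+// e.g. branches = 128 and levels = 3 yields [1, 128, 16384]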
+func generateSpanSizes(branches int, levels int) []int {
+ spans := make([]int, levels)
+ span := 1
+ for i := 0; i < 9; i++ {
+ spans[i] = span
+ span *= branches
+ }
+ return spans
+}
+
+// TODO: use params instead of sectionSize, branches
+// calculate the last level index which a particular data length (in bytes) will result in
+// the returned level will be the level of the root hash
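+// e.g. with sectionSize 32 and branches 128, l = chunkSize*branches+1 = 524289
+// gives c = 16384 = branches^2 and therefore level 3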
+func getLevelsFromLength(l int, sectionSize int, branches int) int {
+ if l == 0 {
+ return 0
+ } else if l <= sectionSize*branches {
+ return 1
+ }
+ c := (l - 1) / sectionSize
+
+ return int(math.Log(float64(c))/math.Log(float64(branches)) + 1)
+}
diff --git a/file/hasher/util_test.go b/file/hasher/util_test.go
new file mode 100644
index 0000000000..51640e4ad5
--- /dev/null
+++ b/file/hasher/util_test.go
@@ -0,0 +1,17 @@
+package hasher
+
+import "testing"
+
+// TestLevelsFromLength verifies getLevelsFromLength
+func TestLevelsFromLength(t *testing.T) {
+
+ sizes := []int{sectionSize, chunkSize, chunkSize + sectionSize, chunkSize * branches, chunkSize*branches + 1}
+ expects := []int{1, 1, 2, 2, 3}
+
+ for i, size := range sizes {
+ lvl := getLevelsFromLength(size, sectionSize, branches)
+ if expects[i] != lvl {
+ t.Fatalf("size %d, expected %d, got %d", size, expects[i], lvl)
+ }
+ }
+}
diff --git a/testutil/data.go b/testutil/data.go
new file mode 100644
index 0000000000..f3bea59e91
--- /dev/null
+++ b/testutil/data.go
@@ -0,0 +1,15 @@
+package testutil
+
+import (
+ "bytes"
+ "io"
+)
+
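+// SerialData returns l bytes of the deterministic byte sequence (i+offset) % mod,
+// both as an io.Reader and as the backing byte slice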
+func SerialData(l int, mod int, offset int) (r io.Reader, slice []byte) {
+ slice = make([]byte, l)
+ for i := 0; i < len(slice); i++ {
+ slice[i] = byte((i + offset) % mod)
+ }
+ r = io.LimitReader(bytes.NewReader(slice), int64(l))
+ return
+}