This repository was archived by the owner on Aug 2, 2021. It is now read-only.
file, testutil: Add reference file hasher #2099
Merged

Changes shown below are from 5 of the 7 commits:

- 42b1887 file, testutil: Add reference file hasher (nolash)
- 457b569 file: Remove premature code (nolash)
- 65a444e file: Remove unused zeroHex and unused logs (nolash)
- 93bdad9 file: Add comments (nolash)
- d603c6d file: Elaborate comments, remove redundant loglines, var rename (nolash)
- 028aa1e file: Split up digest function, add explanations (nolash)
- fe7ddee file: Purify digest method (nolash)
common_test.go:

```go
@@ -0,0 +1,66 @@
package hasher

import (
	"github.com/ethersphere/swarm/testutil"
)

const (
	sectionSize = 32
	branches    = 128
	chunkSize   = 4096
)

var (
	dataLengths = []int{31, // 0
		32,                     // 1
		33,                     // 2
		63,                     // 3
		64,                     // 4
		65,                     // 5
		chunkSize,              // 6
		chunkSize + 31,         // 7
		chunkSize + 32,         // 8
		chunkSize + 63,         // 9
		chunkSize + 64,         // 10
		chunkSize * 2,          // 11
		chunkSize*2 + 32,       // 12
		chunkSize * 128,        // 13
		chunkSize*128 + 31,     // 14
		chunkSize*128 + 32,     // 15
		chunkSize*128 + 64,     // 16
		chunkSize * 129,        // 17
		chunkSize * 130,        // 18
		chunkSize * 128 * 128,  // 19
		chunkSize*128*128 + 32, // 20
	}
	expected = []string{
		"ece86edb20669cc60d142789d464d57bdf5e33cb789d443f608cbd81cfa5697d", // 0
		"0be77f0bb7abc9cd0abed640ee29849a3072ccfd1020019fe03658c38f087e02", // 1
		"3463b46d4f9d5bfcbf9a23224d635e51896c1daef7d225b86679db17c5fd868e", // 2
		"95510c2ff18276ed94be2160aed4e69c9116573b6f69faaeed1b426fea6a3db8", // 3
		"490072cc55b8ad381335ff882ac51303cc069cbcb8d8d3f7aa152d9c617829fe", // 4
		"541552bae05e9a63a6cb561f69edf36ffe073e441667dbf7a0e9a3864bb744ea", // 5
		"c10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef", // 6
		"91699c83ed93a1f87e326a29ccd8cc775323f9e7260035a5f014c975c5f3cd28", // 7
		"73759673a52c1f1707cbb61337645f4fcbd209cdc53d7e2cedaaa9f44df61285", // 8
		"db1313a727ffc184ae52a70012fbbf7235f551b9f2d2da04bf476abe42a3cb42", // 9
		"ade7af36ac0c7297dc1c11fd7b46981b629c6077bce75300f85b02a6153f161b", // 10
		"29a5fb121ce96194ba8b7b823a1f9c6af87e1791f824940a53b5a7efe3f790d9", // 11
		"61416726988f77b874435bdd89a419edc3861111884fd60e8adf54e2f299efd6", // 12
		"3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09", // 13
		"e5c76afa931e33ac94bce2e754b1bb6407d07f738f67856783d93934ca8fc576", // 14
		"485a526fc74c8a344c43a4545a5987d17af9ab401c0ef1ef63aefcc5c2c086df", // 15
		"624b2abb7aefc0978f891b2a56b665513480e5dc195b4a66cd8def074a6d2e94", // 16
		"b8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199", // 17
		"59de730bf6c67a941f3b2ffa2f920acfaa1713695ad5deea12b4a121e5f23fa1", // 18
		"522194562123473dcfd7a457b18ee7dee8b7db70ed3cfa2b73f348a992fdfd3b", // 19
		"ed0cc44c93b14fef2d91ab3a3674eeb6352a42ac2f0bbe524711824aae1e7bcc", // 20
	}

	start = 0
	end   = len(dataLengths)
)

func init() {
	testutil.Init()
}
```
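The lengths are chosen to straddle the interesting boundaries: a section short of a chunk, exact chunk and tree multiples, and a section or two beyond them. For any input that fits in a single chunk, the file hash degenerates to one BMT sum whose span is the data length, so vector 0 can be checked directly. A quick sketch of that check, reusing only calls that appear in this PR's own test code (the package import paths and the claim that this prints expected[0] are assumptions, not something this page states):

```go
package main

import (
	"fmt"

	"github.com/ethersphere/swarm/bmt"
	"github.com/ethersphere/swarm/testutil"
	"golang.org/x/crypto/sha3"
)

func main() {
	// a 31-byte input is a single partial chunk, so the file hash should be
	// nothing more than the BMT hash of those bytes with span 31
	pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, 128, bmt.PoolSize)
	h := bmt.New(pool)
	h.Reset()
	h.SetSpan(31)
	_, data := testutil.SerialData(31, 255, 0)
	h.Write(data)
	fmt.Printf("%x\n", h.Sum(nil)) // expected[0]: ece86edb2066...
}
```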
The tree parameters:

```go
@@ -0,0 +1,56 @@
package hasher

import (
	"context"
	"sync"

	"github.com/ethersphere/swarm/file"
)

// defines the boundaries of the hashing job and also contains the hash factory function of the job
// setting Debug means omitting any automatic behavior (for now it means job processing won't auto-start)
type treeParams struct {
	SectionSize int
	Branches    int
	ChunkSize   int
	Spans       []int
	Debug       bool
	hashFunc    file.SectionWriterFunc
	writerPool  sync.Pool
	ctx         context.Context
}

func newTreeParams(hashFunc file.SectionWriterFunc) *treeParams {
	h := hashFunc(context.Background())
	p := &treeParams{
		SectionSize: h.SectionSize(),
		Branches:    h.Branches(),
		ChunkSize:   h.SectionSize() * h.Branches(),
		hashFunc:    hashFunc,
	}
	h.Reset()
	p.writerPool.New = func() interface{} {
		hf := p.hashFunc(p.ctx)
		return hf
	}
	p.Spans = generateSpanSizes(p.Branches, 9)
	return p
}
```
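generateSpanSizes, called above, is not part of what this page captured. Its semantics are pinned down by sum() in the reference hasher below, where Spans[lvl] * ChunkSize must equal the full span of one chunk at level lvl; that forces Spans[lvl] = branches^lvl. A minimal sketch under that assumption (not the PR's actual code):

```go
// hypothetical sketch: Spans[lvl] is the number of data chunks covered by a
// full chunk at level lvl, i.e. branches^lvl, so that Spans[lvl]*ChunkSize
// is that chunk's full span in bytes
func generateSpanSizes(branches int, levels int) []int {
	spans := make([]int, levels)
	span := 1
	for i := 0; i < levels; i++ {
		spans[i] = span
		span *= branches
	}
	return spans
}
```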
```go
func (p *treeParams) SetContext(ctx context.Context) {
	p.ctx = ctx
}

func (p *treeParams) GetContext() context.Context {
	return p.ctx
}

func (p *treeParams) PutWriter(w file.SectionWriter) {
	w.Reset()
	p.writerPool.Put(w)
}

func (p *treeParams) GetWriter() file.SectionWriter {
	return p.writerPool.Get().(file.SectionWriter)
}
```
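The file.SectionWriter interface is also not shown on this page. From the calls the PR makes on it (Write, Sum, Reset, SetSpan, SectionSize, Branches), it is at least the standard hash.Hash contract plus span and geometry accessors. A hedged reconstruction, which may well differ from the real interface in github.com/ethersphere/swarm/file:

```go
package file

import (
	"context"
	"hash"
)

// hypothetical reconstruction from usage in this PR, not the actual definition
type SectionWriter interface {
	hash.Hash           // Write, Sum, Reset, Size, BlockSize
	SetSpan(length int) // length of data the next Sum will represent
	SectionSize() int   // bytes per section, e.g. 32
	Branches() int      // branch factor, e.g. 128
}

// SectionWriterFunc constructs a SectionWriter, optionally bound to a context
type SectionWriterFunc func(ctx context.Context) SectionWriter
```

Note the pooling discipline above: GetWriter hands out a hasher, constructing a fresh one via hashFunc only when the pool is empty, and PutWriter resets the writer before returning it, so callers never receive a dirty writer.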
The reference hasher:

```go
@@ -0,0 +1,115 @@
package hasher

import (
	"github.com/ethersphere/swarm/file"
)

// ReferenceHasher is the source-of-truth implementation of the swarm file hashing algorithm
type ReferenceHasher struct {
	params  *treeParams
	cursors []int              // section write position, indexed per level
	length  int                // number of bytes written to the data level of the hasher
	buffer  []byte             // keeps data and hashes, indexed by cursors
	counts  []int              // number of sums performed, indexed per level
	hasher  file.SectionWriter // underlying hasher
}

// NewReferenceHasher constructs and returns a new ReferenceHasher
// This implementation is limited to a tree of 9 levels, where level 0 is the data level
// With 32 section size and 128 branches (i.e. unencrypted, non erasure-coded content) this means
// a capacity of 4096 bytes * (128^(9-1)) ~ 295.148 * (10^18) bytes
func NewReferenceHasher(params *treeParams) *ReferenceHasher {
	// TODO: remove when bmt interface is amended
	h := params.GetWriter()
	return &ReferenceHasher{
		params:  params,
		cursors: make([]int, 9),
		counts:  make([]int, 9),
		buffer:  make([]byte, params.ChunkSize*9),
		hasher:  h,
	}
}

// Hash computes and returns the root hash of arbitrary data
func (r *ReferenceHasher) Hash(data []byte) []byte {
	l := r.params.ChunkSize
	for i := 0; i < len(data); i += r.params.ChunkSize {
		if len(data)-i < r.params.ChunkSize {
			l = len(data) - i
		}
		r.update(0, data[i:i+l])
	}
	return r.digest()
}

// write to the data buffer on the specified level
// calls sum if chunk boundary is reached and recursively calls this function for the next level with the acquired bmt hash
// adjusts cursors accordingly
func (r *ReferenceHasher) update(lvl int, data []byte) {
	if lvl == 0 {
		r.length += len(data)
	}
	copy(r.buffer[r.cursors[lvl]:r.cursors[lvl]+len(data)], data)
	r.cursors[lvl] += len(data)
	if r.cursors[lvl]-r.cursors[lvl+1] == r.params.ChunkSize {
		ref := r.sum(lvl)
		r.update(lvl+1, ref)
		r.cursors[lvl] = r.cursors[lvl+1]
	}
}
```

Review thread on the `if lvl == 0` branch in update():

> Contributor: it would be nice to write that level 0 is the data layer, especially when this reference hasher is another representation of a tree or trie, in which tree height is measured as the inverse (0 is the root).
>
> Member: agreed.
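As for the capacity claim in the NewReferenceHasher comment, the arithmetic checks out: 4096 bytes × 128^(9-1) = 2^12 × 2^56 = 2^68 ≈ 2.951 × 10^20 bytes, which is indeed the quoted ~295.148 × 10^18 bytes.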
```go
// calculates and returns the bmt sum of the last written data on the level
func (r *ReferenceHasher) sum(lvl int) []byte {
	r.counts[lvl]++
	spanSize := r.params.Spans[lvl] * r.params.ChunkSize
	span := (r.length-1)%spanSize + 1

	sizeToSum := r.cursors[lvl] - r.cursors[lvl+1]

	r.hasher.Reset()
	r.hasher.SetSpan(span)
	r.hasher.Write(r.buffer[r.cursors[lvl+1] : r.cursors[lvl+1]+sizeToSum])
	ref := r.hasher.Sum(nil)
	return ref
}

// called after all data has been written
// sums the final chunks of each level
// skips intermediate levels that end on span boundary
func (r *ReferenceHasher) digest() []byte {

	// if we did not end on a chunk boundary, the last chunk hasn't been hashed
	// we need to do this first
	if r.length%r.params.ChunkSize != 0 {
		ref := r.sum(0)
		copy(r.buffer[r.cursors[1]:], ref)
		r.cursors[1] += len(ref)
		r.cursors[0] = r.cursors[1]
	}

	// calculate the total number of levels needed to represent the data (including the data level)
	targetLevel := getLevelsFromLength(r.length, r.params.SectionSize, r.params.Branches)

	// sum every intermediate level and write to the level above it
	for i := 1; i < targetLevel; i++ {

		// if the tree is balanced or if there is a single reference outside a balanced tree on this level
		// don't hash it again but pass it on to the next level
		if r.counts[i] > 0 {
			// TODO: simplify if possible
			if r.counts[i-1]-r.params.Spans[targetLevel-1-i] <= 1 {
				r.cursors[i+1] = r.cursors[i]
				r.cursors[i] = r.cursors[i-1]
				continue
			}
		}

		ref := r.sum(i)
		copy(r.buffer[r.cursors[i+1]:], ref)
		r.cursors[i+1] += len(ref)
		r.cursors[i] = r.cursors[i+1]
	}

	// the first section of the buffer will hold the root hash
	return r.buffer[:r.params.SectionSize]
}
```
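getLevelsFromLength, like generateSpanSizes, is not captured on this page; per the comment above it returns the total number of levels, including the data level, needed to represent the given length. The span calculation in sum() also deserves a second look: span = (r.length-1)%spanSize + 1 folds the total bytes written so far into the span of the last, possibly partial, chunk at the level, and the -1/+1 shift makes an exact multiple of spanSize yield spanSize instead of 0. A few concrete values at level 0, where spanSize is chunkSize (the helper name is illustrative, not from the PR):

```go
// lastSpan illustrates the folding done inline in sum()
func lastSpan(length, spanSize int) int {
	return (length-1)%spanSize + 1
}

// lastSpan(31, 4096)   == 31    a single partial chunk
// lastSpan(4096, 4096) == 4096  an exact multiple keeps the full span, not 0
// lastSpan(4097, 4096) == 1     one byte into a fresh chunk
```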
The tests:

```go
@@ -0,0 +1,140 @@
package hasher

import (
	"context"
	"fmt"
	"strconv"
	"strings"
	"testing"

	"github.com/ethereum/go-ethereum/common/hexutil"
	"github.com/ethersphere/swarm/bmt"
	"github.com/ethersphere/swarm/file"
	"github.com/ethersphere/swarm/log"
	"github.com/ethersphere/swarm/testutil"
	"golang.org/x/crypto/sha3"
)

// TestManualDanglingChunk is a test script explicitly hashing and writing every individual level in the dangling chunk edge case
// we use a balanced tree with data size of chunkSize*branches, and a single chunk of data
// this case is chosen because it produces the wrong result in the pyramid hasher at the time of writing (master commit hash 4928d989ebd0854d993c10c194e61a5a5455e4f9)
func TestManualDanglingChunk(t *testing.T) {
	pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
	h := bmt.New(pool)

	// to execute the job we need buffers with the following capacities:
	// level 0: chunkSize*branches+chunkSize
	// level 1: chunkSize
	// level 2: sectionSize * 2
	var levels [][]byte
	levels = append(levels, nil)
	levels = append(levels, make([]byte, chunkSize))
	levels = append(levels, make([]byte, sectionSize*2))

	// hash the balanced tree portion of the data level and write to level 1
	_, levels[0] = testutil.SerialData(chunkSize*branches+chunkSize, 255, 0)
	for i := 0; i < chunkSize*branches; i += chunkSize {
		h.Reset()
		h.SetSpan(chunkSize)
		h.Write(levels[0][i : i+chunkSize])
		copy(levels[1][i/branches:], h.Sum(nil))
	}
	refHex := hexutil.Encode(levels[1][:sectionSize])
	correctRefHex := "0xc10090961e7682a10890c334d759a28426647141213abda93b096b892824d2ef"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
	}

	// write the dangling chunk
	// hash it and write the reference on the second section of level 2
	h.Reset()
	h.SetSpan(chunkSize)
	h.Write(levels[0][chunkSize*branches:])
	copy(levels[2][sectionSize:], h.Sum(nil))
	refHex = hexutil.Encode(levels[2][sectionSize:])
	correctRefHex = "0x81b31d9a7f6c377523e8769db021091df23edd9fd7bd6bcdf11a22f518db6006"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling single chunk; expected %s, got %s", correctRefHex, refHex)
	}

	// hash the chunk on level 1 and write into the first section of level 2
	h.Reset()
	h.SetSpan(chunkSize * branches)
	h.Write(levels[1])
	copy(levels[2], h.Sum(nil))
	refHex = hexutil.Encode(levels[2][:sectionSize])
	correctRefHex = "0x3047d841077898c26bbe6be652a2ec590a5d9bd7cd45d290ea42511b48753c09"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling balanced tree; expected %s, got %s", correctRefHex, refHex)
	}

	// hash the two sections on level 2 to obtain the root hash
	h.Reset()
	h.SetSpan(chunkSize*branches + chunkSize)
	h.Write(levels[2])
	ref := h.Sum(nil)
	refHex = hexutil.Encode(ref)
	correctRefHex = "0xb8e1804e37a064d28d161ab5f256cc482b1423d5cd0a6b30fde7b0f51ece9199"
	if refHex != correctRefHex {
		t.Fatalf("manual dangling root; expected %s, got %s", correctRefHex, refHex)
	}
}
```
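Spelled out, the shape this test pins down is 129 chunks: 128 full chunks forming a balanced subtree, plus one dangling chunk. The subtree root (span chunkSize*branches = 524288) and the dangling chunk's reference (span chunkSize = 4096) sit side by side on level 2, and the file root is the BMT sum over those two sections with span chunkSize*branches + chunkSize = 528384, exactly what the final SetSpan call encodes. Consistently, the root asserted here equals expected[17] (the chunkSize*129 vector) and the subtree root equals expected[13] (the chunkSize*128 vector) in common_test.go.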
```go
// TestReferenceHasherVector executes the reference hasher on serial input data cycling
// through byte values 0-254, at the lengths defined in common_test.go
//
// the "expected" array in common_test.go is generated by this implementation, and test failure due to
// result mismatch is nothing other than an indication that something has changed in the reference filehasher
// or the underlying hashing algorithm
func TestReferenceHasherVector(t *testing.T) {
	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc)
	var mismatch int
	for i := start; i < end; i++ {
		dataLength := dataLengths[i]
		log.Info("start", "i", i, "len", dataLength)
		rh := NewReferenceHasher(params)
		_, data := testutil.SerialData(dataLength, 255, 0)
		refHash := rh.Hash(data)
		eq := true
		if expected[i] != fmt.Sprintf("%x", refHash) {
			mismatch++
			eq = false
		}
		t.Logf("[%7d+%4d]\t%v\tref: %x\texpect: %s", dataLength/chunkSize, dataLength%chunkSize, eq, refHash, expected[i])
	}
	if mismatch > 0 {
		t.Fatalf("mismatches: %d/%d", mismatch, end-start)
	}
}

// BenchmarkReferenceHasher establishes a baseline for a fully synchronous file hashing operation
// it will be vastly inefficient
func BenchmarkReferenceHasher(b *testing.B) {
	for i := start; i < end; i++ {
		b.Run(fmt.Sprintf("%d", dataLengths[i]), benchmarkReferenceHasher)
	}
}

func benchmarkReferenceHasher(b *testing.B) {
	benchParams := strings.Split(b.Name(), "/")
	dataLength, err := strconv.ParseInt(benchParams[1], 10, 64)
	if err != nil {
		b.Fatal(err)
	}
	hashFunc := func(_ context.Context) file.SectionWriter {
		pool := bmt.NewTreePool(sha3.NewLegacyKeccak256, branches, bmt.PoolSize)
		return bmt.New(pool)
	}
	params := newTreeParams(hashFunc)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, data := testutil.SerialData(int(dataLength), 255, 0)
		fh := NewReferenceHasher(params)
		fh.Hash(data)
	}
}
```
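Assuming the files live in a file/hasher package directory (the path is not stated on this page), the suite runs with the standard tooling: `go test ./file/hasher -v -run 'TestReferenceHasherVector|TestManualDanglingChunk'` for the tests, and `go test ./file/hasher -run XXX -bench ReferenceHasher` for the per-length benchmarks, which show the cost growing from a single-chunk sum up to the 128×128-chunk balanced tree.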