forky: Fixed Chunk Data Size Store #2017
base: master
@@ -52,43 +52,40 @@ var ErrDBClosed = errors.New("closed database") | |
| // Store is the main FCDS implementation. It stores chunk data into | ||
| // a number of files partitioned by the last byte of the chunk address. | ||
| type Store struct { | ||
| shards map[uint8]*os.File // relations with shard id and a shard file | ||
| shardsMu map[uint8]*sync.Mutex // mutex for every shard file | ||
| meta MetaStore // stores chunk offsets | ||
| free map[uint8]struct{} // which shards have free offsets | ||
| freeMu sync.RWMutex // protects free field | ||
| freeCache *offsetCache // optional cache of free offset values | ||
| wg sync.WaitGroup // blocks Close until all other method calls are done | ||
| maxChunkSize int // maximal chunk data size | ||
| quit chan struct{} // quit disables all operations after Close is called | ||
| quitOnce sync.Once // protects close channel from multiple Close calls | ||
| shards []shard // relations with shard id and a shard file and their mutexes | ||
| meta MetaStore // stores chunk offsets | ||
| free []bool // which shards have free offsets | ||
| freeMu sync.RWMutex // protects free field | ||
| freeCache *offsetCache // optional cache of free offset values | ||
| wg sync.WaitGroup // blocks Close until all other method calls are done | ||
| maxChunkSize int // maximal chunk data size | ||
| quit chan struct{} // quit disables all operations after Close is called | ||
| quitOnce sync.Once // protects quit channel from multiple Close calls | ||
| } | ||
|
|
||
| // NewStore constructs a new Store with files at path, with specified max chunk size. | ||
| // New constructs a new Store with files at path, with specified max chunk size. | ||
| // Argument withCache enables in memory cache of free chunk data positions in files. | ||
| func NewStore(path string, maxChunkSize int, metaStore MetaStore, withCache bool) (s *Store, err error) { | ||
| func New(path string, maxChunkSize int, metaStore MetaStore, withCache bool) (s *Store, err error) { | ||
| if err := os.MkdirAll(path, 0777); err != nil { | ||
| return nil, err | ||
| } | ||
| shards := make(map[byte]*os.File, shardCount) | ||
| shardsMu := make(map[uint8]*sync.Mutex) | ||
| shards := make([]shard, shardCount) | ||
| for i := byte(0); i < shardCount; i++ { | ||
| shards[i], err = os.OpenFile(filepath.Join(path, fmt.Sprintf("chunks-%v.db", i)), os.O_CREATE|os.O_RDWR, 0666) | ||
| shards[i].f, err = os.OpenFile(filepath.Join(path, fmt.Sprintf("chunks-%v.db", i)), os.O_CREATE|os.O_RDWR, 0666) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| shardsMu[i] = new(sync.Mutex) | ||
| shards[i].mu = new(sync.Mutex) | ||
| } | ||
| var freeCache *offsetCache | ||
| if withCache { | ||
| freeCache = newOffsetCache(shardCount) | ||
| } | ||
| return &Store{ | ||
| shards: shards, | ||
| shardsMu: shardsMu, | ||
| meta: metaStore, | ||
| freeCache: freeCache, | ||
| free: make(map[uint8]struct{}), | ||
| free: make([]bool, shardCount), | ||
| maxChunkSize: maxChunkSize, | ||
| quit: make(chan struct{}), | ||
| }, nil | ||
|
|
@@ -102,16 +99,16 @@ func (s *Store) Get(addr chunk.Address) (ch chunk.Chunk, err error) { | |
| } | ||
| defer done() | ||
|
|
||
| mu := s.shardsMu[getShard(addr)] | ||
| mu.Lock() | ||
| defer mu.Unlock() | ||
| sh := s.shards[getShard(addr)] | ||
| sh.mu.Lock() | ||
| defer sh.mu.Unlock() | ||
|
|
||
| m, err := s.getMeta(addr) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| data := make([]byte, m.Size) | ||
| n, err := s.shards[getShard(addr)].ReadAt(data, m.Offset) | ||
| n, err := sh.f.ReadAt(data, m.Offset) | ||
| if err != nil && err != io.EOF { | ||
| return nil, err | ||
| } | ||
|
|
@@ -129,7 +126,7 @@ func (s *Store) Has(addr chunk.Address) (yes bool, err error) { | |
| } | ||
| defer done() | ||
|
|
||
| mu := s.shardsMu[getShard(addr)] | ||
| mu := s.shards[getShard(addr)].mu | ||
| mu.Lock() | ||
| defer mu.Unlock() | ||
|
|
||
|
|
@@ -152,74 +149,76 @@ func (s *Store) Put(ch chunk.Chunk) (err error) { | |
| defer done() | ||
|
|
||
| addr := ch.Address() | ||
| shard := getShard(addr) | ||
| f := s.shards[shard] | ||
| data := ch.Data() | ||
|
|
||
| section := make([]byte, s.maxChunkSize) | ||
| copy(section, data) | ||
|
||
|
|
||
| s.freeMu.RLock() | ||
| _, hasFree := s.free[shard] | ||
| s.freeMu.RUnlock() | ||
| shard := getShard(addr) | ||
| sh := s.shards[shard] | ||
|
|
||
| var offset int64 | ||
| var reclaimed bool | ||
| mu := s.shardsMu[shard] | ||
| mu.Lock() | ||
| if hasFree { | ||
| var freeOffset int64 = -1 | ||
| if s.freeCache != nil { | ||
| freeOffset = s.freeCache.get(shard) | ||
| } | ||
| if freeOffset < 0 { | ||
| freeOffset, err = s.meta.FreeOffset(shard) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| } | ||
| if freeOffset < 0 { | ||
| offset, err = f.Seek(0, io.SeekEnd) | ||
| if err != nil { | ||
| mu.Unlock() | ||
| return err | ||
| } | ||
| s.freeMu.Lock() | ||
| delete(s.free, shard) | ||
| s.freeMu.Unlock() | ||
| } else { | ||
| offset, err = f.Seek(freeOffset, io.SeekStart) | ||
| if err != nil { | ||
| mu.Unlock() | ||
| return err | ||
| } | ||
| reclaimed = true | ||
| } | ||
| sh.mu.Lock() | ||
| defer sh.mu.Unlock() | ||
|
|
||
| offset, reclaimed, err := s.getOffset(shard) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| if offset < 0 { | ||
| offset, err = sh.f.Seek(0, io.SeekEnd) | ||
|
||
| } else { | ||
| offset, err = f.Seek(0, io.SeekEnd) | ||
| if err != nil { | ||
| mu.Unlock() | ||
| return err | ||
| } | ||
| _, err = sh.f.Seek(offset, io.SeekStart) | ||
| } | ||
| _, err = f.Write(section) | ||
| if err != nil { | ||
| mu.Unlock() | ||
| return err | ||
| } | ||
| if reclaimed { | ||
| if s.freeCache != nil { | ||
| s.freeCache.remove(shard, offset) | ||
| } | ||
| defer mu.Unlock() | ||
| } else { | ||
| mu.Unlock() | ||
|
|
||
| if _, err = sh.f.Write(section); err != nil { | ||
| return err | ||
| } | ||
| if reclaimed && s.freeCache != nil { | ||
| s.freeCache.remove(shard, offset) | ||
| } | ||
| return s.meta.Set(addr, shard, reclaimed, &Meta{ | ||
| Size: uint16(len(data)), | ||
|
||
| Offset: offset, | ||
| }) | ||
| } | ||
|
|
||
| // getOffset returns an offset where chunk data can be written to | ||
| // and a flag if the offset is reclaimed from a previously removed chunk. | ||
| // If offset is less than 0, no free offsets are available. | ||
| func (s *Store) getOffset(shard uint8) (offset int64, reclaimed bool, err error) { | ||
| if !s.shardHasFreeOffsets(shard) { | ||
| // shard does not have a free offset | ||
|
||
| return -1, false, err | ||
|
||
| } | ||
|
|
||
| offset = -1 // negative offset denotes no available free offset | ||
| if s.freeCache != nil { | ||
| // check if local cache has an offset | ||
| offset = s.freeCache.get(shard) | ||
| } | ||
|
|
||
| if offset < 0 { | ||
|
||
| // free cache did not return a free offset, | ||
| // check the meta store for one | ||
| offset, err = s.meta.FreeOffset(shard) | ||
| if err != nil { | ||
| return 0, false, err | ||
| } | ||
| } | ||
| if offset < 0 { | ||
| // meta store did not return a free offset, | ||
| // mark this shard as having no free offsets | ||
| s.markShardWithFreeOffsets(shard, false) | ||
| return -1, false, nil | ||
| } | ||
|
|
||
| return offset, true, nil | ||
| } | ||
|
|
||
| // Delete removes chunk data. | ||
|
||
| func (s *Store) Delete(addr chunk.Address) (err error) { | ||
| done, err := s.protect() | ||
|
|
@@ -229,11 +228,9 @@ func (s *Store) Delete(addr chunk.Address) (err error) { | |
| defer done() | ||
|
|
||
| shard := getShard(addr) | ||
| s.freeMu.Lock() | ||
| s.free[shard] = struct{}{} | ||
| s.freeMu.Unlock() | ||
| s.markShardWithFreeOffsets(shard, true) | ||
|
|
||
| mu := s.shardsMu[shard] | ||
| mu := s.shards[shard].mu | ||
|
||
| mu.Lock() | ||
| defer mu.Unlock() | ||
|
|
||
|
|
@@ -260,18 +257,18 @@ func (s *Store) Iterate(fn func(chunk.Chunk) (stop bool, err error)) (err error) | |
| } | ||
| defer done() | ||
|
|
||
| for _, mu := range s.shardsMu { | ||
| mu.Lock() | ||
| for _, sh := range s.shards { | ||
| sh.mu.Lock() | ||
| } | ||
| defer func() { | ||
| for _, mu := range s.shardsMu { | ||
| mu.Unlock() | ||
| for _, sh := range s.shards { | ||
| sh.mu.Unlock() | ||
| } | ||
| }() | ||
|
|
||
| return s.meta.Iterate(func(addr chunk.Address, m *Meta) (stop bool, err error) { | ||
| data := make([]byte, m.Size) | ||
| _, err = s.shards[getShard(addr)].ReadAt(data, m.Offset) | ||
| _, err = s.shards[getShard(addr)].f.ReadAt(data, m.Offset) | ||
| if err != nil { | ||
| return true, err | ||
| } | ||
|
|
@@ -298,8 +295,8 @@ func (s *Store) Close() (err error) { | |
| case <-time.After(15 * time.Second): | ||
|
||
| } | ||
|
|
||
| for _, f := range s.shards { | ||
| if err := f.Close(); err != nil { | ||
| for _, sh := range s.shards { | ||
| if err := sh.f.Close(); err != nil { | ||
|
Collaborator
Keeping the fd open for a long duration without mmap is a disaster in the making. I am not sure if Go's file flush() does an OS-level fsync(). If it does, then we should flush() at regular intervals to save the contents against a crash.
Member
Author
Nice observation. Go's os.File.Sync() flushes the content to disk; I am not sure which flush() you are referring to. Flushing at (regular) intervals is what the operating system is already doing. I am not sure how this would help against a crash unless we fsync on every write, which is quite costly. I have already tested fsync on every write and it makes fcds much slower, even compared to go-leveldb, as go-leveldb does not fsync at all. Mmap brings its own complexity, especially on different operating systems.
Collaborator
Thanks for the reply.
Apologies if I confused you. There are two things...
The first one is done by Go itself, as you pointed out. The second one is the one I am concerned about.
It is usually done by OS disk drivers whenever they feel it is okay. fsync() is expensive, as you pointed out. To avoid fsyncing on every commit, DBs usually implement WALs, which fsync at very short regular intervals in the background, so that even if some data is lost it is only for a very small window. If you fsync in the foreground (query path) every time, that will be very expensive.
leveldb takes another strategy: it mmaps a larger file than required and writes to it directly with memcopy, and the mmap driver then takes care of writing the dirty pages to disk. All I am saying is, one way or another, if we want to avoid a crash leaving us with corrupted files on bootup, we have to implement this; otherwise I am sure we can expect some gibberish files when you switch off the power abruptly.
Yes. We should not reinvent the wheel.
I found this interesting read on the topic
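
For reference, the "flush at regular intervals" idea under discussion could look roughly like the sketch below: a background loop that periodically calls os.File.Sync() on each shard file. This is only an illustration of the trade-off being debated, not code from this PR; the shard type is copied from the diff, while syncShards, the interval, and the quit channel are invented for the example.

```go
package fcds

import (
	"log"
	"os"
	"sync"
	"time"
)

// shard mirrors the struct introduced in this PR: a data file and its mutex.
type shard struct {
	f  *os.File
	mu *sync.Mutex
}

// syncShards fsyncs every shard file at a fixed interval, so that after a
// power loss at most one interval's worth of buffered writes is at risk.
// It stops when quit is closed.
func syncShards(shards []shard, interval time.Duration, quit <-chan struct{}) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			for _, sh := range shards {
				sh.mu.Lock()
				if err := sh.f.Sync(); err != nil {
					log.Printf("fcds: sync shard file: %v", err)
				}
				sh.mu.Unlock()
			}
		case <-quit:
			return
		}
	}
}
```

The durability window then becomes the sync interval rather than every single write, which is the same trade the WAL approach in the comment above makes.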
Member
Author
Thanks for the explanations.
I have already tried badger and it is slower than go-leveldb. Maybe you can find a way to do it more efficiently. And still, it does not scale with more CPU cores.
I am not sure this would actually help us, as the OS is already doing that at some frequency. Also, as you pointed out, the solution would be to implement a WAL, which I already did in some basic form, with, of course, some performance penalties. As far as I can see, go-leveldb is not using mmap. I see your point as very valid, but I think your suggestions should be tested, even the ones that I already tried, to revalidate. We can talk about possibilities, but it would be good to actually test resilience for the weak points that you described. As this is a very important part of Swarm, I am more in favour of not merging this PR until we are clear that it is reliable. Currently, I see that it creates more problems than it brings benefits.
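
The "basic form" of a WAL mentioned above is not part of this diff. As a rough illustration of the idea, and of where the fsync cost goes, an append-only log of length-prefixed records synced per append might look like the sketch below; the wal type, its layout, and the function names are invented for the example.

```go
package fcds

import (
	"encoding/binary"
	"os"
	"sync"
)

// wal is a minimal write-ahead log: length-prefixed records appended to a
// single file. Recovery would replay records not yet reflected in the shard
// files. This is an illustration, not the implementation tested for this PR.
type wal struct {
	mu sync.Mutex
	f  *os.File
}

func openWAL(path string) (*wal, error) {
	f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666)
	if err != nil {
		return nil, err
	}
	return &wal{f: f}, nil
}

// append writes one record and fsyncs it. Syncing every record is the
// expensive part discussed above; syncing per small batch instead trades a
// slightly larger loss window for much better throughput.
func (w *wal) append(rec []byte) error {
	w.mu.Lock()
	defer w.mu.Unlock()
	var size [4]byte
	binary.BigEndian.PutUint32(size[:], uint32(len(rec)))
	if _, err := w.f.Write(size[:]); err != nil {
		return err
	}
	if _, err := w.f.Write(rec); err != nil {
		return err
	}
	return w.f.Sync()
}
```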
Collaborator
I made a few changes in the badger configuration and ran the TestLevelDBForky and TestBadgerSuite test cases. These are the results. Iteration in Badger is somewhat slow, so I commented out the iteration and ran these test cases with the assumption that write/read/delete is more important than iteration. There are more write improvements I can make in badger, like batching the writes and so on, but for now I will leave it like this.
Member
Author
Thanks, but I cannot say anything from the screenshots except to compare timings and conclude that TestBadgerSuite writes are slower. It would be very helpful to actually see the changes and what TestBadgerSuite is doing.
Collaborator
Look at the last 2 commits in my fork https://github.com/jmozah/forky/commits/master
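
Since the results are shared only as screenshots, one way to make the comparison reviewable would be a common benchmark driver that both stores run through. A sketch along these lines is shown below; the chunkStore interface and benchmarkPuts helper are hypothetical glue for the example, not types from either repository.

```go
package fcds_test

import (
	"crypto/rand"
	"testing"
)

// chunkStore is the minimal surface a store must expose for this comparison;
// both the FCDS store and a Badger-backed store would be wrapped to satisfy it.
type chunkStore interface {
	Put(addr, data []byte) error
}

// benchmarkPuts drives sequential writes of fixed-size chunks with random
// addresses, so the leveldb/forky and badger runs are measured by identical code.
func benchmarkPuts(b *testing.B, s chunkStore, chunkSize int) {
	addr := make([]byte, 32)
	data := make([]byte, chunkSize)
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if _, err := rand.Read(addr); err != nil {
			b.Fatal(err)
		}
		if _, err := rand.Read(data); err != nil {
			b.Fatal(err)
		}
		if err := s.Put(addr, data); err != nil {
			b.Fatal(err)
		}
	}
}
```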
||
| return err | ||
| } | ||
| } | ||
|
|
@@ -327,7 +324,25 @@ func (s *Store) getMeta(addr chunk.Address) (m *Meta, err error) { | |
| return s.meta.Get(addr) | ||
| } | ||
|
|
||
| func (s *Store) markShardWithFreeOffsets(shard uint8, has bool) { | ||
| s.freeMu.Lock() | ||
| s.free[shard] = has | ||
| s.freeMu.Unlock() | ||
| } | ||
|
|
||
| func (s *Store) shardHasFreeOffsets(shard uint8) (has bool) { | ||
| s.freeMu.RLock() | ||
| has = s.free[shard] | ||
| s.freeMu.RUnlock() | ||
| return has | ||
| } | ||
|
|
||
| // getShard returns a shard number for the chunk address. | ||
| func getShard(addr chunk.Address) (shard uint8) { | ||
| return addr[len(addr)-1] % shardCount | ||
|
||
| } | ||
|
|
||
| type shard struct { | ||
| f *os.File | ||
| mu *sync.Mutex | ||
| } | ||
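
After this change, a shard bundles the data file with its mutex, so every accessor locks and reads through a single slice element instead of consulting two parallel maps. A condensed version of the read path is sketched below, assuming the Store, Meta, and getShard definitions from the diff; readChunkData itself is illustrative and not a method added by the PR.

```go
// readChunkData condenses the Get path above to show the access pattern after
// the refactor: index the shards slice once, hold that shard's mutex, and read
// through its file. m is the chunk's metadata (size and offset) from the meta store.
func (s *Store) readChunkData(addr chunk.Address, m *Meta) ([]byte, error) {
	sh := s.shards[getShard(addr)]
	sh.mu.Lock()
	defer sh.mu.Unlock()

	data := make([]byte, m.Size)
	if _, err := sh.f.ReadAt(data, m.Offset); err != nil && err != io.EOF {
		return nil, err
	}
	return data, nil
}
```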