Make metadata code aware of newgen archives
macneale4 committed Feb 12, 2025
1 parent bbaad98 commit 810b837
Showing 2 changed files with 110 additions and 68 deletions.
165 changes: 99 additions & 66 deletions go/store/nbs/metadata.go
@@ -15,41 +15,46 @@
package nbs

import (
"errors"
"os"
"path/filepath"

"github.com/dolthub/dolt/go/libraries/utils/filesys"
"github.com/dolthub/dolt/go/store/hash"
)

type StorageType int

const (
Journal StorageType = iota
TableFileNewGen
TableFileOldGen
Archive
)

type ArchiveMetadata struct {
originalTableFileId string
}

type TableFileFormat int

const (
TypeNoms TableFileFormat = iota
TypeArchive
)

type StorageArtifact struct {
id hash.Hash
path string
storageType StorageType
// ID of the storage artifact. This is used in the manifest to identify the artifact, but it is not the file name,
// as archives have a suffix.
id hash.Hash
// path to the storage artifact.
path string
// storageType is the type of the storage artifact.
storageType TableFileFormat
// arcMetadata is additional metadata for archive files. It is only set for storageType == TypeArchive.
arcMetadata *ArchiveMetadata
}

type StorageMetadata struct {
// root is the path to storage. Specifically, it contains a .dolt directory.
root string
artifacts []StorageArtifact
}

func (sm *StorageMetadata) ArchiveFilesPresent() bool {
for _, artifact := range sm.artifacts {
if artifact.storageType == Archive {
if artifact.storageType == TypeArchive {
return true
}
}
@@ -60,14 +65,16 @@ func (sm *StorageMetadata) ArchiveFilesPresent() bool {
func (sm *StorageMetadata) RevertMap() map[hash.Hash]hash.Hash {
revertMap := make(map[hash.Hash]hash.Hash)
for _, artifact := range sm.artifacts {
if artifact.storageType == Archive {
if artifact.storageType == TypeArchive {
md := artifact.arcMetadata
revertMap[artifact.id] = hash.Parse(md.originalTableFileId)
}
}
return revertMap
}

// oldGenTableExists returns true if the table file exists in the oldgen directory. This is a file system check for
// a table file we have no record of, which may be useful when reverting an archive operation.
func (sm *StorageMetadata) oldGenTableExists(id hash.Hash) (bool, error) {
path := filepath.Join(sm.root, ".dolt", "noms", "oldgen", id.String())
_, err := os.Stat(path)
@@ -88,81 +95,107 @@ func GetStorageMetadata(path string) (StorageMetadata, error) {
return StorageMetadata{}, err
}

// TODO: new gen and journal information in storage metadata will be useful in the future.
// newGen := filepath.Join(path, ".dolt", "noms")
// newgenManifest := filepath.Join(newGen, "manifest")
newGen := filepath.Join(path, ".dolt", "noms")
newgenManifest := filepath.Join(newGen, "manifest")
manifestReader, err := os.Open(newgenManifest)
if err != nil {
return StorageMetadata{}, err
}

manifest, err := ParseManifest(manifestReader)
if err != nil {
return StorageMetadata{}, err
}

var artifacts []StorageArtifact

// for each table in the manifest, get the table spec
for i := 0; i < manifest.NumTableSpecs(); i++ {
tableSpecInfo := manifest.GetTableSpecInfo(i)
artifact, err := buildArtifact(tableSpecInfo, newGen)
if err != nil {
return StorageMetadata{}, err
}
artifacts = append(artifacts, artifact)
}

oldgen := filepath.Join(path, ".dolt", "noms", "oldgen")
oldgen := filepath.Join(newGen, "oldgen")
oldgenManifest := filepath.Join(oldgen, "manifest")

// If there is not oldgen manifest, then GC has never been run. Which is fine. We just don't have any oldgen.
// If there is no oldgen manifest, then GC has never been run. Which is fine. We just don't have any oldgen.
if _, err := os.Stat(oldgenManifest); err != nil {
return StorageMetadata{}, nil
return StorageMetadata{path, artifacts}, nil
}

// create an io.Reader for the manifest file
manifestReader, err := os.Open(oldgenManifest)
manifestReader, err = os.Open(oldgenManifest)
if err != nil {
return StorageMetadata{}, err
}

manifest, err := ParseManifest(manifestReader)
manifest, err = ParseManifest(manifestReader)
if err != nil {
return StorageMetadata{}, err
}

var artifacts []StorageArtifact

// for each table in the manifest, get the table spec
for i := 0; i < manifest.NumTableSpecs(); i++ {
tableSpecInfo := manifest.GetTableSpecInfo(i)

// If the oldgen/name exists, it's not an archive. If it exists with a .darc suffix, then it's an archive.
tfName := tableSpecInfo.GetName()
fullPath := filepath.Join(oldgen, tfName)
_, err := os.Stat(fullPath)
if err == nil {
// exists. Not an archive.
artifacts = append(artifacts, StorageArtifact{
id: hash.Parse(tfName),
path: fullPath,
storageType: TableFileOldGen,
})
} else if os.IsNotExist(err) {
arcName := tfName + ".darc"
arcPath := filepath.Join(oldgen, arcName)
_, err := os.Stat(arcPath)
if err == nil {
// Stat succeeded, so this is an archive. Open a reader for the path.
reader, fileSize, err := openReader(arcPath)
if err != nil {
return StorageMetadata{}, err
}

arcMetadata, err := newArchiveMetadata(reader, fileSize)
if err != nil {
return StorageMetadata{}, err
}

artifacts = append(artifacts, StorageArtifact{
id: hash.Parse(tfName),
path: arcPath,
storageType: Archive,
arcMetadata: arcMetadata,
})
} else {
// any error is bad here. If the files don't exist, then the manifest is no good.
return StorageMetadata{}, err
}
} else {
// some other error.
artifact, err := buildArtifact(tableSpecInfo, oldgen)
if err != nil {
return StorageMetadata{}, err
}
artifacts = append(artifacts, artifact)
}

return StorageMetadata{path, artifacts}, nil
}

func buildArtifact(info TableSpecInfo, genPath string) (StorageArtifact, error) {
tfName := info.GetName()

// This code is going to be removed as soon as backup supports archives.
archive := false
fullPath := filepath.Join(genPath, tfName)

_, err := os.Stat(fullPath)
if err != nil {
if errors.Is(err, os.ErrNotExist) {
fullPath = filepath.Join(genPath, tfName+ArchiveFileSuffix)
} else {
return StorageArtifact{}, err
}
_, err = os.Stat(fullPath)
if err != nil {
return StorageArtifact{}, err
}
archive = true
}

if !archive {
return StorageArtifact{
id: hash.Parse(tfName),
path: fullPath,
storageType: TypeNoms,
}, nil
} else {
reader, fileSize, err := openReader(fullPath)
if err != nil {
return StorageArtifact{}, err
}

arcMetadata, err := newArchiveMetadata(reader, fileSize)
if err != nil {
return StorageArtifact{}, err
}

return StorageArtifact{
id: hash.Parse(tfName),
path: fullPath,
storageType: TypeArchive,
arcMetadata: arcMetadata,
}, nil
}
}

func validateDir(path string) error {
info, err := os.Stat(path)

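For orientation, below is a minimal sketch (not part of this commit) of how the metadata above might be consumed when reverting an archive operation. It assumes the helper lives inside package nbs, so it can reach the unexported oldGenTableExists method, and that repoRoot is the same path GetStorageMetadata expects (a directory containing .dolt). The function name listRevertCandidates is hypothetical.

package nbs

import (
	"fmt"

	"github.com/dolthub/dolt/go/store/hash"
)

// listRevertCandidates is an illustrative helper, not part of this commit. For each
// archive recorded in the manifests, it reports whether the original table file it
// was built from still exists in oldgen, which is the precondition for reverting an
// archive operation by restoring that original table file.
func listRevertCandidates(repoRoot string) (map[hash.Hash]bool, error) {
	md, err := GetStorageMetadata(repoRoot)
	if err != nil {
		return nil, err
	}

	candidates := make(map[hash.Hash]bool)
	for archiveID, originalID := range md.RevertMap() {
		exists, err := md.oldGenTableExists(originalID)
		if err != nil {
			return nil, err
		}
		fmt.Printf("archive %s was built from table file %s (still in oldgen: %v)\n",
			archiveID.String(), originalID.String(), exists)
		candidates[originalID] = exists
	}
	return candidates, nil
}
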
13 changes: 11 additions & 2 deletions integration-tests/bats/archive.bats
@@ -183,6 +183,16 @@ mutations_and_gc_statement() {
run dolt sql -q 'select sum(i) from tbl;'
[[ "$status" -eq 0 ]] || false
[[ "$output" =~ "138075" ]] || false # i = 1 - 525, sum is 138075


## Temporary check. We want to ensure that backup will give an error, even when
## there are archives in newgen.
mkdir ../backup
dolt backup add bac1 file://../backup

run dolt backup sync bac1
[ "$status" -eq 1 ]
[[ "$output" =~ "error: archive files present" ]] || false
}

@test "archive: can clone respiratory with mixed types" {
@@ -235,7 +245,7 @@ mutations_and_gc_statement() {
dolt fetch

## update the remote repo directly. Need to run the archive command when the server is stopped.
## This will result in achived files on the remote, which we will need to read chunks from when we fetch.
## This will result in archived files on the remote, which we will need to read chunks from when we fetch.
cd ../../remote
kill $remotesrv_pid
wait $remotesrv_pid || :
@@ -248,7 +258,6 @@ mutations_and_gc_statement() {
[[ "$remotesrv_pid" -gt 0 ]] || false

cd ../cloned/repo1

run dolt fetch
[ "$status" -eq 0 ]

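The new assertion above only pins down observable behavior: while archives exist in newgen, `dolt backup sync` must exit non-zero and mention "error: archive files present". As a hedged illustration, a pre-flight guard built on the metadata code could look like the sketch below; the function name, the error variable, and where the check is wired into the backup command are assumptions, not taken from this commit.

package nbs

import "errors"

// errArchiveFilesPresent mirrors the message the bats test expects. The real error
// value and its wiring into `dolt backup sync` are assumptions for illustration.
var errArchiveFilesPresent = errors.New("error: archive files present")

// guardBackupAgainstArchives is a hypothetical pre-flight check for backup: it loads
// the storage metadata for the repository root and refuses to proceed while any
// archive files (newgen or oldgen) are present.
func guardBackupAgainstArchives(repoRoot string) error {
	md, err := GetStorageMetadata(repoRoot)
	if err != nil {
		return err
	}
	if md.ArchiveFilesPresent() {
		return errArchiveFilesPresent
	}
	return nil
}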