Skip to content

No magic bytes found at end of file [dirty writes, unsafe shutdown] #680

@jakthom-blw

Description

@jakthom-blw

What happens?

I have been consistently seeing issues when ducklake writer procs are forcefully shut down by the host (like when the machine loses power).

When the writer process is killed and the parquet file is in a partially-written state (IE: rowgroups purged to disk but parquet footer metadata not yet flushed), a junk parquet file ends up in the configured DATA_DIR (therefore breaking any wildcard reads on said directory until the junk file is cleaned up).

The subsequent junk file most often does not end up in the Ducklake metadata ducklake_data_file table, though sometimes it does. When this is the case, the resulting record has to be manually removed from the ducklake_data_file table or the table cannot be queried:

D select count(*) from nmea_pgn_data;
Invalid Input Error:
File 'db/power/nmea_pgn_data/year=2025/month=12/day=23/ducklake-019b4aa8-8821-7d23-accb-689338652ebc.parquet' too small to be a Parquet file

D select * from ducklake_data_file where path ilike '%year=2025/month=12/day=23/ducklake-019b4aa8-8821-7d23-accb-689338652ebc.parquet%';
     data_file_id = 103486
         table_id = 34
   begin_snapshot = 102751
     end_snapshot = NULL
       file_order = NULL
             path = year=2025/month=12/day=23/ducklake-019b4aa8-8821-7d23-accb-689338652ebc.parquet
 path_is_relative = true
      file_format = parquet
     record_count = 2414
  file_size_bytes = 40697
      footer_size = 1388
     row_id_start = 16957857
     partition_id = 35
   encryption_key = NULL
partial_file_info = NULL
       mapping_id = NULL

(but the actual file looks very different)

db/power/nmea_pgn_data/year=2025/month=12/day=23$ ls -lha ducklake-019b4aa8-8821-7d23-accb-689338652ebc.parquet
-rw-r--r-- 1 blue blue 0 Dec 23 10:01 ducklake-019b4aa8-8821-7d23-accb-689338652ebc.parquet

As expected, these failure modes are not great. And I have yet to find a graceful away around them since the host is expected to be able to handle unexpected power cycling.

To Reproduce

This golang code will consistently reproduce partially-flushed parquet files in the configured ducklake data_dir:

package main

import (
	"database/sql"
	"fmt"
	"log/slog"
	"os"
	"time"

	_ "github.com/duckdb/duckdb-go/v2"
)

const (
	rowCount = 50_000_000
)

func main() {
	// Set up logging
	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{
		Level: slog.LevelInfo,
	}))
	slog.SetDefault(logger)

	// Start timeout goroutine to kill after 10 seconds
	go func() {
		time.Sleep(10 * time.Second)
		slog.Error("timeout exceeded, forcing exit")
		os.Exit(2)
	}()

	// Get current working directory
	cwd, err := os.Getwd()
	if err != nil {
		slog.Error("failed to get current directory", "error", err)
		os.Exit(1)
	}

	slog.Info("using local directory", "cwd", cwd)

	// Connect to DuckDB
	db, err := sql.Open("duckdb", "repro.db")
	if err != nil {
		slog.Error("failed to connect to duckdb", "error", err)
		os.Exit(1)
	}
	defer db.Close()

	// Install and load DuckLake
	slog.Info("setting up ducklake")
	if _, err := db.Exec("FORCE INSTALL ducklake FROM core_nightly"); err != nil {
		slog.Error("failed to install ducklake", "error", err)
		os.Exit(1)
	}
	if _, err := db.Exec("LOAD ducklake"); err != nil {
		slog.Error("failed to load ducklake", "error", err)
		os.Exit(1)
	}

	// Create secret for local storage using absolute path
	if _, err := db.Exec("CREATE OR REPLACE PERSISTENT SECRET local_lake (TYPE ducklake, METADATA_PATH 'catalog.lake', DATA_PATH 'data')"); err != nil {
		slog.Error("failed to create secret", "error", err)
		os.Exit(1)
	}

	// Attach the lake
	if _, err := db.Exec("ATTACH 'ducklake:local_lake' AS lake"); err != nil {
		slog.Error("failed to attach lake", "error", err)
		os.Exit(1)
	}

	// Create table
	slog.Info("creating table")
	if _, err := db.Exec("CREATE OR REPLACE TABLE lake.test_data (timestamp TIMESTAMP, id INTEGER)"); err != nil {
		slog.Error("failed to create table", "error", err)
		os.Exit(1)
	}

	// Insert a whole lotta rows using a single SQL statement
	slog.Info("inserting rows", "total", rowCount)

	insertSQL := `
		INSERT INTO lake.test_data (timestamp, id)
		SELECT 
			NOW() + INTERVAL (i) SECOND as timestamp,
			CAST(random() * 1000000 AS INTEGER) as id
		FROM generate_series(0, ` + fmt.Sprintf("%d", rowCount-1) + `) as t(i)
	`

	if _, err := db.Exec(insertSQL); err != nil {
		slog.Error("failed to insert rows", "error", err)
		os.Exit(1)
	}
}

It can be executed via:

go run $FILENAME.go

And will consistently result in:

D select count(*) from read_parquet('data/main/test_data/*.parquet');
Invalid Input Error:
No magic bytes found at end of file 'data/main/test_data/ducklake-019b9fee-8a42-7602-85f7-f72c86c9eb81.parquet'

LINE 1: select count(*) from read_parquet('data/main/test_data/*.parquet');

OS:

linux, aarch64

DuckDB Version:

1.4.1

DuckLake Version:

912c032

DuckDB Client:

Go (but really all of them)

Hardware:

No response

Full Name:

Jake Thomas

Affiliation:

blw

What is the latest build you tested with? If possible, we recommend testing with the latest nightly build.

912c032

Did you include all relevant data sets for reproducing the issue?

No - Other reason (please specify in the issue body)

Did you include all code required to reproduce the issue?

  • Yes, I have

Did you include all relevant configuration (e.g., CPU architecture, Python version, Linux distribution) to reproduce the issue?

  • Yes, I have

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions