Skip to content

Crash when using FromSpec with dask and threads #20845

@acampove

Description

@acampove

Check duplicate issues.

  • Checked for duplicates

Description

I am seeing a crash when I use a thread-based Dask client (`processes=False`) to read files with `FromSpec`. The same code using processes instead of threads seems to work.

Reproducer

import json
import ROOT
import numpy
import matplotlib.pyplot as plt

from ROOT             import RDF # type: ignore
from dask.distributed import Client, LocalCluster

# Number of main/friend file pairs created and then listed in the spec
NFILES  = 100
# Directory (relative to the working directory) where the ROOT files are written
PATH    = 'files'
# -------------------------------------
def create_connection(nproc : int):
    '''
    Build a Dask client attached to a freshly created local cluster.

    The cluster has a single worker with ``processes=False``, running
    ``nproc`` threads and limited to 2GiB of memory.

    nproc: Number of threads given to the worker.

    Returns the ``Client`` connected to that cluster.
    '''
    cluster_settings = {
        'n_workers'          : 1,
        'threads_per_worker' : nproc,
        'processes'          : False,
        'memory_limit'       : '2GiB',
    }

    return Client(LocalCluster(**cluster_settings))
# -------------------------------------
def _create_files(index : int) -> None:
    '''
    Write one main file and one friend file for the given index.

    Each file holds an ``Events`` tree with 100000 entries: the main file
    has a column ``x`` and the friend file a column ``y``, both filled
    from ``gRandom->Rndm()``.

    index: Zero-padded (3 digits) into the output file names.
    '''
    import os

    # Make sure the output directory exists so the reproducer can be run
    # from a clean working directory without a manual `mkdir`.
    os.makedirs(PATH, exist_ok=True)

    # Named *_path so the local does not shadow the module-level main().
    main_path   = f"{PATH}/main_{index:03}.root"
    friend_path = f"{PATH}/frnd_{index:03}.root"

    df = ROOT.RDataFrame(100000)
    df.Define("x", "gRandom->Rndm()").Snapshot("Events", main_path)

    df = ROOT.RDataFrame(100000)
    df.Define("y", "gRandom->Rndm()").Snapshot("Events", friend_path)
# -------------------------------------
def _get_array(client) -> numpy.ndarray:
    '''
    Read column ``x`` through a FromSpec dataframe run on the given executor.

    Writes ``spec.json`` in the working directory, listing the NFILES main
    files as the ``main`` sample and the NFILES friend files as the
    ``friend`` friend, then builds the dataframe from that file.

    client: Object passed as ``executor`` to ``FromSpec``.

    Returns the ``x`` entry of the ``AsNumpy`` result.
    '''
    main_paths   = []
    friend_paths = []
    for number in range(NFILES):
        main_paths.append(f'{PATH}/main_{number:03}.root')
        friend_paths.append(f'{PATH}/frnd_{number:03}.root')

    main_sample   = {"trees": ["Events"], "files": main_paths}
    friend_sample = {"trees": ["Events"], "files": friend_paths}

    layout = {
        "samples": {"main": main_sample},
        "friends": {"friend": friend_sample},
    }

    with open("spec.json", "w") as handle:
        json.dump(layout, handle, indent=4)

    frame   = RDF.Experimental.FromSpec("spec.json", executor = client)
    columns = frame.AsNumpy(['x'])

    return columns['x']
# -------------------------------------
def main():
    '''
    Run the reproducer end to end.

    Creates the NFILES file pairs, opens a 5-thread client, reads ``x``
    twice through that client and saves a scatter plot of the first 1000
    values of each read to ``plot.png``.
    '''
    for file_index in range(NFILES):
        _create_files(index = file_index)

    dask_client = create_connection(nproc = 5)
    first_read  = _get_array(dask_client)
    second_read = _get_array(dask_client)

    n_points = 1000
    plt.scatter(first_read[:n_points], second_read[:n_points])
    plt.savefig('plot.png')
    plt.close('all')
# ----------------------
# Run the reproducer only when executed as a script, not when imported
if __name__ == '__main__':
    main()
# -------------------------------------

ROOT version

6.38

Installation method

micromamba

Operating system

almalinux9

Additional context

The version using processes (which seems to work) could be a workaround for:

#20840

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions