Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,29 @@ LEMATERIALFETCHER_DEST_TABLE_NAME=optimade_materials
# Transformer processing settings
# LEMATERIALFETCHER_BATCH_SIZE=500
# LEMATERIALFETCHER_OFFSET=0
# LEMATERIALFETCHER_LOG_EVERY=1000
# LEMATERIALFETCHER_LOG_EVERY=1000

# ------------------------------------------------------------------------------
# LeMatRho Configuration (charge density pipeline)
# ------------------------------------------------------------------------------

# AWS credentials for authenticated S3 access (LeMatRho bucket)
# AWS_ACCESS_KEY_ID=your_access_key
# AWS_SECRET_ACCESS_KEY=your_secret_key
# AWS_DEFAULT_REGION=us-east-1

# LeMatRho S3 bucket name
# LEMATERIALFETCHER_LEMATRHO_BUCKET_NAME=lemat-rho

# VASP pseudopotential directory (required for Bader/DDEC6 analysis)
# PMG_VASP_PSP_DIR=/path/to/vasp/pseudopotentials

# External tool paths (optional, auto-detected on PATH if not set)
# LEMATERIALFETCHER_BADER_PATH=/path/to/bader
# LEMATERIALFETCHER_CHARGEMOL_PATH=/path/to/chargemol
# LEMATERIALFETCHER_CHGSUM_SCRIPT_PATH=/path/to/chgsum.pl
Comment thread
Ramlaoui marked this conversation as resolved.
Outdated
# LEMATERIALFETCHER_ATOMIC_DENSITIES_PATH=/path/to/atomic_densities

# Hugging Face (for pushing the dataset after the pipeline completes)
# HF_REPO_ID=your-org/lematrho-dataset
# HF_TOKEN=hf_your_token
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ celerybeat.pid

# Environments
.env
.env.*
!.env.example
.venv
# env/
venv/
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"moyopy>=0.4.2",
"ase>=3.24.0",
"material-hasher",
"mp-pyrho>=0.3.1",
]

[project.scripts]
Expand Down Expand Up @@ -58,5 +59,10 @@ dev-dependencies = [
material-hasher = { git = "https://github.com/LeMaterial/lematerial-hasher.git" }


[tool.pytest.ini_options]
markers = [
"integration: tests that require real AWS credentials and S3 access (deselect with '-m \"not integration\"')",
]

[tool.ruff.lint]
extend-select = ["I"]
36 changes: 36 additions & 0 deletions src/lematerial_fetcher/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
AlexandriaTrajectoryTransformer,
AlexandriaTransformer,
)
from lematerial_fetcher.fetcher.lematrho.pipeline import LeMatRhoDirectPipeline
from lematerial_fetcher.fetcher.mp.fetch import MPFetcher
from lematerial_fetcher.fetcher.mp.transform import (
MPTrajectoryTransformer,
Expand All @@ -37,13 +38,15 @@
from lematerial_fetcher.utils.cli import (
add_common_options,
add_fetch_options,
add_lematrho_direct_options,
add_mp_fetch_options,
add_mysql_options,
add_push_options,
add_transformer_options,
get_default_mp_bucket_name,
)
from lematerial_fetcher.utils.config import (
load_direct_pipeline_config,
load_fetcher_config,
load_push_config,
load_transformer_config,
Expand Down Expand Up @@ -114,9 +117,17 @@ def oqmd_cli(ctx):
pass


@click.group(name="lematrho")
@click.pass_context
def lematrho_cli(ctx):
    """Commands for fetching charge density data from LeMatRho."""
    # Container group only: it does no work itself. Subcommands are attached
    # with @lematrho_cli.command(...), and the docstring above doubles as the
    # group's --help text, so it is intentionally the whole body.


cli.add_command(mp_cli)
cli.add_command(alexandria_cli)
cli.add_command(oqmd_cli)
cli.add_command(lematrho_cli)

# ------------------------------------------------------------------------------
# MP commands
Expand Down Expand Up @@ -341,6 +352,31 @@ def oqmd_transform(ctx, traj, **config_kwargs):
logger.fatal("\nAborted.", exit=1)


# ------------------------------------------------------------------------------
# LeMatRho commands
# ------------------------------------------------------------------------------


@lematrho_cli.command(name="run")
@click.pass_context
@add_lematrho_direct_options
def lematrho_run(ctx, **config_kwargs):
    """Run complete LeMatRho pipeline: S3 -> Parquet -> HuggingFace.

    Downloads charge density data, compresses via pyrho, optionally runs
    Bader and DDEC6 analysis, and writes Parquet files directly.
    No PostgreSQL required.

    Requires AWS credentials (AWS_ACCESS_KEY_ID + AWS_SECRET_ACCESS_KEY).
    """
    # The CLI options collected by the decorators arrive as keyword arguments
    # and are turned into a pipeline config before anything is executed.
    pipeline_config = load_direct_pipeline_config(**config_kwargs)
    try:
        # The debug flag is read from the shared click context object.
        LeMatRhoDirectPipeline(
            config=pipeline_config,
            debug=ctx.obj["debug"],
        ).run()
    except KeyboardInterrupt:
        # Ctrl-C during the run: report via the project logger with exit=1.
        logger.fatal("\nAborted.", exit=1)


# ------------------------------------------------------------------------------
# Push commands
# ------------------------------------------------------------------------------
Expand Down
40 changes: 40 additions & 0 deletions src/lematerial_fetcher/database/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,14 @@ def columns(cls) -> dict[str, str]:
"space_group_it_number": "INTEGER",
"cross_compatibility": "BOOLEAN",
"bawl_fingerprint": "TEXT",
"compressed_charge_density": "JSONB",
"compressed_aeccar0": "JSONB",
"compressed_aeccar1": "JSONB",
"compressed_aeccar2": "JSONB",
"charge_density_grid_shape": "INTEGER[]",
"bader_charges": "FLOAT[]",
"bader_atomic_volume": "FLOAT[]",
"ddec6_charges": "FLOAT[]",
}

def _prepare_species_data(self, species: list[dict[str, Any]]) -> list[Json]:
Expand Down Expand Up @@ -572,6 +580,14 @@ def insert_data(self, structure: OptimadeStructure) -> None:
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
)
cur.execute(query, input_data)
self.conn.commit()
Expand Down Expand Up @@ -639,6 +655,14 @@ def batch_insert_data(
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
)
)

Expand Down Expand Up @@ -740,6 +764,14 @@ def insert_data(self, structure: Trajectory) -> None:
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
Comment thread
Ramlaoui marked this conversation as resolved.
Outdated
# trajectory-specific fields
structure.relaxation_step,
structure.relaxation_number,
Expand Down Expand Up @@ -810,6 +842,14 @@ def batch_insert_data(
structure.space_group_it_number,
structure.cross_compatibility,
structure.bawl_fingerprint,
Json(structure.compressed_charge_density),
Json(structure.compressed_aeccar0),
Json(structure.compressed_aeccar1),
Json(structure.compressed_aeccar2),
structure.charge_density_grid_shape,
structure.bader_charges,
structure.bader_atomic_volume,
structure.ddec6_charges,
# trajectory-specific fields
structure.relaxation_step,
structure.relaxation_number,
Expand Down
2 changes: 1 addition & 1 deletion src/lematerial_fetcher/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def db(self) -> StructuresDatabase:

Returns
-------
StructuresDatabase
StructuresDatabase
Database connection
"""
if self._db is None:
Expand Down
1 change: 1 addition & 0 deletions src/lematerial_fetcher/fetcher/lematrho/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Copyright 2025 Entalpic
Loading
Loading