Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 29 additions & 26 deletions src/lematerial_fetcher/fetcher/lematrho/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@

from lematerial_fetcher.fetcher.lematrho.utils import (
GRID_KEY_MAP,
RELAX_CALC_TYPE,
STATIC_CALC_TYPE,
STATIC_FILES,
VALID_PREFIXES,
compress_chgcar,
download_gz_file_from_s3,
get_cross_compatibility,
parse_vasprun_structure,
parse_vasprun_output,
run_bader_from_bytes,
run_ddec6_from_bytes,
)
Expand Down Expand Up @@ -487,55 +486,56 @@ def _process_material(
# analysis work per material, so this is not a bottleneck.
aws_client = get_aws_client(authenticated=True)

# Step 1: Download and parse vasprun.xml.gz for structure
vasprun_key = f"{material_id}/{RELAX_CALC_TYPE}/vasprun.xml.gz"
try:
vasprun_bytes = LeMatRhoDirectPipeline._download_with_retry(
aws_client, bucket, vasprun_key
)
structure = parse_vasprun_structure(vasprun_bytes)
del vasprun_bytes
except Exception as e:
logger.warning(f"Failed to parse vasprun.xml.gz for {material_id}: {e}")
return None

# Step 2: Download and compress charge density files
compressed_grids = {}
# Step 1: Download all static files (vasprun.xml.gz + charge density files).
# Memory trade-off: raw decompressed bytes are kept in memory for
# Bader/DDEC6 analysis to avoid a second S3 download. Each CHGCAR
# can be 100-500 MB, so peak RSS per worker ≈ sum of needed files.
compressed_grids = {}
raw_files = {}

# Determine which raw files to keep
need_raw = set()
# Determine which raw files to keep (use .gz names, matching STATIC_FILES).
# vasprun.xml.gz is always needed for structure, forces, stress, and energy.
need_raw = {"vasprun.xml.gz"}
if tool_paths["can_run_bader"]:
need_raw |= _BADER_FILES
if tool_paths["can_run_ddec6"]:
need_raw |= _DDEC6_FILES

for filename in STATIC_FILES:
s3_key = f"{material_id}/{STATIC_CALC_TYPE}/{filename}"
grid_name = GRID_KEY_MAP[filename]
vasp_name = filename.replace(".gz", "")
try:
raw_bytes = LeMatRhoDirectPipeline._download_with_retry(
aws_client, bucket, s3_key
)

# Compress via pyrho
compressed = compress_chgcar(raw_bytes, grid_shape)
compressed_grids[grid_name] = compressed
del compressed
# Compress charge density files via pyrho (skip vasprun.xml)
if filename in GRID_KEY_MAP:
compressed = compress_chgcar(raw_bytes, grid_shape)
compressed_grids[GRID_KEY_MAP[filename]] = compressed
del compressed

# Keep raw bytes if needed for analysis
# Keep raw bytes if needed downstream (stored without .gz suffix)
if filename in need_raw:
vasp_name = filename.replace(".gz", "")
raw_files[vasp_name] = raw_bytes
del raw_bytes
except Exception as e:
logger.warning(
f"Failed to process {filename} for {material_id}: {e}"
)

# Step 2: Parse structure, forces, stress, and energy from static vasprun
if "vasprun.xml" not in raw_files:
logger.warning(f"Missing vasprun.xml.gz for {material_id}, skipping.")
return None
try:
structure, static_forces, static_stress, static_energy = (
parse_vasprun_output(raw_files.pop("vasprun.xml"))
)
except Exception as e:
logger.warning(f"Failed to parse vasprun.xml.gz for {material_id}: {e}")
return None

# Step 3: Bader analysis (if tools available and files downloaded)
bader_charges = None
bader_atomic_volume = None
Expand Down Expand Up @@ -570,8 +570,11 @@ def _process_material(
immutable_id=material_id,
last_modified=datetime.now(),
**optimade_dict,
functional=Functional.PBE,
functional=Functional.r2SCAN,
cross_compatibility=cross_compatibility,
forces=static_forces,
stress_tensor=static_stress,
energy=static_energy,
compressed_charge_density=compressed_grids.get("charge_density"),
compressed_aeccar0=compressed_grids.get("aeccar0"),
compressed_aeccar1=compressed_grids.get("aeccar1"),
Expand Down
53 changes: 44 additions & 9 deletions src/lematerial_fetcher/fetcher/lematrho/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import tempfile
from typing import Any, Optional

import numpy as np
from pymatgen.command_line.bader_caller import BaderAnalysis
from pymatgen.command_line.chargemol_caller import ChargemolAnalysis
from pymatgen.core import Structure
Expand All @@ -21,9 +22,7 @@

# ── S3 folder structure constants ──────────────────────────────────────────────
STATIC_CALC_TYPE = "LeMatRhoStaticMaker"
RELAX_CALC_TYPE = "LeMatRhoRelaxMaker_1"
STATIC_FILES = ["CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"]
RELAX_FILES = ["vasprun.xml.gz"]
STATIC_FILES = ["vasprun.xml.gz", "CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"]

# Only process materials with these ID prefixes
VALID_PREFIXES = ("oqmd-", "mp-", "agm")
Expand Down Expand Up @@ -76,17 +75,25 @@ def download_gz_file_from_s3(client: Any, bucket: str, key: str) -> bytes:
body.close()


def parse_vasprun_structure(vasprun_bytes: bytes) -> Structure:
"""Parse a vasprun.xml to extract the final relaxed structure.
def parse_vasprun_output(
vasprun_bytes: bytes,
) -> tuple[Structure, Optional[list[list[float]]], Optional[list[list[float]]], Optional[float]]:
"""Parse a vasprun.xml: final structure, last ionic-step forces/stress, and total energy.

Writes bytes to a temporary file because pymatgen's ``Vasprun`` requires
a filesystem path, not a file-like object.
Intended for the ``LeMatRhoStaticMaker`` vasprun (``NSW=0``), where the structure
is already fully relaxed and ``ionic_steps[-1]`` holds the residual forces and
stress at the relaxed geometry. Writes bytes to a temporary file because
pymatgen's ``Vasprun`` requires a filesystem path, not a file-like object.

Args:
vasprun_bytes: Raw vasprun.xml content.

Returns:
The final relaxed pymatgen Structure.
Tuple of (final_structure, forces, stress_tensor, energy) where:
- final_structure: pymatgen Structure
- forces: nsites × 3 (eV/Å), or None if absent
- stress_tensor: 3 × 3 (kBar), or None if absent
- energy: total energy in eV from ``Vasprun.final_energy``, or None if absent
"""
with tempfile.TemporaryDirectory() as tmpdir:
path = os.path.join(tmpdir, "vasprun.xml")
Expand All @@ -98,7 +105,35 @@ def parse_vasprun_structure(vasprun_bytes: bytes) -> Structure:
parse_eigen=False,
parse_potcar_file=False,
)
return vasprun.final_structure

structure = vasprun.final_structure
forces_out: Optional[list[list[float]]] = None
stress_out: Optional[list[list[float]]] = None
energy_out: Optional[float] = None

if vasprun.ionic_steps:
final = vasprun.ionic_steps[-1]
frc = final.get("forces") # Forces in nsites × 3 (eV/Å)
if frc is not None:
forces_out = np.asarray(frc, dtype=float).reshape(-1, 3).tolist()

strs = final.get("stress") # Stress tensor in 3 × 3 (kBar)
if strs is not None:
s = np.asarray(strs, dtype=float)
if s.shape == (3, 3):
stress_out = s.tolist()
elif s.size == 9:
stress_out = s.reshape(3, 3).tolist()
elif s.size == 6:
xx, yy, zz, xy, yz, xz = (float(x) for x in s.flat[:6])
stress_out = [[xx, xy, xz], [xy, yy, yz], [xz, yz, zz]]

try:
energy_out = float(vasprun.final_energy)
except Exception:
pass

return structure, forces_out, stress_out, energy_out


def compress_chgcar(chgcar_bytes: bytes, grid_shape: tuple[int, int, int]) -> list:
Expand Down
Loading
Loading