diff --git a/src/lematerial_fetcher/fetcher/lematrho/pipeline.py b/src/lematerial_fetcher/fetcher/lematrho/pipeline.py index 711a6b7..ac3ea96 100644 --- a/src/lematerial_fetcher/fetcher/lematrho/pipeline.py +++ b/src/lematerial_fetcher/fetcher/lematrho/pipeline.py @@ -20,14 +20,13 @@ from lematerial_fetcher.fetcher.lematrho.utils import ( GRID_KEY_MAP, - RELAX_CALC_TYPE, STATIC_CALC_TYPE, STATIC_FILES, VALID_PREFIXES, compress_chgcar, download_gz_file_from_s3, get_cross_compatibility, - parse_vasprun_structure, + parse_vasprun_output, run_bader_from_bytes, run_ddec6_from_bytes, ) @@ -487,27 +486,16 @@ def _process_material( # analysis work per material, so this is not a bottleneck. aws_client = get_aws_client(authenticated=True) - # Step 1: Download and parse vasprun.xml.gz for structure - vasprun_key = f"{material_id}/{RELAX_CALC_TYPE}/vasprun.xml.gz" - try: - vasprun_bytes = LeMatRhoDirectPipeline._download_with_retry( - aws_client, bucket, vasprun_key - ) - structure = parse_vasprun_structure(vasprun_bytes) - del vasprun_bytes - except Exception as e: - logger.warning(f"Failed to parse vasprun.xml.gz for {material_id}: {e}") - return None - - # Step 2: Download and compress charge density files - compressed_grids = {} + # Step 1: Download all static files (vasprun.xml.gz + charge density files). # Memory trade-off: raw decompressed bytes are kept in memory for # Bader/DDEC6 analysis to avoid a second S3 download. Each CHGCAR # can be 100-500 MB, so peak RSS per worker ≈ sum of needed files. + compressed_grids = {} raw_files = {} - # Determine which raw files to keep - need_raw = set() + # Determine which raw files to keep (use .gz names, matching STATIC_FILES). + # vasprun.xml.gz is always needed for structure, forces, stress, and energy. + need_raw = {"vasprun.xml.gz"} if tool_paths["can_run_bader"]: need_raw |= _BADER_FILES if tool_paths["can_run_ddec6"]: @@ -515,20 +503,20 @@ def _process_material( for filename in STATIC_FILES: s3_key = f"{material_id}/{STATIC_CALC_TYPE}/{filename}" - grid_name = GRID_KEY_MAP[filename] + vasp_name = filename.replace(".gz", "") try: raw_bytes = LeMatRhoDirectPipeline._download_with_retry( aws_client, bucket, s3_key ) - # Compress via pyrho - compressed = compress_chgcar(raw_bytes, grid_shape) - compressed_grids[grid_name] = compressed - del compressed + # Compress charge density files via pyrho (skip vasprun.xml) + if filename in GRID_KEY_MAP: + compressed = compress_chgcar(raw_bytes, grid_shape) + compressed_grids[GRID_KEY_MAP[filename]] = compressed + del compressed - # Keep raw bytes if needed for analysis + # Keep raw bytes if needed downstream (stored without .gz suffix) if filename in need_raw: - vasp_name = filename.replace(".gz", "") raw_files[vasp_name] = raw_bytes del raw_bytes except Exception as e: @@ -536,6 +524,18 @@ def _process_material( f"Failed to process {filename} for {material_id}: {e}" ) + # Step 2: Parse structure, forces, stress, and energy from static vasprun + if "vasprun.xml" not in raw_files: + logger.warning(f"Missing vasprun.xml.gz for {material_id}, skipping.") + return None + try: + structure, static_forces, static_stress, static_energy = ( + parse_vasprun_output(raw_files.pop("vasprun.xml")) + ) + except Exception as e: + logger.warning(f"Failed to parse vasprun.xml.gz for {material_id}: {e}") + return None + # Step 3: Bader analysis (if tools available and files downloaded) bader_charges = None bader_atomic_volume = None @@ -570,8 +570,11 @@ def _process_material( immutable_id=material_id, last_modified=datetime.now(), **optimade_dict, - functional=Functional.PBE, + functional=Functional.r2SCAN, cross_compatibility=cross_compatibility, + forces=static_forces, + stress_tensor=static_stress, + energy=static_energy, compressed_charge_density=compressed_grids.get("charge_density"), compressed_aeccar0=compressed_grids.get("aeccar0"), compressed_aeccar1=compressed_grids.get("aeccar1"), diff --git a/src/lematerial_fetcher/fetcher/lematrho/utils.py b/src/lematerial_fetcher/fetcher/lematrho/utils.py index e3cb022..10e1249 100644 --- a/src/lematerial_fetcher/fetcher/lematrho/utils.py +++ b/src/lematerial_fetcher/fetcher/lematrho/utils.py @@ -12,6 +12,7 @@ import tempfile from typing import Any, Optional +import numpy as np from pymatgen.command_line.bader_caller import BaderAnalysis from pymatgen.command_line.chargemol_caller import ChargemolAnalysis from pymatgen.core import Structure @@ -21,9 +22,7 @@ # ── S3 folder structure constants ────────────────────────────────────────────── STATIC_CALC_TYPE = "LeMatRhoStaticMaker" -RELAX_CALC_TYPE = "LeMatRhoRelaxMaker_1" -STATIC_FILES = ["CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"] -RELAX_FILES = ["vasprun.xml.gz"] +STATIC_FILES = ["vasprun.xml.gz", "CHGCAR.gz", "AECCAR0.gz", "AECCAR1.gz", "AECCAR2.gz"] # Only process materials with these ID prefixes VALID_PREFIXES = ("oqmd-", "mp-", "agm") @@ -76,17 +75,25 @@ def download_gz_file_from_s3(client: Any, bucket: str, key: str) -> bytes: body.close() -def parse_vasprun_structure(vasprun_bytes: bytes) -> Structure: - """Parse a vasprun.xml to extract the final relaxed structure. +def parse_vasprun_output( + vasprun_bytes: bytes, +) -> tuple[Structure, Optional[list[list[float]]], Optional[list[list[float]]], Optional[float]]: + """Parse a vasprun.xml: final structure, last ionic-step forces/stress, and total energy. - Writes bytes to a temporary file because pymatgen's ``Vasprun`` requires - a filesystem path, not a file-like object. + Intended for the ``LeMatRhoStaticMaker`` vasprun (``NSW=0``), where the structure + is already fully relaxed and ``ionic_steps[-1]`` holds the residual forces and + stress at the relaxed geometry. Writes bytes to a temporary file because + pymatgen's ``Vasprun`` requires a filesystem path, not a file-like object. Args: vasprun_bytes: Raw vasprun.xml content. Returns: - The final relaxed pymatgen Structure. + Tuple of (final_structure, forces, stress_tensor, energy) where: + - final_structure: pymatgen Structure + - forces: nsites × 3 (eV/Å), or None if absent + - stress_tensor: 3 × 3 (kBar), or None if absent + - energy: total energy in eV from ``Vasprun.final_energy``, or None if absent """ with tempfile.TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, "vasprun.xml") @@ -98,7 +105,35 @@ def parse_vasprun_structure(vasprun_bytes: bytes) -> Structure: parse_eigen=False, parse_potcar_file=False, ) - return vasprun.final_structure + + structure = vasprun.final_structure + forces_out: Optional[list[list[float]]] = None + stress_out: Optional[list[list[float]]] = None + energy_out: Optional[float] = None + + if vasprun.ionic_steps: + final = vasprun.ionic_steps[-1] + frc = final.get("forces") # Forces in nsites × 3 (eV/Å) + if frc is not None: + forces_out = np.asarray(frc, dtype=float).reshape(-1, 3).tolist() + + strs = final.get("stress") # Stress tensor in 3 × 3 (kBar) + if strs is not None: + s = np.asarray(strs, dtype=float) + if s.shape == (3, 3): + stress_out = s.tolist() + elif s.size == 9: + stress_out = s.reshape(3, 3).tolist() + elif s.size == 6: + xx, yy, zz, xy, yz, xz = (float(x) for x in s.flat[:6]) + stress_out = [[xx, xy, xz], [xy, yy, yz], [xz, yz, zz]] + + try: + energy_out = float(vasprun.final_energy) + except Exception: + pass + + return structure, forces_out, stress_out, energy_out def compress_chgcar(chgcar_bytes: bytes, grid_shape: tuple[int, int, int]) -> list: diff --git a/tests/fetcher/lematrho/test_lematrho_pipeline.py b/tests/fetcher/lematrho/test_lematrho_pipeline.py index 73f71fb..30a989e 100644 --- a/tests/fetcher/lematrho/test_lematrho_pipeline.py +++ b/tests/fetcher/lematrho/test_lematrho_pipeline.py @@ -21,6 +21,7 @@ _structure_to_row, ) from lematerial_fetcher.fetcher.lematrho.utils import ( + parse_vasprun_output, run_bader_from_bytes, run_ddec6_from_bytes, ) @@ -196,7 +197,7 @@ def test_excludes_processed_ids(self, mock_get_client, mock_config): class TestProcessMaterial: @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_happy_path_no_tools( @@ -220,7 +221,7 @@ def test_happy_path_no_tools( ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]], ) - mock_parse_vasprun.return_value = structure + mock_parse_vasprun.return_value = (structure, None, None, None) mock_compress.return_value = [[[1.0] * 10] * 10] * 10 mock_get_optimade.return_value = _make_mock_optimade_dict() @@ -233,7 +234,7 @@ def test_happy_path_no_tools( # Check key fields assert result["immutable_id"] == "mp-123" - assert result["functional"] == "pbe" + assert result["functional"] == "r2scan" assert result["cross_compatibility"] is True assert result["nsites"] == 2 assert result["charge_density_grid_shape"] == [10, 10, 10] @@ -257,7 +258,7 @@ def test_happy_path_no_tools( @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_missing_vasprun_returns_none( @@ -280,7 +281,7 @@ def test_missing_vasprun_returns_none( @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_partial_charge_files( @@ -308,7 +309,7 @@ def download_side_effect(client, bucket, key): structure = Structure( Lattice.cubic(3.0), ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] ) - mock_parse_vasprun.return_value = structure + mock_parse_vasprun.return_value = (structure, None, None, None) mock_compress.return_value = [[[1.0]]] mock_get_optimade.return_value = _make_mock_optimade_dict() @@ -326,7 +327,7 @@ def download_side_effect(client, bucket, key): @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_cross_compatibility_excludes_yb( @@ -347,7 +348,7 @@ def test_cross_compatibility_excludes_yb( structure = Structure( Lattice.cubic(3.0), ["Yb", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] ) - mock_parse_vasprun.return_value = structure + mock_parse_vasprun.return_value = (structure, None, None, None) mock_compress.return_value = [[[1.0]]] optimade_dict = _make_mock_optimade_dict() @@ -385,7 +386,7 @@ def test_cross_compatibility_excludes_yb( @patch("lematerial_fetcher.fetcher.lematrho.pipeline.run_bader_from_bytes") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_bader_failure_still_returns_result( @@ -407,7 +408,7 @@ def test_bader_failure_still_returns_result( structure = Structure( Lattice.cubic(3.0), ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] ) - mock_parse_vasprun.return_value = structure + mock_parse_vasprun.return_value = (structure, None, None, None) mock_compress.return_value = [[[1.0]]] mock_get_optimade.return_value = _make_mock_optimade_dict() mock_bader.return_value = (None, None) @@ -1265,6 +1266,98 @@ def test_partial_charge_fields(self): assert row["ddec6_charges"] is None +# --------------------------------------------------------------------------- +# TestVasprunForces +# --------------------------------------------------------------------------- + + +class TestVasprunForces: + def test_parse_vasprun_output_uses_last_ionic_step_and_returns_energy(self): + """parse_vasprun_output returns forces, stress, and energy from the static vasprun.""" + + from pymatgen.core import Lattice, Structure + + # Build a minimal valid vasprun.xml matching pymatgen's Vasprun parser + # We mock at the Vasprun level to avoid needing a real file + mock_vasprun = MagicMock() + structure = Structure( + Lattice.cubic(3.0), ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] + ) + mock_vasprun.final_structure = structure + mock_vasprun.ionic_steps = [ + { + "forces": [[0.1, 0.2, 0.3], [-0.1, -0.2, -0.3]], + "stress": [[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]], + } + ] + mock_vasprun.final_energy = -12.345 + + with patch("lematerial_fetcher.fetcher.lematrho.utils.Vasprun", return_value=mock_vasprun): + result_structure, forces, stress, energy = parse_vasprun_output(b"dummy") + + assert result_structure is structure + assert forces == [[0.1, 0.2, 0.3], [-0.1, -0.2, -0.3]] + assert stress == [[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]] + assert energy == pytest.approx(-12.345) + + def test_parse_vasprun_output_energy_none_when_missing(self): + """parse_vasprun_output returns None energy when final_energy is not parseable.""" + from pymatgen.core import Lattice, Structure + + mock_vasprun = MagicMock() + structure = Structure(Lattice.cubic(3.0), ["Si"], [[0, 0, 0]]) + mock_vasprun.final_structure = structure + mock_vasprun.ionic_steps = [] + mock_vasprun.final_energy = "not_a_number" + + with patch("lematerial_fetcher.fetcher.lematrho.utils.Vasprun", return_value=mock_vasprun): + _, forces, stress, energy = parse_vasprun_output(b"dummy") + + assert forces is None + assert stress is None + assert energy is None + + def test_vasprun_forces_and_stress_in_row(self, mock_config, no_tools): + """Forces, stress_tensor, energy, energy_corrected, and functional reach the Parquet row.""" + from pymatgen.core import Lattice, Structure + + structure = Structure( + Lattice.cubic(3.0), ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] + ) + mock_forces = [[0.05, 0.0, 0.0], [-0.05, 0.0, 0.0]] + mock_stress = [[1.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 3.0]] + mock_energy = -12.345 + + with ( + patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") as mock_client, + patch( + "lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3", + return_value=b"mock", + ), + patch( + "lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output", + return_value=(structure, mock_forces, mock_stress, mock_energy), + ), + patch( + "lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar", + return_value=[[[1.0]]], + ), + patch( + "lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen", + return_value=_make_mock_optimade_dict(), + ), + ): + mock_client.return_value = MagicMock() + result = LeMatRhoDirectPipeline._process_material("mp-123", mock_config, no_tools) + + assert result is not None + assert result["forces"] == mock_forces + assert result["stress_tensor"] == mock_stress + assert result["energy"] == pytest.approx(mock_energy) + assert result["energy_corrected"] == pytest.approx(mock_energy) # R2SCAN: no correction + assert result["functional"] == "r2scan" + + # --------------------------------------------------------------------------- # TestPushToHuggingface # --------------------------------------------------------------------------- @@ -1394,7 +1487,7 @@ class TestProcessMaterialWithDdec6: @patch("lematerial_fetcher.fetcher.lematrho.pipeline.run_ddec6_from_bytes") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_optimade_from_pymatgen") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.compress_chgcar") - @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_structure") + @patch("lematerial_fetcher.fetcher.lematrho.pipeline.parse_vasprun_output") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.download_gz_file_from_s3") @patch("lematerial_fetcher.fetcher.lematrho.pipeline.get_aws_client") def test_ddec6_populates_charges( @@ -1416,7 +1509,7 @@ def test_ddec6_populates_charges( structure = Structure( Lattice.cubic(3.0), ["Si", "O"], [[0, 0, 0], [0.5, 0.5, 0.5]] ) - mock_parse_vasprun.return_value = structure + mock_parse_vasprun.return_value = (structure, None, None, None) mock_compress.return_value = [[[1.0] * 10] * 10] * 10 mock_get_optimade.return_value = _make_mock_optimade_dict() mock_ddec6.return_value = [0.3, -0.3] @@ -1512,6 +1605,6 @@ def test_process_single_material(self): result = LeMatRhoDirectPipeline._process_material(material_id, config, no_tools) assert result is not None assert result["immutable_id"] == material_id - assert result["functional"] == "pbe" + assert result["functional"] == "r2scan" for col in PARQUET_COLUMNS: assert col in result