Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 106 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Build stage: installs uv, resolves the locked dependencies and installs the
# project into a virtual environment under /app/.venv. Only artifacts that the
# final stage copies out of this stage end up in the shipped image.
FROM python:3.11-slim AS builder

# Build-time tooling: curl and ca-certificates are needed by the uv installer
# script; build-essential is presumably required by dependencies that compile
# native extensions. The apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Download and run the uv installer, then put its install location on PATH.
# NOTE(review): the installer is fetched without a pinned version or checksum;
# consider pinning it for reproducible builds.
ADD https://astral.sh/uv/install.sh /uv-installer.sh
RUN sh /uv-installer.sh && rm /uv-installer.sh
ENV PATH="/root/.local/bin/:$PATH"

# Install the project into `/app`
WORKDIR /app

# Enable bytecode compilation
ENV UV_COMPILE_BYTECODE=1

# Copy from the cache instead of linking since it's a mounted volume
ENV UV_LINK_MODE=copy

# Install the project's dependencies using the lockfile and settings.
# Only uv.lock and pyproject.toml are bind-mounted here, so this layer is
# reused until one of those two files changes.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync --locked --no-install-project --no-dev

# Then, add the rest of the project source code and install it
# Installing separately from its dependencies allows optimal layer caching
# NOTE(review): `COPY .` copies the whole build context; make sure a
# .dockerignore excludes local artifacts such as .venv, .git and .env.
COPY . /app
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked --no-dev

# Make the project's virtual environment the default Python environment.
ENV PATH="/app/.venv/bin:$PATH"

# Final stage: an all-in-one runtime image that bundles the application with
# local PostgreSQL and MariaDB servers. The database layers are built first
# and the application files are copied afterwards, so that a change to the
# project source does not invalidate the expensive database install and
# initialisation layers.
FROM python:3.11-slim

# Database servers used by the pipeline.
RUN apt-get update && apt-get install -y \
        default-mysql-server \
        postgresql \
        postgresql-contrib \
    && rm -rf /var/lib/apt/lists/*

# Set up PostgreSQL: a passwordless superuser role `root`, the `lematerial`
# database and a `lematerial` superuser role with a password. The server is
# only started for the duration of this build step.
RUN service postgresql start && \
    su - postgres -c "createuser -s root" && \
    su - postgres -c "createdb lematerial" && \
    su - postgres -c "psql -c \"CREATE USER lematerial WITH PASSWORD 'lematerial';\"" && \
    su - postgres -c "psql -c \"ALTER USER lematerial WITH SUPERUSER;\"" && \
    su - postgres -c "psql -c \"GRANT ALL PRIVILEGES ON DATABASE lematerial TO lematerial;\"" && \
    # Update pg_hba.conf to use md5 authentication for the lematerial user
    pg_hba_path=$(find /etc/postgresql -name "pg_hba.conf") && \
    echo "host all lematerial 127.0.0.1/32 md5" >> "$pg_hba_path" && \
    echo "host all lematerial ::1/128 md5" >> "$pg_hba_path" && \
    service postgresql stop

# Create the MariaDB data/runtime directories and hand them to the mysql user.
RUN mkdir -p /var/lib/mysql /var/run/mysqld /docker-entrypoint-initdb.d && \
    chown -R mysql:mysql /var/lib/mysql /var/run/mysqld

# Initialize MariaDB: system tables, the `lematerial` database and a
# `lematerial`@`localhost` account with full privileges on that database.
RUN mysql_install_db --user=mysql --datadir=/var/lib/mysql && \
    service mariadb start && \
    mariadb -u root -e "CREATE DATABASE IF NOT EXISTS lematerial;" && \
    mariadb -u root -e "CREATE USER 'lematerial'@'localhost' IDENTIFIED BY 'lematerial';" && \
    mariadb -u root -e "GRANT ALL PRIVILEGES ON lematerial.* TO 'lematerial'@'localhost';" && \
    mariadb -u root -e "FLUSH PRIVILEGES;" && \
    service mariadb stop

# Copy Python environment and app from builder.
# /app contains the project and its virtual environment (/app/.venv).
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin
COPY --from=builder /app /app

# Put the project's virtual environment first on PATH.
ENV PATH="/app/.venv/bin:$PATH"

# Create necessary directories
RUN mkdir -p /app/logs /root/.cache/lematerial_fetcher

# Copy startup script (made executable in the same layer)
COPY --chmod=755 lemat-traj.sh /usr/local/bin/

# Set environment variables.
# These defaults match the database accounts created during the build above;
# overriding a password at `docker run` time does not change the password
# stored in the bundled databases.
ENV LEMATERIALFETCHER_DB_PASSWORD=lematerial \
    LEMATERIALFETCHER_MYSQL_PASSWORD=lematerial \
    LEMATERIALFETCHER_DB_USER=lematerial \
    LEMATERIALFETCHER_DB_NAME=lematerial \
    LEMATERIALFETCHER_MYSQL_USER=lematerial \
    LEMATERIALFETCHER_MYSQL_DATABASE=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_USER=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_PASSWORD=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_NAME=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_USER=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_PASSWORD=lematerial \
    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_NAME=lematerial

# Expose ports (documentation only; publish them with `docker run -p`)
EXPOSE 5432 3306

# No entrypoint: whatever command is given to `docker run` replaces the
# default command below (e.g. `bash` for an interactive shell).
ENTRYPOINT []

# Default command: start both databases and run the pipeline.
CMD ["lemat-traj.sh"]
69 changes: 69 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -225,3 +225,72 @@ This code base is the property of Entalpic and is licensed under the Apache Lice
```text
Copyright 2025 Entalpic
```

## Docker Setup

For easy deployment and execution, we provide a Docker setup that includes both PostgreSQL and MySQL databases. This setup allows you to run the entire pipeline with a single command.

### Building the Docker Image

```bash
docker build -t lematerial-fetcher .
```

### Running the Pipeline

1. **Basic Run**
```bash
docker run -it lematerial-fetcher
```

2. **With Hugging Face Integration**
```bash
docker run -it \
-e LEMATERIALFETCHER_HF_TOKEN=your_huggingface_token \
lematerial-fetcher
```

3. **Interactive Shell**
```bash
docker run -it lematerial-fetcher bash
```

4. **Persistent Data Storage**
```bash
docker run -it \
-v $(pwd)/data:/app/data \
-v $(pwd)/logs:/app/logs \
lematerial-fetcher
```

### Database Access

The Docker container exposes the following ports:
- PostgreSQL: 5432
- MySQL: 3306

You can connect to the databases using these credentials (the accounts created when the image is built):
- PostgreSQL:
  - Host: localhost
  - Port: 5432
  - User: lematerial
  - Password: lematerial
  - Database: lematerial

- MySQL:
  - Host: localhost
  - Port: 3306
  - User: lematerial
  - Password: lematerial
  - Database: lematerial

### Environment Variables

You can customize the setup using environment variables:
```bash
docker run -it \
-e LEMATERIALFETCHER_DB_PASSWORD=your_password \
-e LEMATERIALFETCHER_MYSQL_PASSWORD=your_mysql_password \
-e LEMATERIALFETCHER_HF_TOKEN=your_huggingface_token \
lematerial-fetcher
```
60 changes: 60 additions & 0 deletions lemat-traj.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Container startup script: starts the bundled PostgreSQL and MariaDB
# services, runs the Materials Project fetch/transform pipeline and, when
# LEMATERIALFETCHER_HF_TOKEN is set, pushes the transformed table to
# Hugging Face. Any failing step aborts the script (set -e).
set -e

echo "WARNING: Make sure that your .env file is not interfering with the Docker's environment variables."

# Start PostgreSQL and MariaDB services
echo "Starting PostgreSQL service..."
service postgresql start

echo "Starting MariaDB service..."
service mariadb start

echo "Starting the full pipeline..."
source /app/.venv/bin/activate

# Materials Project Pipeline
echo "Fetching Materials Project structures..."
lematerial-fetcher mp fetch --table-name mp_structures --num-workers 12

echo "Fetching Materials Project tasks..."
lematerial-fetcher mp fetch --tasks --table-name mp_tasks --num-workers 12

echo "Transforming Materials Project data..."
lematerial-fetcher mp transform --traj --task-source-table-name mp_tasks --table-name mp_structures --dest-table-name transformed_mp_structures --num-workers 12 --batch-size 1000 --db-fetch-batch-size 10

# UNCOMMENT FOR ALEXANDRIA

# # Alexandria Pipeline
# echo "Fetching Alexandria trajectories..."
# lematerial-fetcher alexandria fetch --traj --table-name alex_structures --functional pbe --num-workers 10
# lematerial-fetcher alexandria fetch --traj --table-name alex_structures --functional pbesol --num-workers 10

# echo "Transforming Alexandria data..."
# lematerial-fetcher alexandria transform --traj --table-name alex_structures --dest-table-name transformed_alex_structures --num-workers 10 --batch-size 1000 --db-fetch-batch-size 10

# UNCOMMENT FOR OQMD

# # OQMD Pipeline
# echo "Fetching OQMD structures..."
# lematerial-fetcher oqmd fetch --table-name oqmd_structures

# echo "Transforming OQMD data..."
# lematerial-fetcher oqmd transform --traj --table-name oqmd_structures --dest-table-name transformed_oqmd_structures --num-workers 10 --batch-size 1000 --db-fetch-batch-size 10

# Push to Hugging Face (if HF_TOKEN is set)
if [ -n "$LEMATERIALFETCHER_HF_TOKEN" ]; then
    echo "Pushing data to Hugging Face..."

    # Tables to push are collected in an array so that optional tables can be
    # toggled with comments. A comment line placed inside a backslash-continued
    # command would end the command at that point and drop every later option.
    push_tables=(--table-name transformed_mp_structures)
    # UNCOMMENT FOR ALEXANDRIA and OQMD
    # push_tables+=(--table-name transformed_alex_structures)
    # push_tables+=(--table-name transformed_oqmd_structures)

    lematerial-fetcher push "${push_tables[@]}" \
        --hf-repo-id LeMat-Traj \
        --chunk-size 1000000 \
        --num-workers 12 \
        --hf-token "$LEMATERIALFETCHER_HF_TOKEN"
else
    echo "Skipping Hugging Face push (HF_TOKEN not set)"
fi

echo "Pipeline completed successfully!"
27 changes: 27 additions & 0 deletions run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env sh

# FROM: https://github.com/astral-sh/uv-docker-example/blob/main/run.sh
#
# Builds the image from the current directory and runs it as a container
# named `lematerial`, forwarding any arguments to the container.
#
# --rm                  Remove the container after exiting
# -p 5432:5432          Publish container port 5432 on the host
# -p 3306:3306          Publish container port 3306 on the host
# --volume .:/app       Mount the current directory to `/app` so code changes don't require an image rebuild
# --volume /app/.venv   Mount the virtual environment separately, so the developer's environment doesn't end up in the container
# -it                   Added only when stdout is a terminal
# $@                    Pass any arguments to the container

if [ -t 1 ]; then
    INTERACTIVE="-it"
else
    INTERACTIVE=""
fi

# Build first and stop on failure: with an empty image id, `docker run` would
# otherwise treat the first user-supplied argument as the image name.
IMAGE=$(docker build -q .) || {
    echo "docker build failed; not starting the container." >&2
    exit 1
}

# $INTERACTIVE is intentionally unquoted so that an empty value expands to
# no argument at all.
docker run \
    --rm \
    -p 5432:5432 \
    -p 3306:3306 \
    --volume .:/app \
    --volume /app/.venv \
    $INTERACTIVE \
    --name lematerial \
    "$IMAGE" \
    "$@"
1 change: 0 additions & 1 deletion src/lematerial_fetcher/fetcher/mp/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ def get_items_to_process(self) -> ItemsInfo:
self.config.mp_bucket_name == "materialsproject-build"
and self.config.mp_bucket_prefix in ["collections", "collections/"]
):
breakpoint()
prefix = get_latest_collection_version_prefix(
self.aws_client,
self.config.mp_bucket_name,
Expand Down