diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..086e654
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,104 @@
+FROM python:3.11-slim AS builder
+
+# Build-time tools only: this stage is discarded, so the apt lists are removed in the same layer
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Download uv
+ADD https://astral.sh/uv/install.sh /uv-installer.sh
+RUN sh /uv-installer.sh && rm /uv-installer.sh
+ENV PATH="/root/.local/bin/:$PATH"
+
+# Install the project into `/app`
+WORKDIR /app
+
+# Enable bytecode compilation
+ENV UV_COMPILE_BYTECODE=1
+
+# Copy from the cache instead of linking since it's a mounted volume
+ENV UV_LINK_MODE=copy
+
+# Install the project's dependencies using the lockfile and settings
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=uv.lock,target=uv.lock \
+    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
+    uv sync --locked --no-install-project --no-dev
+
+# Then, add the rest of the project source code and install it
+# Installing separately from its dependencies allows optimal layer caching
+COPY . /app
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv sync --locked --no-dev
+
+# Final stage
+FROM python:3.11-slim
+
+ENV PATH="/app/.venv/bin:$PATH"
+
+# Copy Python environment and app from builder
+COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+COPY --from=builder /app /app
+
+# The database servers run inside this image, so they are installed in the final stage only
+RUN apt-get update && apt-get install -y \
+    postgresql \
+    postgresql-contrib \
+    default-mysql-server \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set up PostgreSQL
+RUN service postgresql start && \
+    su - postgres -c "createuser -s root" && \
+    su - postgres -c "createdb lematerial" && \
+    su - postgres -c "psql -c \"CREATE USER lematerial WITH PASSWORD 'lematerial';\"" && \
+    su - postgres -c "psql -c \"ALTER USER lematerial WITH SUPERUSER;\"" && \
+    su - postgres -c "psql -c \"GRANT ALL PRIVILEGES ON DATABASE lematerial TO lematerial;\"" && \
+    # Update pg_hba.conf to use md5 authentication for the lematerial user
+    pg_hba_path=$(find /etc/postgresql -name "pg_hba.conf") && \
+    echo "host all lematerial 127.0.0.1/32 md5" >> "$pg_hba_path" && \
+    echo "host all lematerial ::1/128 md5" >> "$pg_hba_path" && \
+    service postgresql stop
+
+# Create necessary directories and set permissions
+RUN mkdir -p /var/lib/mysql /var/run/mysqld /docker-entrypoint-initdb.d && \
+    chown -R mysql:mysql /var/lib/mysql /var/run/mysqld
+
+# Initialize MariaDB
+RUN mysql_install_db --user=mysql --datadir=/var/lib/mysql && \
+    service mariadb start && \
+    mariadb -u root -e "CREATE DATABASE IF NOT EXISTS lematerial;" && \
+    mariadb -u root -e "CREATE USER 'lematerial'@'localhost' IDENTIFIED BY 'lematerial';" && \
+    mariadb -u root -e "GRANT ALL PRIVILEGES ON lematerial.* TO 'lematerial'@'localhost';" && \
+    mariadb -u root -e "FLUSH PRIVILEGES;" && \
+    service mariadb stop
+
+# Create necessary directories
+RUN mkdir -p /app/logs /root/.cache/lematerial_fetcher
+
+# Copy startup script; --chmod sets the executable bit without a separate RUN layer
+COPY --chmod=755 lemat-traj.sh /usr/local/bin/
+
+# Set environment variables
+ENV LEMATERIALFETCHER_DB_PASSWORD=lematerial \
+    LEMATERIALFETCHER_MYSQL_PASSWORD=lematerial \
+    LEMATERIALFETCHER_DB_USER=lematerial \
+    LEMATERIALFETCHER_DB_NAME=lematerial \
+    LEMATERIALFETCHER_MYSQL_USER=lematerial \
+    LEMATERIALFETCHER_MYSQL_DATABASE=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_USER=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_PASSWORD=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_SOURCE_DB_NAME=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_USER=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_PASSWORD=lematerial \
+    LEMATERIALFETCHER_TRANSFORMER_DEST_DB_NAME=lematerial
+
+# Expose ports
+EXPOSE 5432 3306
+
+ENTRYPOINT []
+
+CMD ["lemat-traj.sh"]
diff --git a/README.md b/README.md
index 3709e10..20a9547 100644
--- a/README.md
+++ b/README.md
@@ -225,3 +225,74 @@ This code base is the property of Entalpic and is licensed under the Apache Lice
 ```text
 Copyright 2025 Entalpic
 ```
+
+## Docker Setup
+
+For easy deployment and execution, we provide a Docker setup that includes both PostgreSQL and MySQL databases. This setup allows you to run the entire pipeline with a single command.
+
+### Building the Docker Image
+
+```bash
+docker build -t lematerial-fetcher .
+```
+
+### Running the Pipeline
+
+1. **Basic Run**
+   ```bash
+   docker run -it lematerial-fetcher
+   ```
+
+2. **With Hugging Face Integration**
+   ```bash
+   docker run -it \
+     -e LEMATERIALFETCHER_HF_TOKEN=your_huggingface_token \
+     lematerial-fetcher
+   ```
+
+3. **Interactive Shell**
+   ```bash
+   docker run -it lematerial-fetcher bash
+   ```
+
+4. **Persistent Data Storage**
+   ```bash
+   docker run -it \
+     -v $(pwd)/data:/app/data \
+     -v $(pwd)/logs:/app/logs \
+     lematerial-fetcher
+   ```
+
+### Database Access
+
+The Docker container exposes the following ports:
+- PostgreSQL: 5432
+- MySQL: 3306
+
+You can connect to the databases using these credentials:
+- PostgreSQL:
+  - Host: localhost
+  - Port: 5432
+  - User: lematerial
+  - Password: lematerial
+  - Database: lematerial
+
+- MySQL:
+  - Host: localhost
+  - Port: 3306
+  - User: lematerial
+  - Password: lematerial
+  - Database: lematerial
+
+### Environment Variables
+
+You can customize the setup using environment variables:
+```bash
+docker run -it \
+  -e LEMATERIALFETCHER_DB_PASSWORD=your_password \
+  -e LEMATERIALFETCHER_MYSQL_PASSWORD=your_mysql_password \
+  -e LEMATERIALFETCHER_HF_TOKEN=your_huggingface_token \
+  lematerial-fetcher
+```
+
+Note: the bundled PostgreSQL and MySQL users are created at image build time with the password `lematerial`; overriding the password variables does not change those database passwords.
diff --git a/lemat-traj.sh b/lemat-traj.sh
new file mode 100755
index 0000000..f1f1bfb
--- /dev/null
+++ b/lemat-traj.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -e
+
+echo "WARNING: Make sure that your .env file is not interfering with the Docker's environment variables."
+
+# Start PostgreSQL and MariaDB services
+echo "Starting PostgreSQL service..."
+service postgresql start
+
+echo "Starting MariaDB service..."
+service mariadb start
+
+echo "Starting the full pipeline..."
+source /app/.venv/bin/activate
+
+# Materials Project Pipeline
+echo "Fetching Materials Project structures..."
+lematerial-fetcher mp fetch --table-name mp_structures --num-workers 12
+
+echo "Fetching Materials Project tasks..."
+lematerial-fetcher mp fetch --tasks --table-name mp_tasks --num-workers 12
+
+echo "Transforming Materials Project data..."
+lematerial-fetcher mp transform --traj --task-source-table-name mp_tasks --table-name mp_structures --dest-table-name transformed_mp_structures --num-workers 12 --batch-size 1000 --db-fetch-batch-size 10
+
+# UNCOMMENT FOR ALEXANDRIA
+
+# # Alexandria Pipeline
+# echo "Fetching Alexandria trajectories..."
+# lematerial-fetcher alexandria fetch --traj --table-name alex_structures --functional pbe --num-workers 10
+# lematerial-fetcher alexandria fetch --traj --table-name alex_structures --functional pbesol --num-workers 10
+
+# echo "Transforming Alexandria data..."
+# lematerial-fetcher alexandria transform --traj --table-name alex_structures --dest-table-name transformed_alex_structures --num-workers 10 --batch-size 1000 --db-fetch-batch-size 10
+
+# UNCOMMENT FOR OQMD
+
+# # OQMD Pipeline
+# echo "Fetching OQMD structures..."
+# lematerial-fetcher oqmd fetch --table-name oqmd_structures
+
+# echo "Transforming OQMD data..."
+# lematerial-fetcher oqmd transform --traj --table-name oqmd_structures --dest-table-name transformed_oqmd_structures --num-workers 10 --batch-size 1000 --db-fetch-batch-size 10
+
+# Push to Hugging Face (if LEMATERIALFETCHER_HF_TOKEN is set)
+if [ -n "$LEMATERIALFETCHER_HF_TOKEN" ]; then
+    echo "Pushing data to Hugging Face..."
+    # Tables to push. They are collected in an array because a comment placed
+    # between the continuation lines of a command terminates that command.
+    push_tables=(
+        --table-name transformed_mp_structures
+        # UNCOMMENT FOR ALEXANDRIA and OQMD
+        # --table-name transformed_alex_structures
+        # --table-name transformed_oqmd_structures
+    )
+    lematerial-fetcher push "${push_tables[@]}" \
+        --hf-repo-id LeMat-Traj \
+        --chunk-size 1000000 \
+        --num-workers 12 \
+        --hf-token "$LEMATERIALFETCHER_HF_TOKEN"
+else
+    echo "Skipping Hugging Face push (HF_TOKEN not set)"
+fi
+
+echo "Pipeline completed successfully!"
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..960dece
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env sh
+
+# FROM: https://github.com/astral-sh/uv-docker-example/blob/main/run.sh
+
+# --rm                    Remove the container after exiting
+# -p 5432:5432            Publish container port 5432 (PostgreSQL) on the host
+# -p 3306:3306            Publish container port 3306 (MySQL) on the host
+# --volume .:/app         Mount the current directory to `/app` so code changes don't require an image rebuild
+# --volume /app/.venv     Mount the virtual environment separately, so the developer's environment doesn't end up in the container
+# $INTERACTIVE            `-it` when stdout is a terminal, empty otherwise
+# --name lematerial       Give the container a fixed name
+# $IMAGE                  The image built from the current directory, used as the run target
+# $@                      Pass any arguments to the container
+
+if [ -t 1 ]; then
+    INTERACTIVE="-it"
+else
+    INTERACTIVE=""
+fi
+
+# Build first so a failed build aborts here instead of calling `docker run` without an image
+IMAGE="$(docker build -q .)" || exit 1
+
+docker run \
+    --rm \
+    -p 5432:5432 \
+    -p 3306:3306 \
+    --volume .:/app \
+    --volume /app/.venv \
+    $INTERACTIVE \
+    --name lematerial \
+    "$IMAGE" \
+    "$@"
diff --git a/src/lematerial_fetcher/fetcher/mp/fetch.py b/src/lematerial_fetcher/fetcher/mp/fetch.py
index 421d9c5..7967855 100644
--- a/src/lematerial_fetcher/fetcher/mp/fetch.py
+++ b/src/lematerial_fetcher/fetcher/mp/fetch.py
@@ -66,7 +66,6 @@ def get_items_to_process(self) -> ItemsInfo:
             self.config.mp_bucket_name == "materialsproject-build"
             and self.config.mp_bucket_prefix in ["collections", "collections/"]
         ):
-            breakpoint()
             prefix = get_latest_collection_version_prefix(
                 self.aws_client,
                 self.config.mp_bucket_name,