Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
470a98b
Fix Docker permission issues comprehensively
jplfaria Jul 1, 2025
f5793fa
Fix Docker permission issues by removing named volumes
jplfaria Jul 1, 2025
506a99e
Fix Docker syntax error and clean up test files
jplfaria Jul 1, 2025
a4c3995
Fix ontology source file path in .env
jplfaria Jul 1, 2025
fc1d4df
Add ability to skip non-core ontology analysis
jplfaria Jul 1, 2025
0eea2df
Fix ROBOT memory allocation issue
jplfaria Jul 1, 2025
ee438fc
Fix handling of in-house ontologies in analyze_core_ontologies.py
jplfaria Jul 1, 2025
08cec45
Fix handling of compressed (.gz) ontology files
jplfaria Jul 1, 2025
cd63794
Fix GitHub Actions test-small-dataset job
jplfaria Jul 1, 2025
8433f4f
Add symbolic links for test configuration files
jplfaria Jul 1, 2025
8277be0
Fix memory_monitor.py TypeError when process cmdline is None
jplfaria Jul 1, 2025
b696dfb
Add comprehensive unit test suite for CDM Ontologies Pipeline
jplfaria Jul 1, 2025
fe2782d
Clean up repository structure and fix symlinks
jplfaria Jul 1, 2025
f92f3f6
Fix unit tests to match actual module functions
jplfaria Jul 1, 2025
2af2ed1
Fix remaining unit test issues
jplfaria Jul 1, 2025
5f764c9
Fix final unit test issues
jplfaria Jul 1, 2025
bff1618
Fix remaining test failures: correct mock return values and expected …
jplfaria Jul 1, 2025
0ebb5f4
Fix integration test: mock shutil.which for ROBOT and remove unused m…
jplfaria Jul 1, 2025
de09f32
Fix integration test: properly mock file existence check for semsql
jplfaria Jul 1, 2025
cf74ff1
Fix integration test: remove core_ontologies_analysis.json assertion
jplfaria Jul 1, 2025
1b6504d
Fix integration test: remove file existence assertions, just check wo…
jplfaria Jul 1, 2025
b0a50a8
Fix Docker permissions and SemsQL memory settings
jplfaria Jul 1, 2025
5a10f66
Fix ROBOT memory override issue in merge_ontologies
jplfaria Jul 1, 2025
f7ea24b
Enhance memory monitoring clarity with process-specific breakdowns
jplfaria Jul 2, 2025
e6e9c98
Fix memory monitor tests for enhanced logging
jplfaria Jul 2, 2025
3f8cf16
Implement timestamped output directories and improved logging
jplfaria Jul 4, 2025
5567e78
Update documentation across repository
jplfaria Jul 4, 2025
f6a79e6
Fix empty directories created in outputs folder
jplfaria Jul 4, 2025
adb9ccf
Remove unnecessary directory creation from docker-compose.yml
jplfaria Jul 4, 2025
0e44fd4
Fix memory monitor to properly track semsql process
jplfaria Jul 4, 2025
8c114ef
fix: reduce memory monitor verbosity
jplfaria Jul 4, 2025
407beba
feat: add custom SemsQL prefixes for SEED, EC, and geonames
jplfaria Jul 7, 2025
92c676d
feat: improve ontology download behavior with smart version tracking
jplfaria Jul 7, 2025
2384bb1
fix: update tests for new download behavior
jplfaria Jul 7, 2025
6091882
Implement comprehensive run summary feature for workflow tracking
jplfaria Jul 7, 2025
62de21b
Update documentation and add tests for run summary feature
jplfaria Jul 7, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Dataset Configuration
DATASET_SIZE=large
ONTOLOGIES_SOURCE_FILE=ontologies_source.txt
ONTOLOGIES_SOURCE_FILE=config/ontologies_source.txt

# Memory Configuration (1TB standard for all tools and environments)
ROBOT_JAVA_ARGS=-Xmx1024g -XX:MaxMetaspaceSize=8g -XX:+UseG1GC
Expand All @@ -20,4 +20,7 @@ ENABLE_MEMORY_MONITORING=true
# Output Configuration
ENABLE_TSV_EXPORT=true
ENABLE_PARQUET_EXPORT=true
CREATE_UTILS_LOGS=true
CREATE_UTILS_LOGS=true

# Workflow Control
SKIP_NON_CORE_ANALYSIS=true
29 changes: 23 additions & 6 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ env:
IMAGE_NAME: ${{ github.repository }}

jobs:
test:
unit-tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
Expand All @@ -26,22 +26,26 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r requirements.txt
pip install pytest pytest-cov flake8
pip install pytest pytest-cov pytest-mock flake8

- name: Lint with flake8
run: |
# Stop the build if there are Python syntax errors or undefined names
flake8 scripts --count --select=E9,F63,F7,F82 --show-source --statistics
flake8 scripts cdm_ontologies --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings
flake8 scripts --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
flake8 scripts cdm_ontologies --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics

- name: Run unit tests
run: |
pytest tests/ -v --cov=scripts --cov=cdm_ontologies --cov-report=term-missing

- name: Test CLI
run: |
python -m cdm_ontologies --help
python -m cdm_ontologies analyze-core --help

build-and-push:
needs: test
needs: unit-tests
runs-on: ubuntu-latest
permissions:
contents: read
Expand Down Expand Up @@ -107,10 +111,23 @@ jobs:
- name: Run Docker test pipeline
run: |
docker run --rm \
--user root \
-v ${{ github.workspace }}:/home/ontology/workspace \
-w /home/ontology/workspace \
-e DATASET_SIZE=test \
-e ONTOLOGIES_SOURCE_FILE=config/ontologies_source_test.txt \
-e ROBOT_JAVA_ARGS="-Xmx8g -XX:MaxMetaspaceSize=1g -XX:+UseG1GC" \
-e RELATION_GRAPH_JAVA_ARGS="-Xmx8g -XX:MaxMetaspaceSize=1g -XX:+UseG1GC" \
-e SEMSQL_MEMORY_LIMIT=8g \
-e PYTHON_MEMORY_LIMIT=8g \
-e JAVA_PARALLEL_GC=true \
-e ENABLE_MEMORY_MONITORING=true \
-e ENABLE_TSV_EXPORT=true \
-e ENABLE_PARQUET_EXPORT=true \
-e CREATE_UTILS_LOGS=true \
-e SKIP_RESOURCE_CHECK=true \
-e SKIP_NON_CORE_ANALYSIS=true \
-e PYTHONPATH=scripts \
-e HOST_UID=1001 \
-e HOST_GID=1001 \
${{ env.REGISTRY }}/${{ steps.lowercase-test.outputs.IMAGE_NAME_LOWER }}:${{ steps.lowercase-test.outputs.DOCKER_TAG }} \
python -m cdm_ontologies.cli run-all
32 changes: 26 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,21 +1,35 @@
# Ontology data directories (populated during pipeline runs)
ontology_data_owl/
!ontology_data_owl_test/ # Keep test data for demonstration
# Keep example test data for documentation
!ontology_data_owl_test/
ontology_data_owl_test/*
!ontology_data_owl_test/*.owl
!ontology_data_owl_test/non-base-ontologies/
!ontology_data_owl_test/non-base-ontologies/*.owl

# Output directories
outputs/
!outputs_test/ # Keep test outputs for demonstration
# Keep example test output for documentation
!outputs_test/
outputs_test/*
!outputs_test/run_20250704_001300/
# Symlinks to latest runs
outputs/latest
outputs_test/latest

# Claude guidance file (user-specific)
CLAUDE.md

# Original scripts directory (archived)
og_scripts/

# Archived test files from semsql_custom_prefixes
semsql_custom_prefixes/archived/

# Large ontology files
ontology_versions/backups/*.owl
ontology_versions_test/
*.owl
!*_test.owl

# Large compressed files
*.gz
Expand Down Expand Up @@ -67,13 +81,19 @@ wheels/
*.egg

# Logs (but keep important logs like download_history.log)
logs/*.log
logs/nohup_cdm_prod.out
logs/nohup_cdm_test.out
!logs/
logs/*
!logs/cdm_ontologies_test_20250704_001300.log
!logs/nohup_cdm_test_20250704_001300.out
!logs/.gitkeep
# Global log exclusion (but allow specific ones above)
*.log
!logs/cdm_ontologies_test_20250704_001300.log
!ontology_versions*/download_history.log

# Cache directory
.cache/

# Virtual environments
venv/
ENV/
Expand Down
27 changes: 17 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ RUN apt-get update && apt-get install -y \
# Utilities
vim \
htop \
# For dynamic user switching
gosu \
&& rm -rf /var/lib/apt/lists/*

# Set Python3 as default
Expand Down Expand Up @@ -65,28 +67,33 @@ RUN cargo install --git https://github.com/ontodev/rdftab.rs --root /home/ontolo

# Set up environment paths
ENV PATH="/home/ontology/tools/bin:/home/ontology/tools:/home/ontology/tools/relation-graph/bin:${PATH}"
ENV ROBOT_JAVA_ARGS="-Xmx8g"
ENV _JAVA_OPTIONS="-Xmx8g"
# Memory settings are provided by .env file, not hardcoded here

# Copy requirements and install Python dependencies globally as root
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt && rm /tmp/requirements.txt

# Copy permission fix script
COPY --chmod=755 fix-permissions.sh /usr/local/bin/fix-permissions.sh
# Copy and append custom prefixes to SemsQL
#
# The rows of semsql_custom_prefixes/custom_prefixes.csv are appended to the
# prefixes.csv shipped inside the installed `semsql` Python package.
#   - The target path is derived from the package's install directory.
#   - `test -f` aborts the build if that file is missing; without it `>>`
#     could silently create a new, unused file and the build would still pass.
#   - A newline is added first when the existing file does not end with one,
#     so the first custom row is never glued onto the last built-in row.
#   - `tail -n +2` skips the first line of the custom CSV
#     (presumably its header row -- confirm against custom_prefixes.csv).
COPY semsql_custom_prefixes/custom_prefixes.csv /tmp/custom_prefixes.csv
RUN SEMSQL_PREFIX_PATH="$(python3 -c "import os, semsql; print(os.path.join(os.path.dirname(semsql.__file__), 'builder', 'prefixes', 'prefixes.csv'))")" && \
    test -f "${SEMSQL_PREFIX_PATH}" && \
    if [ -n "$(tail -c 1 "${SEMSQL_PREFIX_PATH}")" ]; then echo >> "${SEMSQL_PREFIX_PATH}"; fi && \
    tail -n +2 /tmp/custom_prefixes.csv >> "${SEMSQL_PREFIX_PATH}" && \
    echo "✅ Appended $(tail -n +2 /tmp/custom_prefixes.csv | wc -l) custom prefixes to SemsQL" && \
    rm /tmp/custom_prefixes.csv

# Switch to dynamic user
USER ${USER_ID}:${GROUP_ID}
WORKDIR /home/ontology/workspace
# Copy entrypoint script
COPY --chmod=755 docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh

# Copy the application code with proper ownership
COPY --chown=${USER_ID}:${GROUP_ID} . .

# Set environment for permission fixes
ENV HOST_UID=${USER_ID}
ENV HOST_GID=${GROUP_ID}
# Set working directory
WORKDIR /home/ontology/workspace

# Note: Output directories are created by host mount and script logic

# Use entrypoint for dynamic user handling
# Exec-form ENTRYPOINT: every container start runs docker-entrypoint.sh, and
# the CMD below (or the command given to `docker run`) is passed to that
# script as its arguments.
# NOTE(review): the script itself is not visible here; presumably it uses
# HOST_UID/HOST_GID and gosu to drop privileges -- confirm in docker-entrypoint.sh.
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]

# Default command
# Used only when no command is supplied at `docker run`; it asks the
# cdm_ontologies module for its --help output.
CMD ["python", "-m", "cdm_ontologies", "--help"]
77 changes: 44 additions & 33 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,45 +43,45 @@ install:

# Run the complete workflow
.PHONY: run-workflow
run-workflow: setup
run-workflow:
@echo "Starting CDM Ontologies workflow..."
@echo "Dataset size: $(DATASET_SIZE)"
@echo "Java memory: $(ROBOT_JAVA_ARGS)"
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli run-all
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) $(SCRIPTS_DIR)/workflow_wrapper.py

# Run individual workflow steps
.PHONY: analyze-core
analyze-core: setup
analyze-core:
@echo "Analyzing core ontologies..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli analyze-core

.PHONY: analyze-non-core
analyze-non-core: setup
analyze-non-core:
@echo "Analyzing non-core ontologies..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli analyze-non-core

.PHONY: create-base
create-base: setup
create-base:
@echo "Creating pseudo-base ontologies..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli create-base

.PHONY: merge
merge: setup
merge:
@echo "Merging ontologies..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli merge

.PHONY: create-db
create-db: setup
create-db:
@echo "Creating semantic SQL database..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli create-db

.PHONY: extract-tables
extract-tables: setup
extract-tables:
@echo "Extracting SQL tables to TSV..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli extract-tables

.PHONY: create-parquet
create-parquet: setup
create-parquet:
@echo "Creating Parquet files..."
@PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli create-parquet

Expand All @@ -92,6 +92,14 @@ clean:
@rm -rf outputs/*
@rm -rf logs/*

# Clean Docker volumes and containers
# Best-effort teardown: both docker commands below discard stderr and are
# followed by `|| true`, so this target still succeeds when there is nothing
# to remove.
.PHONY: docker-clean
docker-clean:
@echo "Cleaning Docker volumes and containers..."
# `down -v` removes the Compose containers together with their volumes.
@docker compose down -v 2>/dev/null || true
# Explicitly remove two volumes by their full names.
# NOTE(review): the `kbase_cdm_ontologies_` prefix looks like a Compose
# project name -- confirm it matches the project name actually in use.
@docker volume rm kbase_cdm_ontologies_cdm-outputs kbase_cdm_ontologies_cdm-cache 2>/dev/null || true
@echo "Docker cleanup complete"

# Clean everything including downloaded ontologies
.PHONY: clean-all
clean-all: clean
Expand Down Expand Up @@ -136,9 +144,9 @@ test-create-parquet: setup
@ONTOLOGIES_SOURCE_FILE=config/ontologies_source_test.txt PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli create-parquet

.PHONY: test-workflow
test-workflow: setup
test-workflow:
@echo "Testing complete workflow with test dataset..."
@ONTOLOGIES_SOURCE_FILE=config/ontologies_source_test.txt PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) -m cdm_ontologies.cli run-all
@ONTOLOGIES_SOURCE_FILE=config/ontologies_source_test.txt PYTHONPATH=$(SCRIPTS_DIR):$(PYTHONPATH) $(PYTHON) $(SCRIPTS_DIR)/workflow_wrapper.py

# Docker targets
.PHONY: docker-build
Expand All @@ -149,58 +157,60 @@ docker-build:
.PHONY: docker-run-production
docker-run-production: docker-build
@echo "Running pipeline with production dataset..."
@mkdir -p outputs .cache
@ENV_FILE=.env UID=$(UID) GID=$(GID) docker compose run --rm cdm-ontologies
@echo "Fixing any permission issues..."
@docker run --rm -v "$(PWD):/workspace" --user root alpine:latest sh -c "chown -R $(UID):$(GID) /workspace/outputs /workspace/ontology_data_owl /workspace/logs 2>/dev/null || true"

.PHONY: docker-run-prod
docker-run-prod: docker-build
@echo "Running pipeline with production dataset (30+ ontologies)..."
@mkdir -p outputs logs .cache
@ENV_FILE=.env UID=$(UID) GID=$(GID) docker compose run --rm cdm-ontologies
@echo "Fixing any permission issues..."
@docker run --rm -v "$(PWD):/workspace" --user root alpine:latest sh -c "chown -R $(UID):$(GID) /workspace/outputs /workspace/ontology_data_owl /workspace/logs 2>/dev/null || true"

.PHONY: docker-run-prod-nohup
docker-run-prod-nohup: docker-build
@echo "Starting production pipeline in background with nohup..."
@mkdir -p logs
@nohup bash -c 'ENV_FILE=.env UID=$(UID) GID=$(GID) docker compose run --rm cdm-ontologies && \
docker run --rm -v "$(PWD):/workspace" --user root alpine:latest sh -c "chown -R $(UID):$(GID) /workspace/outputs /workspace/ontology_data_owl /workspace/logs 2>/dev/null || true"' \
> logs/nohup_cdm_prod.out 2>&1 &
@echo "Production pipeline started in background. PID: $$!"
@echo "Monitor progress with: make docker-prod-status"
@echo "Or check the log file: tail -f logs/nohup_cdm_prod.out"
@mkdir -p logs outputs .cache
@TIMESTAMP=$$(date +%Y%m%d_%H%M%S); \
nohup bash -c 'ENV_FILE=.env UID=$(UID) GID=$(GID) WORKFLOW_TIMESTAMP='"$$TIMESTAMP"' docker compose run --rm cdm-ontologies' \
> logs/nohup_cdm_prod_$$TIMESTAMP.out 2>&1 & \
echo "Production pipeline started in background. PID: $$!"; \
echo "Monitor progress with: make docker-prod-status"; \
echo "Or check the log file: tail -f logs/nohup_cdm_prod_$$TIMESTAMP.out"

.PHONY: docker-prod-status
docker-prod-status:
@if [ -f logs/nohup_cdm_prod.out ]; then \
tail -f logs/nohup_cdm_prod.out; \
@LATEST_LOG=$$(ls -t logs/nohup_cdm_prod_*.out 2>/dev/null | head -1); \
if [ -n "$$LATEST_LOG" ]; then \
echo "Following latest production log: $$LATEST_LOG"; \
tail -f $$LATEST_LOG; \
else \
echo "No production run log found. Start with: make docker-run-prod-nohup"; \
fi

.PHONY: docker-test
docker-test: docker-build
@echo "Running pipeline with test dataset in Docker..."
@mkdir -p outputs_test logs .cache
@ENV_FILE=.env.test UID=$(UID) GID=$(GID) docker compose run --rm cdm-ontologies make test-workflow
@echo "Fixing any permission issues..."
@docker run --rm -v "$(PWD):/workspace" --user root alpine:latest sh -c "chown -R $(UID):$(GID) /workspace/outputs_test /workspace/ontology_data_owl_test /workspace/logs 2>/dev/null || true"

.PHONY: docker-test-nohup
docker-test-nohup: docker-build
@echo "Starting test pipeline in background with nohup..."
@mkdir -p logs
@nohup bash -c 'ENV_FILE=.env.test UID=$(UID) GID=$(GID) docker compose run --rm cdm-ontologies make test-workflow && \
@TIMESTAMP=$$(date +%Y%m%d_%H%M%S); \
nohup bash -c 'ENV_FILE=.env.test UID=$(UID) GID=$(GID) WORKFLOW_TIMESTAMP='"$$TIMESTAMP"' docker compose run --rm cdm-ontologies make test-workflow && \
docker run --rm -v "$(PWD):/workspace" --user root alpine:latest sh -c "chown -R $(UID):$(GID) /workspace/outputs_test /workspace/ontology_data_owl_test /workspace/logs 2>/dev/null || true"' \
> logs/nohup_cdm_test.out 2>&1 &
@echo "Test pipeline started in background. PID: $$!"
@echo "Monitor progress with: make docker-test-status"
@echo "Or check the log file: tail -f logs/nohup_cdm_test.out"
> logs/nohup_cdm_test_$$TIMESTAMP.out 2>&1 & \
echo "Test pipeline started in background. PID: $$!"; \
echo "Monitor progress with: make docker-test-status"; \
echo "Or check the log file: tail -f logs/nohup_cdm_test_$$TIMESTAMP.out"

.PHONY: docker-test-status
docker-test-status:
@if [ -f logs/nohup_cdm_test.out ]; then \
tail -f logs/nohup_cdm_test.out; \
@LATEST_LOG=$$(ls -t logs/nohup_cdm_test_*.out 2>/dev/null | head -1); \
if [ -n "$$LATEST_LOG" ]; then \
echo "Following latest test log: $$LATEST_LOG"; \
tail -f $$LATEST_LOG; \
else \
echo "No test run log found. Start with: make docker-test-nohup"; \
fi
Expand Down Expand Up @@ -245,6 +255,7 @@ help:
@echo " make docker-run-prod - Run with production dataset (30+ ontologies)"
@echo " make docker-run-prod-nohup - Run production in background with nohup"
@echo " make docker-prod-status - Monitor production run progress"
@echo " make docker-clean - Clean Docker volumes and containers"
@echo ""
@echo "Environment variables:"
@echo " ENV_FILE=.env - Use production dataset configuration (default)"
Expand Down
Loading