From 9b317e7686a8a509725ceb43b74a6faefa7c6b6f Mon Sep 17 00:00:00 2001 From: liefeld Date: Tue, 19 May 2026 08:27:38 -0700 Subject: [PATCH 1/3] added disk cleanup tasks and corrected the instructions in the readme --- README.md | 7 +- caper/caper/apps.py | 11 +++ caper/caper/background_tasks.py | 156 +++++++++++++++++++++++++++++--- 3 files changed, 158 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c72c660..08113bb 100755 --- a/README.md +++ b/README.md @@ -440,9 +440,8 @@ The server is currently running on an EC2 instance through Docker. The ports act > `git fetch` > `git pull` > `git checkout tags/` -- if any static (css, js, images) have changed, run sync_static_to_s3.sh to copy the files into the S3 bucket -- Restart the server -> `./stop-server.sh` -> `./start-server.sh` +- Restart the server - use the same script as the nightly cron to stop and restart the docker container. During server startup it automatically syncs static files to S3 +> `/home/ubuntu/stop-and-start-repo.sh` + diff --git a/caper/caper/apps.py b/caper/caper/apps.py index 69be6b1..d96c8eb 100644 --- a/caper/caper/apps.py +++ b/caper/caper/apps.py @@ -45,6 +45,17 @@ def ready(self): except Exception as e: logger.warning(f"Could not determine AmpliconSuiteAggregator version: {e}") + # Remove orphaned temp dirs left by any previous run that crashed or + # was killed before it could clean up after itself. + try: + from .background_tasks import cleanup_stale_temp_dirs + # Use a short min-age (5 min) at startup: no tasks are running yet, + # so anything older than a few minutes is safely orphaned. 
+ removed = cleanup_stale_temp_dirs(min_age_seconds=300) + logger.info(f"Startup temp dir cleanup: removed {removed} orphaned dir(s)") + except Exception as e: + logger.warning(f"Startup temp dir cleanup failed: {e}") + # Start the S3 sync in a background thread sync_thread = threading.Thread(target=self.sync_static_to_s3, daemon=True) sync_thread.start() diff --git a/caper/caper/background_tasks.py b/caper/caper/background_tasks.py index 525d204..97a58d2 100644 --- a/caper/caper/background_tasks.py +++ b/caper/caper/background_tasks.py @@ -10,6 +10,8 @@ import uuid import datetime import os +import shutil +import time from concurrent.futures import ThreadPoolExecutor @@ -32,6 +34,87 @@ def _get_tasks_collection(): # How long before a 'running' task is considered stale/dead (seconds). _STALE_THRESHOLD_SECONDS = 20 * 60 # 20 minutes +# Root directory where project temp dirs are created. +_DEFAULT_TMP_ROOT = os.getenv('CAPER_TMP_ROOT', './tmp') + +# Periodic cleanup interval: every 6 hours. +_CLEANUP_INTERVAL_SECONDS = 6 * 60 * 60 + +# Minimum dir age before the cleanup daemon will touch it. +# 3× the stale threshold (60 min) gives active tasks a generous safety buffer. +_CLEANUP_MIN_AGE_SECONDS = _STALE_THRESHOLD_SECONDS * 3 + + +def _remove_temp_dir(path: str) -> None: + """Remove a temp directory if it exists, logging the outcome.""" + if not path: + return + try: + if os.path.exists(path): + shutil.rmtree(path, ignore_errors=True) + logging.info(f"Removed temp directory: {path}") + except Exception: + logging.exception(f"Failed to remove temp directory: {path}") + + +def cleanup_stale_temp_dirs( + tmp_root: str = _DEFAULT_TMP_ROOT, + min_age_seconds: int = _CLEANUP_MIN_AGE_SECONDS, +) -> int: + """ + Remove directories in tmp_root that have no corresponding running task in + MongoDB and whose mtime is older than min_age_seconds. 
+ + The mtime-based age check provides a secondary safety net: a directory + that an active aggregation is still writing to will have a recent mtime + and will not be removed even if its MongoDB record is temporarily invisible. + + Returns the number of directories removed. + """ + tmp_root = os.path.abspath(tmp_root) + if not os.path.isdir(tmp_root): + return 0 + + # Collect the absolute temp_dir paths of all currently-running tasks. + active_temp_dirs: set[str] = set() + try: + col = _get_tasks_collection() + for doc in col.find({'state': 'running', 'temp_dir': {'$exists': True, '$ne': None}}): + td = doc.get('temp_dir') + if td: + active_temp_dirs.add(os.path.abspath(td)) + except Exception: + logging.exception("cleanup_stale_temp_dirs: failed to query MongoDB — skipping cleanup") + return 0 + + cutoff = time.time() - min_age_seconds + removed = 0 + + try: + with os.scandir(tmp_root) as entries: + for entry in entries: + if not entry.is_dir(follow_symlinks=False): + continue + try: + if entry.stat(follow_symlinks=False).st_mtime > cutoff: + continue # Too recent — still potentially active + abs_path = os.path.abspath(entry.path) + if abs_path in active_temp_dirs: + continue # Claimed by a running task + shutil.rmtree(abs_path, ignore_errors=True) + logging.info(f"cleanup_stale_temp_dirs: removed orphaned temp dir {abs_path}") + removed += 1 + except Exception: + logging.exception(f"cleanup_stale_temp_dirs: error processing {entry.path}") + except OSError: + logging.exception(f"cleanup_stale_temp_dirs: cannot scan {tmp_root}") + + if removed: + logging.info(f"cleanup_stale_temp_dirs: removed {removed} orphaned dir(s) from {tmp_root}") + else: + logging.debug(f"cleanup_stale_temp_dirs: no orphaned dirs found in {tmp_root}") + return removed + class BackgroundTaskTracker: """ @@ -40,13 +123,22 @@ class BackgroundTaskTracker: the same status. 
""" - def __init__(self, max_workers: int = 4, thread_name_prefix: str = 'caper_worker'): + def __init__( + self, + max_workers: int = 4, + thread_name_prefix: str = 'caper_worker', + cleanup_interval_seconds: int = _CLEANUP_INTERVAL_SECONDS, + tmp_root: str = _DEFAULT_TMP_ROOT, + ): self._executor = ThreadPoolExecutor( max_workers=max_workers, thread_name_prefix=thread_name_prefix, ) self._max_workers = max_workers self._col = None # lazy + self._cleanup_interval = cleanup_interval_seconds + self._tmp_root = tmp_root + self._start_cleanup_daemon() def _collection(self): if self._col is None: @@ -62,36 +154,56 @@ def _collection(self): # Public interface # ------------------------------------------------------------------ - def submit(self, fn, *args, task_label: str = None, **kwargs): - """Submit *fn* to the thread pool and record it in MongoDB.""" + def submit(self, fn, *args, task_label: str = None, temp_dir: str = None, **kwargs): + """Submit *fn* to the thread pool and record it in MongoDB. + + temp_dir: if provided, its absolute path is stored in the task record + so the periodic cleanup daemon can skip directories that belong to + running tasks. The directory is also removed in a finally block after + fn returns, catching the success path which the existing code does not + clean up. 
+ """ if task_label is None: task_label = getattr(fn, '__name__', str(fn)) task_id = uuid.uuid4().hex now = datetime.datetime.utcnow() + doc = { + '_id': task_id, + 'label': task_label, + 'state': 'running', + 'started_at': now.isoformat(timespec='seconds'), + 'updated_at': now, + 'worker_pid': os.getpid(), + } + if temp_dir is not None: + doc['temp_dir'] = os.path.abspath(temp_dir) + # Write to MongoDB first so it's visible immediately col = self._collection() if col is not None: try: - col.insert_one({ - '_id': task_id, - 'label': task_label, - 'state': 'running', - 'started_at': now.isoformat(timespec='seconds'), - 'updated_at': now, - 'worker_pid': os.getpid(), - }) + col.insert_one(doc) except Exception: logging.exception("Failed to insert background task record") + abs_temp_dir = os.path.abspath(temp_dir) if temp_dir is not None else None + def _wrapped(): try: fn(*args, **kwargs) self._mark_task(task_id, 'completed') - except Exception as exc: + except Exception: self._mark_task(task_id, 'failed') raise + finally: + # Belt-and-suspenders: the success path in + # _process_and_aggregate_files does not call rmtree, so this + # finally block is the primary cleanup for the happy path. + # Error paths that already called rmtree are harmless repeats. 
+ if abs_temp_dir is not None: + _remove_temp_dir(abs_temp_dir) future = self._executor.submit(_wrapped) return future @@ -163,6 +275,26 @@ def _mark_task(self, task_id: str, state: str): except Exception: logging.exception(f"Failed to mark background task {task_id} as {state}") + def _start_cleanup_daemon(self): + """Start a daemon thread that sweeps tmp_root every cleanup_interval seconds.""" + interval = self._cleanup_interval + tmp_root = self._tmp_root + + def _loop(): + while True: + time.sleep(interval) + try: + cleanup_stale_temp_dirs(tmp_root) + except Exception: + logging.exception("Periodic temp dir cleanup failed") + + t = threading.Thread(target=_loop, daemon=True, name='caper_tmp_cleanup') + t.start() + logging.info( + f"Started temp dir cleanup daemon " + f"(interval={interval}s, root={os.path.abspath(tmp_root)})" + ) + # --------------------------------------------------------------------------- # Module-level singleton – import this in views.py and views_apis.py From 455274ef3532f7700ca13e8902e93fd42886f37e Mon Sep 17 00:00:00 2001 From: liefeld Date: Tue, 19 May 2026 08:36:22 -0700 Subject: [PATCH 2/3] adding section on manual disk space cleanup, though it should not be necessary if the automated systems are doing their thing --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 08113bb..accdb5b 100755 --- a/README.md +++ b/README.md @@ -443,5 +443,30 @@ The server is currently running on an EC2 instance through Docker. The ports act - Restart the server - use the same script as the nightly cron to stop and restart the docker container. During server startup it automatically syncs static files to S3 > `/home/ubuntu/stop-and-start-repo.sh` +## 4. Manual disk space cleanup + +The server writes temporary working directories under `./tmp/` (relative to the project root) during project creation and editing. 
These are cleaned up automatically by the server, but if disk space runs low you can reclaim it manually. **Always check that no project creation or editing is in progress before deleting anything.** First, SSH into the EC2 instance and check for active background tasks: + +```bash +cd /home/ubuntu/AmpliconRepository-prod/caper +python manage.py shell -c \ + "from caper.background_tasks import get_background_task_status; import json; print(json.dumps(get_background_task_status(), indent=2))" +``` + +If `active_count` is 0, it is safe to remove all temp directories. To see how much space they are using and then remove them: + +```bash +du -sh ./tmp/*/ # check sizes +rm -rf ./tmp/*/ # remove all temp dirs +``` + +If you want to be cautious and leave any directories that may belong to a recently-started task (for example, if you are unsure whether `active_count` reflects all workers), limit deletion to directories that have not been written to in at least two hours: + +```bash +find ./tmp -mindepth 1 -maxdepth 1 -type d -mmin +120 -exec rm -rf {} + +``` + +Note that restarting the server (`/home/ubuntu/stop-and-start-repo.sh`) also triggers an automatic cleanup of any orphaned temp directories left by previous runs. + From ceeb048fdceb9abad23ffce0d001c920e42cbb06 Mon Sep 17 00:00:00 2001 From: liefeld Date: Tue, 19 May 2026 09:09:09 -0700 Subject: [PATCH 3/3] adding section on AWS setup --- README.md | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/README.md b/README.md index accdb5b..91cf083 100755 --- a/README.md +++ b/README.md @@ -399,7 +399,105 @@ The server is currently running on an EC2 instance through Docker. The ports act **Note:** While we provide a Dockerfile, local deployment of the site using the docker will only properly work on AWS. Local deployment should be done with a local install using the steps above. +## 0. 
Setting up a new server on AWS EC2 +This section covers provisioning a fresh EC2 instance. The app container is built on the host and is not published to any registry — it must be built locally. Source code is mounted into the running container at runtime, so routine code updates do not require a container rebuild; only changes to `requirements.txt` or the `Dockerfile` do. + +### i. Provision the EC2 instance + +- **AMI**: Ubuntu Server 22.04 LTS +- **Instance type**: choose based on expected load (t3.medium or larger recommended) +- **Security group**: open inbound ports for SSH (22), HTTP (80), HTTPS (443), and the app port defined by `AMPLICON_ENV_PORT` in `config.sh`; Neo4j ports 7474 and 7687 should be restricted to internal access only +- After the instance is running, register it with the appropriate AWS Application Load Balancer target group: + - Production: `ampliconrepo-https` + - Dev: `dev-ampliconrepository-org` + +### ii. Install Docker + +SSH into the new instance and install Docker Engine: + +```bash +sudo apt-get update +sudo apt-get install -y ca-certificates curl gnupg +sudo install -m 0755 -d /etc/apt/keyrings +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg +sudo chmod a+r /etc/apt/keyrings/docker.gpg +echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \ + https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" \ + | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null +sudo apt-get update +sudo apt-get install -y docker-ce docker-ce-cli containerd.io +sudo usermod -aG docker ubuntu +# Log out and back in for the group membership to take effect +``` + +### iii. 
Create the two working checkouts + +Two separate git checkouts are maintained to allow container rebuilds without interrupting a running server: + +- **Running server**: `/home/ubuntu/AmpliconRepository-prod` (or `/home/ubuntu/AmpliconRepository-dev` for the dev environment) — the live checkout for that environment, whose source is mounted directly into the running container +- **Build directory**: `/home/ubuntu/ampRepo_for_docker_build/AmpliconRepository` — used only for `docker build`; changes here do not affect the running server + +```bash +# Running server checkout (use AmpliconRepository-dev for the dev environment) +git clone https://github.com/AmpliconSuite/AmpliconRepository.git /home/ubuntu/AmpliconRepository-prod + +# Build-only checkout +mkdir -p /home/ubuntu/ampRepo_for_docker_build +git clone https://github.com/AmpliconSuite/AmpliconRepository.git /home/ubuntu/ampRepo_for_docker_build/AmpliconRepository +``` + +### iv. Create required directories and place config.sh + +```bash +cd /home/ubuntu/AmpliconRepository-prod # or AmpliconRepository-dev +mkdir -p logs tmp neo4j/conf neo4j/data +``` + +Place `config.sh` at `caper/config.sh` inside the running-server checkout. The checkout root is mounted as `/srv/` inside the container, so the file will be read from `/srv/caper/config.sh` at runtime. `config.sh` contains all environment-specific secrets and settings — database connection strings, OAuth keys, S3 bucket names, and so on. Obtain it from another team member. **Do not commit it to git.** + +MongoDB connectivity uses the existing DocumentDB cluster; the connection string is provided via the `DB_URI_SECRET` variable in `config.sh`. + +### v. Configure AWS credentials + +The container mounts `/home/ubuntu/.aws` so that the application can reach S3. 
Use one of: + +- **IAM instance role** (preferred for EC2): attach a role with the required S3 permissions to the instance — no credential file is needed +- **Access keys**: place credentials in `/home/ubuntu/.aws/credentials` in the standard AWS shared credentials format + +### vi. Build the Docker image + +Build from the build-only checkout so the running server is not affected. The image packages the Python virtual environment and system dependencies; source code is supplied at runtime via volume mount and does not need to be re-baked on every code change. + +```bash +cd /home/ubuntu/ampRepo_for_docker_build/AmpliconRepository/caper +source caper/config.sh # sets AMPLICON_ENV and other variables +docker build -t genepattern/amplicon-repo:${AMPLICON_ENV} . +``` + +### vii. Start Neo4j + +Neo4j runs as a Docker container. Run the start script from the running-server checkout root (where `caper/config.sh` is visible at the relative path the script expects): + +```bash +cd /home/ubuntu/AmpliconRepository-prod # or AmpliconRepository-dev +./start-neo4j-container.sh +``` + +Neo4j data and configuration are persisted in `neo4j/data/` and `neo4j/conf/` under the checkout root. To stop Neo4j: + +```bash +./stop-neo4j-container.sh +``` + +### viii. Start the application server + +```bash +cd /home/ubuntu/AmpliconRepository-prod # or AmpliconRepository-dev +./start-server.sh +``` + +This launches a gunicorn container named `amplicon-${AMPLICON_ENV}` with the source checkout mounted at `/srv/`. Confirm it is running with `docker ps`. Logs are written to the `logs/` directory in the checkout root. ## 1. How to start the server - SSH into the EC2 instance (called `ampliconrepo-ubuntu-20.04`)