diff --git a/.github/workflows/frontend-ci.yml b/.github/workflows/frontend-ci.yml
index f589d89da..70cc0f713 100644
--- a/.github/workflows/frontend-ci.yml
+++ b/.github/workflows/frontend-ci.yml
@@ -61,6 +61,8 @@ jobs:
           DJANGO_MINIO_STORAGE_SECRET_KEY: minioSecretKey
           DJANGO_STORAGE_BUCKET_NAME: dandi-bucket
           DJANGO_DANDI_DANDISETS_BUCKET_NAME: dandi-bucket
+          # TODO: Add env var
+          # DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME: dandi-embargo-dandisets
           DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME: dandiapi-dandisets-logs
           DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME: dandiapi-embargo-dandisets-logs
           DJANGO_DANDI_WEB_APP_URL: http://localhost:8085
diff --git a/dandiapi/analytics/models.py b/dandiapi/analytics/models.py
index ee2fa5586..0f1381fe2 100644
--- a/dandiapi/analytics/models.py
+++ b/dandiapi/analytics/models.py
@@ -13,6 +13,12 @@ class ProcessedS3Log(models.Model):
         ],
     )

+    # TODO: we need a variable to tell us where this s3 log file lives
+    # if only 2 buckets (public, embargo), we can still use a boolean
+    # (currently this is historically_embargoed)
+    # do we want to change the name or leave it?
+    # else, we could use an enum to distinguish between 3+ buckets
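+    #
+    # A possible shape for the enum option (the class, field, and values below
+    # are hypothetical, not part of this change):
+    # class LogBucket(models.TextChoices):
+    #     PUBLIC = 'public'
+    #     EMBARGO = 'embargo'
+    #     PRIVATE = 'private'
+    #
+    # bucket = models.CharField(max_length=16, choices=LogBucket.choices, default=LogBucket.PUBLIC)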
+
     # Represents if this s3 log file was embargoed prior to the embargo re-design.
     # If this field is True, the log file lives in the S3 bucket pointed to by the
     # DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME setting.
diff --git a/dandiapi/analytics/tasks/__init__.py b/dandiapi/analytics/tasks/__init__.py
index 3314bc305..5e6e88312 100644
--- a/dandiapi/analytics/tasks/__init__.py
+++ b/dandiapi/analytics/tasks/__init__.py
@@ -26,26 +26,38 @@
 LogBucket = str


+# TODO: Revert function def & use bucket var
+# def _bucket_objects_after(bucket: str, after: str | None) -> Generator[dict, None, None]:
 def _bucket_objects_after(after: str | None) -> Generator[dict, None, None]:
+    # TODO: bucket
     s3 = get_boto_client(get_storage())
     kwargs = {}

     if after:
         kwargs['StartAfter'] = after

     paginator = s3.get_paginator('list_objects_v2')
+    # TODO: bucket
+    # for page in paginator.paginate(Bucket=bucket, **kwargs):
     for page in paginator.paginate(Bucket=settings.DANDI_DANDISETS_LOG_BUCKET_NAME, **kwargs):
         yield from page.get('Contents', [])

-
+# TODO: Revert function def & use bucket var
+# def collect_s3_log_records_task(bucket: LogBucket) -> None:
 @shared_task(queue='s3-log-processing', soft_time_limit=60, time_limit=80)
 def collect_s3_log_records_task() -> None:
     """Dispatch a task per S3 log file to process for download counts."""
+    # TODO: bucket
     after = ProcessedS3Log.objects.aggregate(last_log=Max('name'))['last_log']
+    # TODO: bucket
+    # for s3_log_object in _bucket_objects_after(bucket, after):
+    #     process_s3_log_file_task.delay(bucket, s3_log_object['Key'])
     for s3_log_object in _bucket_objects_after(after):
         process_s3_log_file_task.delay(s3_log_object['Key'])


+# TODO: Revert function def?
+# def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:
 @shared_task(queue='s3-log-processing', soft_time_limit=120, time_limit=140)
 def process_s3_log_file_task(s3_log_key: str) -> None:
     """
diff --git a/dandiapi/analytics/tests/test_download_counts.py b/dandiapi/analytics/tests/test_download_counts.py
index 679c21478..f8e66cfc3 100644
--- a/dandiapi/analytics/tests/test_download_counts.py
+++ b/dandiapi/analytics/tests/test_download_counts.py
@@ -44,8 +44,11 @@ def s3_log_file(s3_log_bucket, asset_blob):
     s3.delete_object(Bucket=s3_log_bucket, Key=log_file_name)


+# TODO: add s3_log_bucket back in as a parameter
+# def test_processing_s3_log_files(s3_log_bucket, s3_log_file, asset_blob):
 @pytest.mark.django_db
 def test_processing_s3_log_files(s3_log_file, asset_blob):
+    # TODO: s3_log_bucket
     collect_s3_log_records_task()

     asset_blob.refresh_from_db()
@@ -53,12 +56,16 @@ def test_processing_s3_log_files(s3_log_file, asset_blob):
     assert asset_blob.download_count == 1


+# TODO: add s3_log_bucket back in as a parameter
+# def test_processing_s3_log_files_idempotent(s3_log_bucket, s3_log_file, asset_blob):
 @pytest.mark.django_db
 def test_processing_s3_log_files_idempotent(s3_log_file, asset_blob):
     # this tests that the outer task which collects the log files to process is
     # idempotent, in other words, it uses StartAfter correctly.
+    # TODO: s3_log_bucket
     collect_s3_log_records_task()

     # run the task again, it should skip the existing log record
+    # TODO: s3_log_bucket
     collect_s3_log_records_task()

     asset_blob.refresh_from_db()
@@ -66,12 +73,16 @@ def test_processing_s3_log_files_idempotent(s3_log_file, asset_blob):
     assert asset_blob.download_count == 1


+# TODO: add s3_log_bucket back in as a parameter
+# def test_processing_s3_log_file_task_idempotent(s3_log_bucket, s3_log_file, asset_blob):
 @pytest.mark.django_db
 def test_processing_s3_log_file_task_idempotent(s3_log_file, asset_blob):
     # this tests that the inner task which processes a single log file is
     # idempotent, utilizing the unique constraint on ProcessedS3Log correctly.
+    # TODO: s3_log_bucket
     process_s3_log_file_task(s3_log_file)

     # run the task again, it should ignore the new log
+    # TODO: s3_log_bucket
     process_s3_log_file_task(s3_log_file)

     asset_blob.refresh_from_db()
diff --git a/dandiapi/api/management/commands/cleanup_blobs.py b/dandiapi/api/management/commands/cleanup_blobs.py
index 4051616f8..ea2783e08 100644
--- a/dandiapi/api/management/commands/cleanup_blobs.py
+++ b/dandiapi/api/management/commands/cleanup_blobs.py
@@ -6,6 +6,7 @@
 from dandiapi.api.models.upload import AssetBlob

+# TODO: handle embargo bucket
 BUCKET = settings.DANDI_DANDISETS_BUCKET_NAME
diff --git a/dandiapi/api/manifests.py b/dandiapi/api/manifests.py
index 7b12ede31..e0d2d1d4f 100644
--- a/dandiapi/api/manifests.py
+++ b/dandiapi/api/manifests.py
@@ -19,6 +19,10 @@
 def _s3_url(path: str) -> str:
     """Turn an object path into a fully qualified S3 URL."""
+    # TODO: determine which bucket name to pass in
+    # if embargoed:
+    #     storage = create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
+    # else:
     storage = create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
     signed_url = storage.url(path)
     # Strip off the query parameters from the presigning, as they are different every time
diff --git a/dandiapi/api/models/asset.py b/dandiapi/api/models/asset.py
index 92ba2a633..ab6f08358 100644
--- a/dandiapi/api/models/asset.py
+++ b/dandiapi/api/models/asset.py
@@ -60,7 +60,9 @@ class AssetBlob(TimeStampedModel):
     SHA256_REGEX = r'[0-9a-f]{64}'
     ETAG_REGEX = r'[0-9a-f]{32}(-[1-9][0-9]*)?'

+    # TODO: do we need an indicator of embargo vs. private?
     embargoed = models.BooleanField(default=False)
+    # TODO: storage and upload_to will be dependent on bucket
     blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
     blob_id = models.UUIDField(unique=True)
     sha256 = models.CharField(  # noqa: DJ001
diff --git a/dandiapi/api/models/upload.py b/dandiapi/api/models/upload.py
index f2569c70c..9329956fc 100644
--- a/dandiapi/api/models/upload.py
+++ b/dandiapi/api/models/upload.py
@@ -20,6 +20,7 @@ class Upload(models.Model):  # noqa: DJ008
     dandiset = models.ForeignKey(Dandiset, related_name='uploads', on_delete=models.CASCADE)

+    # TODO: storage and upload_to will be dependent on bucket
     blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
     embargoed = models.BooleanField(default=False)

@@ -45,6 +46,7 @@ class Meta:
     def object_key(upload_id):
         upload_id = str(upload_id)
         return (
+            # TODO: determine which bucket
             f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}'
             f'blobs/{upload_id[0:3]}/{upload_id[3:6]}/{upload_id}'
         )
diff --git a/dandiapi/api/services/asset/__init__.py b/dandiapi/api/services/asset/__init__.py
index b50dfa6c1..78d2ec718 100644
--- a/dandiapi/api/services/asset/__init__.py
+++ b/dandiapi/api/services/asset/__init__.py
@@ -173,6 +173,7 @@ def add_asset_to_version(
     # embargoed blob results in that blob being unembargoed.
     # NOTE: This only applies to asset blobs, as zarrs cannot belong to
     # multiple dandisets at once.
+    # TODO: unsure if this logic would need to change? are there unintended side-effects?
     if (
         asset_blob is not None
         and asset_blob.embargoed
diff --git a/dandiapi/api/services/embargo/__init__.py b/dandiapi/api/services/embargo/__init__.py
index b571613f0..7c79dd610 100644
--- a/dandiapi/api/services/embargo/__init__.py
+++ b/dandiapi/api/services/embargo/__init__.py
@@ -33,6 +33,7 @@
 @transaction.atomic()
 def unembargo_dandiset(ds: Dandiset, user: User):
     """Unembargo a dandiset by copying all embargoed asset blobs to the public bucket."""
+    # TODO: Move embargoed dandiset to public bucket
     logger.info('Unembargoing Dandiset %s', ds.identifier)
     logger.info('\t%s assets', ds.draft_version.assets.count())
diff --git a/dandiapi/api/services/embargo/utils.py b/dandiapi/api/services/embargo/utils.py
index d546e242c..0f3ef1d43 100644
--- a/dandiapi/api/services/embargo/utils.py
+++ b/dandiapi/api/services/embargo/utils.py
@@ -53,6 +53,7 @@ def newfn(*args, **kwargs):
 @retry(times=3, exceptions=(Exception,))
 def _delete_object_tags(client: S3Client, blob: str):
+    # TODO: determine which bucket name to pass in
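+    # One possible sketch, with the bucket passed in by the caller rather than
+    # hardcoded here (the extra parameter is hypothetical):
+    # def _delete_object_tags(client: S3Client, blob: str, bucket: str):
+    #     client.delete_object_tagging(Bucket=bucket, Key=blob)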
     client.delete_object_tagging(
         Bucket=settings.DANDI_DANDISETS_BUCKET_NAME,
         Key=blob,
     )

@@ -62,6 +63,7 @@ def _delete_zarr_object_tags(client: S3Client, zarr: str):
     paginator = client.get_paginator('list_objects_v2')
+    # TODO: determine which bucket name to pass in
     pages = paginator.paginate(
         Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Prefix=zarr_s3_path(zarr_id=zarr)
     )
diff --git a/dandiapi/api/storage.py b/dandiapi/api/storage.py
index 0fa1e1a34..e42af85ff 100644
--- a/dandiapi/api/storage.py
+++ b/dandiapi/api/storage.py
@@ -369,6 +369,10 @@ def get_storage_params(storage: Storage):


 def get_storage() -> Storage:
+    # TODO: add parameter or split into function per bucket name
+    # if embargoed:
+    #     return create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
+    # else:
     return create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
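+
+
+# A sketch of the "split into function per bucket name" option; the function
+# name and the DANDI_DANDISETS_EMBARGO_BUCKET_NAME setting are assumptions:
+# def get_embargo_storage() -> Storage:
+#     return create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)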
diff --git a/dandiapi/api/tasks/scheduled.py b/dandiapi/api/tasks/scheduled.py
index 9deac4f84..dfe25654a 100644
--- a/dandiapi/api/tasks/scheduled.py
+++ b/dandiapi/api/tasks/scheduled.py
@@ -158,6 +158,7 @@ def register_scheduled_tasks(sender: Celery, **kwargs):
     sender.add_periodic_task(timedelta(minutes=10), refresh_materialized_view_search.s())

     # Process new S3 logs every hour
+    # TODO: pass in bucket name to collect_s3_log_records_task
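+    # One possible sketch, assuming the task regains the bucket parameter shown
+    # in the commented-out signature in dandiapi/analytics/tasks/__init__.py:
+    # sender.add_periodic_task(
+    #     timedelta(hours=1),
+    #     collect_s3_log_records_task.s(settings.DANDI_DANDISETS_LOG_BUCKET_NAME),
+    # )
+    # sender.add_periodic_task(
+    #     timedelta(hours=1),
+    #     collect_s3_log_records_task.s(settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME),
+    # )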
     sender.add_periodic_task(timedelta(hours=1), collect_s3_log_records_task.s())

     # Run garbage collection once a day
diff --git a/dandiapi/api/tests/test_audit.py b/dandiapi/api/tests/test_audit.py
index d9c0521ca..65043d5c9 100644
--- a/dandiapi/api/tests/test_audit.py
+++ b/dandiapi/api/tests/test_audit.py
@@ -380,6 +380,7 @@ def test_audit_finalize_zarr(
     boto = get_boto_client()
     zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
     for path in paths:
+        # TODO: test both buckets?
         boto.put_object(
             Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
         )
@@ -421,6 +422,7 @@ def test_audit_delete_zarr_chunks(
     boto = get_boto_client()
     zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
     for path in paths:
+        # TODO: test both buckets?
         boto.put_object(
             Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
         )
diff --git a/dandiapi/conftest.py b/dandiapi/conftest.py
index 332b8640e..86c67d450 100644
--- a/dandiapi/conftest.py
+++ b/dandiapi/conftest.py
@@ -97,6 +97,7 @@ def base_s3_storage_factory(bucket_name: str) -> S3Storage:


 def s3_storage_factory():
+    # TODO: Add bucket_name parameter OR new function for embargo bucket
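+    # Sketch of the "new function" option; the function name and the embargo
+    # bucket setting are assumptions:
+    # def embargo_s3_storage_factory() -> S3Storage:
+    #     return base_s3_storage_factory(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)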
     return base_s3_storage_factory(settings.DANDI_DANDISETS_BUCKET_NAME)


@@ -105,6 +106,7 @@ def base_minio_storage_factory(bucket_name: str) -> MinioStorage:


 def minio_storage_factory() -> MinioStorage:
+    # TODO: Add bucket_name parameter OR new function for embargo bucket
     return base_minio_storage_factory(settings.DANDI_DANDISETS_BUCKET_NAME)
diff --git a/dandiapi/settings.py b/dandiapi/settings.py
index a72d246ed..dacb4fddd 100644
--- a/dandiapi/settings.py
+++ b/dandiapi/settings.py
@@ -86,6 +86,8 @@ def mutate_configuration(configuration: type[ComposedConfiguration]):
             os.environ['DANDI_ALLOW_LOCALHOST_URLS'] = 'True'

     DANDI_DANDISETS_BUCKET_NAME = values.Value(environ_required=True)
+    # TODO: add embargo bucket name constant
+    # DANDI_DANDISETS_EMBARGO_BUCKET_NAME = values.Value(environ_required=True)
     DANDI_DANDISETS_BUCKET_PREFIX = values.Value(default='', environ=True)
     DANDI_DANDISETS_LOG_BUCKET_NAME = values.Value(environ_required=True)
     DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = values.Value(environ_required=True)
@@ -156,6 +158,8 @@ def mutate_configuration(config: type[ComposedConfiguration]):

 class TestingConfiguration(DandiMixin, TestingBaseConfiguration):
     DANDI_DANDISETS_BUCKET_NAME = 'test-dandiapi-dandisets'
+    # TODO: add embargo bucket name constant
+    # DANDI_DANDISETS_EMBARGO_BUCKET_NAME = 'test-embargo-dandiapi-dandisets'
     DANDI_DANDISETS_BUCKET_PREFIX = 'test-prefix/'
     DANDI_DANDISETS_LOG_BUCKET_NAME = 'test-dandiapi-dandisets-logs'
     DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = 'test-embargo-dandiapi-dandisets-logs'
diff --git a/dandiapi/zarr/models.py b/dandiapi/zarr/models.py
index 622b2c037..4255a7d45 100644
--- a/dandiapi/zarr/models.py
+++ b/dandiapi/zarr/models.py
@@ -34,6 +34,7 @@ class ZarrArchiveStatus(models.TextChoices):
 class ZarrArchive(TimeStampedModel):
     UUID_REGEX = r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}'
     INGEST_ERROR_MSG = 'Zarr archive is currently ingesting or has already ingested'
+    # TODO: pass in bucket name (?)
     storage = get_storage()

     class Meta:
diff --git a/dev/.env.docker-compose b/dev/.env.docker-compose
index 3e063c287..798995aa3 100644
--- a/dev/.env.docker-compose
+++ b/dev/.env.docker-compose
@@ -7,6 +7,8 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
 DJANGO_STORAGE_BUCKET_NAME=django-storage
 DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
 DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
+# TODO: Add embargo bucket env var
+DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
 DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
 DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
 DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
diff --git a/dev/.env.docker-compose-native b/dev/.env.docker-compose-native
index feeed7df3..95eebee4a 100644
--- a/dev/.env.docker-compose-native
+++ b/dev/.env.docker-compose-native
@@ -7,6 +7,8 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
 DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
 DJANGO_STORAGE_BUCKET_NAME=django-storage
 DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
+# TODO: Add embargo bucket env var
+DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
 DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
 DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
 DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
diff --git a/doc/design/private-and-embargo-bucket-design.md b/doc/design/private-and-embargo-bucket-design.md
new file mode 100644
index 000000000..e41a74e02
--- /dev/null
+++ b/doc/design/private-and-embargo-bucket-design.md
@@ -0,0 +1,3 @@
+# Design: moving private and embargoed data into a private bucket
+
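+## Sketch: bucket routing
+
+Everything below is a working sketch, not a settled design. It assumes a new
+`DANDI_DANDISETS_EMBARGO_BUCKET_NAME` setting alongside the existing
+`DANDI_DANDISETS_BUCKET_NAME`, mirroring the existing pair of log buckets; the
+`embargoed` parameter name is a placeholder.
+
+```python
+from django.conf import settings
+
+from dandiapi.api.storage import create_s3_storage
+
+
+def get_storage(embargoed: bool = False):
+    """Pick the storage backend based on which bucket a blob should live in."""
+    if embargoed:
+        return create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
+    return create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
+```
+
+Open questions carried over from the TODOs in this change:
+
+- Is a boolean enough to distinguish buckets, or does a third (private) bucket
+  require an enum, as discussed on `ProcessedS3Log`?
+- Unembargoing would become a cross-bucket move rather than an in-place tag
+  update (see `unembargo_dandiset` and `_delete_object_tags`).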