Investigate splitting private data into its own bucket #70

Draft: wants to merge 4 commits into base branch apl-setup
2 changes: 2 additions & 0 deletions .github/workflows/frontend-ci.yml
@@ -61,6 +61,8 @@ jobs:
DJANGO_MINIO_STORAGE_SECRET_KEY: minioSecretKey
DJANGO_STORAGE_BUCKET_NAME: dandi-bucket
DJANGO_DANDI_DANDISETS_BUCKET_NAME: dandi-bucket
      # TODO: add the embargo bucket env var once the setting exists:
# DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME: dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME: dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME: dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL: http://localhost:8085
6 changes: 6 additions & 0 deletions dandiapi/analytics/models.py
@@ -13,6 +13,12 @@ class ProcessedS3Log(models.Model):
        ],
    )

    # TODO: we need a field that records which S3 bucket this log file lives in.
    # If there are only ever two buckets (public, embargo), a boolean still works
    # (currently this is historically_embargoed); do we rename it or leave it?
    # With three or more buckets, an enum would distinguish them more cleanly.

    # Represents if this s3 log file was embargoed prior to the embargo re-design.
    # If this field is True, the log file lives in the S3 bucket pointed to by the
    # DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME setting.
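    # A minimal sketch of the enum option, in case the boolean is outgrown.
    # The choices class and field name are illustrative, not final:
    #
    # class LogBucketChoice(models.TextChoices):
    #     PUBLIC = 'public'
    #     EMBARGO = 'embargo'
    #     PRIVATE = 'private'
    #
    # bucket = models.CharField(
    #     max_length=16, choices=LogBucketChoice.choices, default=LogBucketChoice.PUBLIC
    # )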
14 changes: 13 additions & 1 deletion dandiapi/analytics/tasks/__init__.py
@@ -26,26 +26,38 @@
LogBucket = str


# TODO: revert to the bucket-parameterized signature:
# def _bucket_objects_after(bucket: str, after: str | None) -> Generator[dict, None, None]:
def _bucket_objects_after(after: str | None) -> Generator[dict, None, None]:
    # TODO: build the client from the storage for the given bucket
    s3 = get_boto_client(get_storage())
    kwargs = {}
    if after:
        kwargs['StartAfter'] = after

    paginator = s3.get_paginator('list_objects_v2')
    # TODO: paginate over the given bucket instead of the hardcoded setting:
    # for page in paginator.paginate(Bucket=bucket, **kwargs):
    for page in paginator.paginate(Bucket=settings.DANDI_DANDISETS_LOG_BUCKET_NAME, **kwargs):
        yield from page.get('Contents', [])


# TODO: revert to the bucket-parameterized signature:
# def collect_s3_log_records_task(bucket: LogBucket) -> None:
@shared_task(queue='s3-log-processing', soft_time_limit=60, time_limit=80)
def collect_s3_log_records_task() -> None:
    """Dispatch a task per S3 log file to process for download counts."""
    # TODO: restrict this aggregate to logs from the given bucket
    after = ProcessedS3Log.objects.aggregate(last_log=Max('name'))['last_log']

    # TODO: pass the bucket through to the generator and the inner task:
    # for s3_log_object in _bucket_objects_after(bucket, after):
    #     process_s3_log_file_task.delay(bucket, s3_log_object['Key'])
    for s3_log_object in _bucket_objects_after(after):
        process_s3_log_file_task.delay(s3_log_object['Key'])
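    # Sketch (assumption): once ProcessedS3Log records its bucket, the
    # StartAfter marker above would be computed per bucket, e.g.:
    #
    # after = ProcessedS3Log.objects.filter(bucket=bucket).aggregate(
    #     last_log=Max('name')
    # )['last_log']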


# TODO: revert to the bucket-parameterized signature?
# def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:
@shared_task(queue='s3-log-processing', soft_time_limit=120, time_limit=140)
def process_s3_log_file_task(s3_log_key: str) -> None:
    """
11 changes: 11 additions & 0 deletions dandiapi/analytics/tests/test_download_counts.py
@@ -44,34 +44,45 @@ def s3_log_file(s3_log_bucket, asset_blob):
    s3.delete_object(Bucket=s3_log_bucket, Key=log_file_name)


# TODO: add s3_log_bucket back in as a parameter:
# def test_processing_s3_log_files(s3_log_bucket, s3_log_file, asset_blob):
@pytest.mark.django_db
def test_processing_s3_log_files(s3_log_file, asset_blob):
    # TODO: pass s3_log_bucket
    collect_s3_log_records_task()
    asset_blob.refresh_from_db()

    assert ProcessedS3Log.objects.count() == 1
    assert asset_blob.download_count == 1


# TODO: add s3_log_bucket back in as a parameter:
# def test_processing_s3_log_files_idempotent(s3_log_bucket, s3_log_file, asset_blob):
@pytest.mark.django_db
def test_processing_s3_log_files_idempotent(s3_log_file, asset_blob):
    # This tests that the outer task, which collects the log files to process,
    # is idempotent; in other words, that it uses StartAfter correctly.
    # TODO: pass s3_log_bucket
    collect_s3_log_records_task()
    # Run the task again; it should skip the existing log record.
    # TODO: pass s3_log_bucket
    collect_s3_log_records_task()
    asset_blob.refresh_from_db()

    assert ProcessedS3Log.objects.count() == 1
    assert asset_blob.download_count == 1


# TODO: add s3_log_bucket back in as a parameter:
# def test_processing_s3_log_file_task_idempotent(s3_log_bucket, s3_log_file, asset_blob):
@pytest.mark.django_db
def test_processing_s3_log_file_task_idempotent(s3_log_file, asset_blob):
    # This tests that the inner task, which processes a single log file, is
    # idempotent, utilizing the unique constraint on ProcessedS3Log correctly.
    # TODO: pass s3_log_bucket
    process_s3_log_file_task(s3_log_file)
    # Run the task again; it should ignore the duplicate log.
    # TODO: pass s3_log_bucket
    process_s3_log_file_task(s3_log_file)
    asset_blob.refresh_from_db()

1 change: 1 addition & 0 deletions dandiapi/api/management/commands/cleanup_blobs.py
@@ -6,6 +6,7 @@

from dandiapi.api.models.upload import AssetBlob

# TODO: handle embargo bucket
BUCKET = settings.DANDI_DANDISETS_BUCKET_NAME


4 changes: 4 additions & 0 deletions dandiapi/api/manifests.py
@@ -19,6 +19,10 @@

def _s3_url(path: str) -> str:
    """Turn an object path into a fully qualified S3 URL."""
    # TODO: determine which bucket name to pass in:
    # if embargoed:
    #     storage = create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
    # else:
    storage = create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
    signed_url = storage.url(path)
    # Strip off the query parameters from the presigning, as they are different every time
2 changes: 2 additions & 0 deletions dandiapi/api/models/asset.py
@@ -60,7 +60,9 @@ class AssetBlob(TimeStampedModel):
    SHA256_REGEX = r'[0-9a-f]{64}'
    ETAG_REGEX = r'[0-9a-f]{32}(-[1-9][0-9]*)?'

    # TODO: do we need an indicator of embargo vs. private?
    embargoed = models.BooleanField(default=False)
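    # Sketch (assumption): if embargoed and private blobs must be told apart,
    # this boolean could become an access enum; names are illustrative:
    #
    # class Access(models.TextChoices):
    #     PUBLIC = 'public'
    #     EMBARGOED = 'embargoed'
    #     PRIVATE = 'private'
    #
    # access = models.CharField(max_length=16, choices=Access.choices, default=Access.PUBLIC)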
    # TODO: storage and upload_to will be dependent on bucket
    blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
    blob_id = models.UUIDField(unique=True)
    sha256 = models.CharField(  # noqa: DJ001
2 changes: 2 additions & 0 deletions dandiapi/api/models/upload.py
@@ -20,6 +20,7 @@ class Upload(models.Model): # noqa: DJ008

    dandiset = models.ForeignKey(Dandiset, related_name='uploads', on_delete=models.CASCADE)

    # TODO: storage and upload_to will be dependent on bucket
    blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
    embargoed = models.BooleanField(default=False)

@@ -45,6 +46,7 @@ class Meta:
    def object_key(upload_id):
        upload_id = str(upload_id)
        return (
            # TODO: determine which bucket's prefix to use
            f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}'
            f'blobs/{upload_id[0:3]}/{upload_id[3:6]}/{upload_id}'
        )
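    # Sketch (assumption): if the embargo bucket gets its own prefix setting,
    # the key would use whichever prefix matches the target bucket. The
    # DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX setting and `embargoed` argument
    # are hypothetical:
    #
    # def object_key(upload_id, *, embargoed: bool = False):
    #     prefix = (
    #         settings.DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX
    #         if embargoed
    #         else settings.DANDI_DANDISETS_BUCKET_PREFIX
    #     )
    #     upload_id = str(upload_id)
    #     return f'{prefix}blobs/{upload_id[0:3]}/{upload_id[3:6]}/{upload_id}'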
1 change: 1 addition & 0 deletions dandiapi/api/services/asset/__init__.py
@@ -173,6 +173,7 @@ def add_asset_to_version(
    # embargoed blob results in that blob being unembargoed.
    # NOTE: This only applies to asset blobs, as zarrs cannot belong to
    # multiple dandisets at once.
    # TODO: would this logic need to change with split buckets? Are there
    # unintended side effects?
    if (
        asset_blob is not None
        and asset_blob.embargoed
1 change: 1 addition & 0 deletions dandiapi/api/services/embargo/__init__.py
@@ -33,6 +33,7 @@
@transaction.atomic()
def unembargo_dandiset(ds: Dandiset, user: User):
"""Unembargo a dandiset by copying all embargoed asset blobs to the public bucket."""
# TODO: Move embargoed dandiset to public bucket
logger.info('Unembargoing Dandiset %s', ds.identifier)
logger.info('\t%s assets', ds.draft_version.assets.count())
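    # Sketch (assumption): with separate buckets, unembargo becomes a
    # cross-bucket copy rather than a tag removal; the client, key, and
    # embargo bucket setting here are illustrative:
    #
    # client.copy(
    #     CopySource={'Bucket': settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME, 'Key': key},
    #     Bucket=settings.DANDI_DANDISETS_BUCKET_NAME,
    #     Key=key,
    # )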

2 changes: 2 additions & 0 deletions dandiapi/api/services/embargo/utils.py
@@ -53,6 +53,7 @@ def newfn(*args, **kwargs):

@retry(times=3, exceptions=(Exception,))
def _delete_object_tags(client: S3Client, blob: str):
    # TODO: determine which bucket name to pass in
    client.delete_object_tagging(
        Bucket=settings.DANDI_DANDISETS_BUCKET_NAME,
        Key=blob,
@@ -62,6 +63,7 @@ def _delete_object_tags(client: S3Client, blob: str):
@retry(times=3, exceptions=(Exception,))
def _delete_zarr_object_tags(client: S3Client, zarr: str):
    paginator = client.get_paginator('list_objects_v2')
    # TODO: determine which bucket name to pass in
    pages = paginator.paginate(
        Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Prefix=zarr_s3_path(zarr_id=zarr)
    )
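    # Sketch (assumption): the simplest fix is for callers to pass the bucket
    # in explicitly, e.g.:
    #
    # def _delete_zarr_object_tags(client: S3Client, bucket: str, zarr: str):
    #     ...
    #     pages = paginator.paginate(Bucket=bucket, Prefix=zarr_s3_path(zarr_id=zarr))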
4 changes: 4 additions & 0 deletions dandiapi/api/storage.py
@@ -369,6 +369,10 @@ def get_storage_params(storage: Storage):


def get_storage() -> Storage:
    # TODO: add a parameter, or split into one function per bucket:
    # if embargoed:
    #     return create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
    # else:
    return create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
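
# Sketch (assumption) of the "function per bucket" option from the TODO above;
# the name and the embargo setting are not final:
#
# def get_embargo_storage() -> Storage:
#     return create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)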


1 change: 1 addition & 0 deletions dandiapi/api/tasks/scheduled.py
@@ -158,6 +158,7 @@ def register_scheduled_tasks(sender: Celery, **kwargs):
    sender.add_periodic_task(timedelta(minutes=10), refresh_materialized_view_search.s())

    # Process new S3 logs every hour
    # TODO: pass the log bucket name to collect_s3_log_records_task
    sender.add_periodic_task(timedelta(hours=1), collect_s3_log_records_task.s())
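    # Sketch (assumption): once the task takes a bucket argument again, this
    # would be scheduled once per log bucket:
    #
    # for bucket in (
    #     settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
    #     settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME,
    # ):
    #     sender.add_periodic_task(timedelta(hours=1), collect_s3_log_records_task.s(bucket))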

    # Run garbage collection once a day
2 changes: 2 additions & 0 deletions dandiapi/api/tests/test_audit.py
@@ -380,6 +380,7 @@ def test_audit_finalize_zarr(
    boto = get_boto_client()
    zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
    for path in paths:
        # TODO: test both buckets?
        boto.put_object(
            Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
        )
@@ -421,6 +422,7 @@ def test_audit_delete_zarr_chunks(
    boto = get_boto_client()
    zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
    for path in paths:
        # TODO: test both buckets?
        boto.put_object(
            Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
        )
2 changes: 2 additions & 0 deletions dandiapi/conftest.py
@@ -97,6 +97,7 @@ def base_s3_storage_factory(bucket_name: str) -> S3Storage:


def s3_storage_factory():
    # TODO: add a bucket_name parameter, or a separate factory for the embargo bucket
    return base_s3_storage_factory(settings.DANDI_DANDISETS_BUCKET_NAME)


@@ -105,6 +106,7 @@ def base_minio_storage_factory(bucket_name: str) -> MinioStorage:


def minio_storage_factory() -> MinioStorage:
    # TODO: add a bucket_name parameter, or a separate factory for the embargo bucket
    return base_minio_storage_factory(settings.DANDI_DANDISETS_BUCKET_NAME)
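
# Sketch (assumption) of the "separate factory" option; the embargo setting is
# hypothetical until added in settings.py:
#
# def embargo_s3_storage_factory():
#     return base_s3_storage_factory(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)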


4 changes: 4 additions & 0 deletions dandiapi/settings.py
@@ -86,6 +86,8 @@ def mutate_configuration(configuration: type[ComposedConfiguration]):
        os.environ['DANDI_ALLOW_LOCALHOST_URLS'] = 'True'

    DANDI_DANDISETS_BUCKET_NAME = values.Value(environ_required=True)
    # TODO: add the embargo bucket name setting
    # DANDI_DANDISETS_EMBARGO_BUCKET_NAME = values.Value(environ_required=True)
    DANDI_DANDISETS_BUCKET_PREFIX = values.Value(default='', environ=True)
    DANDI_DANDISETS_LOG_BUCKET_NAME = values.Value(environ_required=True)
    DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = values.Value(environ_required=True)
@@ -156,6 +158,8 @@ def mutate_configuration(config: type[ComposedConfiguration]):

class TestingConfiguration(DandiMixin, TestingBaseConfiguration):
    DANDI_DANDISETS_BUCKET_NAME = 'test-dandiapi-dandisets'
    # TODO: add the embargo bucket name setting
    # DANDI_DANDISETS_EMBARGO_BUCKET_NAME = 'test-embargo-dandiapi-dandisets'
    DANDI_DANDISETS_BUCKET_PREFIX = 'test-prefix/'
    DANDI_DANDISETS_LOG_BUCKET_NAME = 'test-dandiapi-dandisets-logs'
    DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = 'test-embargo-dandiapi-dandisets-logs'
1 change: 1 addition & 0 deletions dandiapi/zarr/models.py
@@ -34,6 +34,7 @@ class ZarrArchiveStatus(models.TextChoices):
class ZarrArchive(TimeStampedModel):
    UUID_REGEX = r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}'
    INGEST_ERROR_MSG = 'Zarr archive is currently ingesting or has already ingested'
    # TODO: pass in the bucket name (?)
    storage = get_storage()
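    # Sketch (assumption): a class-level attribute cannot vary per archive, so
    # a per-bucket lookup would likely become a property. The `embargoed`
    # indicator and get_embargo_storage() helper are hypothetical:
    #
    # @property
    # def storage(self) -> Storage:
    #     return get_embargo_storage() if self.embargoed else get_storage()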

    class Meta:
2 changes: 2 additions & 0 deletions dev/.env.docker-compose
@@ -7,6 +7,8 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
DJANGO_STORAGE_BUCKET_NAME=django-storage
DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
# TODO: Add embargo bucket env var
DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
2 changes: 2 additions & 0 deletions dev/.env.docker-compose-native
@@ -7,6 +7,8 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
DJANGO_STORAGE_BUCKET_NAME=django-storage
DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
# TODO: Add embargo bucket env var
DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
3 changes: 3 additions & 0 deletions doc/design/private-and-embargo-bucket-design.md
@@ -0,0 +1,3 @@
# Design: moving private and embargoed data into a private bucket

