Investigate splitting private data into its own bucket #70

Draft: wants to merge 23 commits into base: apl-setup
Commits
2623b06
Added todos to mark places DJANGO_DANDI_DANDISETS_BUCKET_NAME is used
NEStock May 12, 2025
cfa417b
initial creation of private data design.md
sandyhider May 14, 2025
156a5d2
more notes on embargo bucket redesign changes
NEStock May 14, 2025
415c741
more notes on embargo bucket redesign changes, from dandi pr 1890
NEStock May 15, 2025
fa3275a
design for private embargo data
sandyhider May 28, 2025
756f6c5
Update private-and-embargo-bucket-design.md
sandyhider Jun 11, 2025
4f2a557
Update private-and-embargo-bucket-design.md
sandyhider Jun 11, 2025
f3b7462
Update private-and-embargo-bucket-design.md
NEStock Jun 12, 2025
150328d
Update private-and-embargo-bucket-design.md
NEStock Jun 12, 2025
75c3ce0
Update private-and-embargo-bucket-design.md
NEStock Jun 12, 2025
5c76d01
Update private-and-embargo-bucket-design.md
NEStock Jun 12, 2025
f8df81a
Merge branch 'apl-setup' into 69-investigate-splitting-private-data-i…
NEStock Jun 12, 2025
6d207cf
Add variables for private bucket name, allow private, and use private…
NEStock Jun 12, 2025
f4f6af6
Fix lint and tests
NEStock Jun 12, 2025
48107ad
Don't require private bucket variable
NEStock Jun 13, 2025
efdfc02
fix environment variable required
NEStock Jun 13, 2025
c3cd8ad
Merge pull request #83 from aplbrain/69-investigate-splitting-private…
NEStock Jun 13, 2025
d1afc4c
Updating comments to reflect 3 potential buckets
NEStock Jun 13, 2025
a31d612
Add private field to ProcessedS3Log class
NEStock Jun 13, 2025
fadcf0e
Update usages of ProcessedS3Log to handle private vs open bucket
NEStock Jun 16, 2025
31e08cf
Add migration file
NEStock Jun 17, 2025
7562bf6
Add note about test
NEStock Jun 17, 2025
6542bfe
Merge pull request #84 from aplbrain/69-nicole--refactor-processeds3l…
NEStock Jun 17, 2025
2 changes: 2 additions & 0 deletions .github/workflows/backend-ci.yml
@@ -61,3 +61,5 @@ jobs:
DJANGO_DANDI_DEV_EMAIL: [email protected]
DJANGO_DANDI_ADMIN_EMAIL: [email protected]
DANDI_ALLOW_LOCALHOST_URLS: 1
DJANGO_ALLOW_PRIVATE: true
DJANGO_USE_PRIVATE_BUCKET_FOR_EMBARGOED: true
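
For orientation only, a rough sketch (an assumption, not taken from this PR) of how these environment variables could map onto the Django settings referenced later in the diff (DANDI_DANDISETS_PRIVATE_BUCKET_NAME, DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME); the project's real settings machinery may differ, and the two boolean setting names below are guesses:

import os

# Hypothetical settings fragment; names mirror the CI variables set above.
DANDI_DANDISETS_PRIVATE_BUCKET_NAME = os.environ.get(
    'DJANGO_DANDI_DANDISETS_PRIVATE_BUCKET_NAME', ''
)
DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME = os.environ.get(
    'DJANGO_DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME', ''
)
# Guessed setting names for the two feature flags.
DANDI_ALLOW_PRIVATE = os.environ.get('DJANGO_ALLOW_PRIVATE', 'false').lower() == 'true'
DANDI_USE_PRIVATE_BUCKET_FOR_EMBARGOED = (
    os.environ.get('DJANGO_USE_PRIVATE_BUCKET_FOR_EMBARGOED', 'false').lower() == 'true'
)
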
4 changes: 4 additions & 0 deletions .github/workflows/frontend-ci.yml
@@ -73,14 +73,18 @@ jobs:
DJANGO_MINIO_STORAGE_SECRET_KEY: minioSecretKey
DJANGO_STORAGE_BUCKET_NAME: dandi-bucket
DJANGO_DANDI_DANDISETS_BUCKET_NAME: dandi-bucket
DJANGO_DANDI_DANDISETS_PRIVATE_BUCKET_NAME: dandi-private-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME: dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME: dandiapi-embargo-dandisets-logs
DJANGO_DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME: dandi-private-dandisets-logs
DJANGO_DANDI_WEB_APP_URL: http://localhost:8085
DJANGO_DANDI_API_URL: http://localhost:8000
DJANGO_DANDI_JUPYTERHUB_URL: https://hub.dandiarchive.org/
DJANGO_DANDI_DEV_EMAIL: [email protected]
DJANGO_DANDI_ADMIN_EMAIL: [email protected]
DANDI_ALLOW_LOCALHOST_URLS: 1
DJANGO_ALLOW_PRIVATE: true
DJANGO_USE_PRIVATE_BUCKET_FOR_EMBARGOED: true

# Web client env vars
VITE_APP_DANDI_API_ROOT: http://localhost:8000/api/
32 changes: 32 additions & 0 deletions dandiapi/analytics/migrations/0003_processeds3log_private.py
@@ -0,0 +1,32 @@
# Generated by Django 4.2.19 on 2025-06-17 13:56
from __future__ import annotations

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
(
'analytics',
'0002_remove_processeds3log_analytics_processeds3log_unique_name_embargoed_and_more',
),
]

operations = [
migrations.RemoveConstraint(
model_name='processeds3log',
name='analytics_processeds3log_unique_name_embargoed',
),
migrations.AddField(
model_name='processeds3log',
name='private',
field=models.BooleanField(default=False),
),
migrations.AddConstraint(
model_name='processeds3log',
constraint=models.UniqueConstraint(
fields=('name', 'private', 'historically_embargoed'),
name='analytics_processeds3log_unique_name_embargoed',
),
),
]
14 changes: 11 additions & 3 deletions dandiapi/analytics/models.py
@@ -13,15 +13,23 @@ class ProcessedS3Log(models.Model):
],
)

# Represents if this s3 log file is private (including embargoed) or public.
# If private is True, the log file lives in the S3 bucket pointed to by
# DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME.
# If private is False and historically_embargoed is False, the log file lives
# in the S3 bucket pointed to by DANDI_DANDISETS_LOG_BUCKET_NAME.
# If private is False and historically_embargoed is True, the log file lives
# in the S3 bucket pointed to by DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME.
private = models.BooleanField(default=False)

# Represents if this s3 log file was embargoed prior to the embargo re-design.
# If this field is True, the log file lives in the S3 bucket pointed to by the
# DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME setting.
historically_embargoed = models.BooleanField(default=False)

class Meta:
constraints = [
models.UniqueConstraint(
fields=['name', 'historically_embargoed'],
fields=['name', 'private', 'historically_embargoed'],
name='%(app_label)s_%(class)s_unique_name_embargoed',
)
]
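
Taken together, private and historically_embargoed route a processed log file to one of three log buckets. A small hypothetical helper (not part of this PR) makes the routing described in the comments above explicit:

from django.conf import settings


def log_bucket_name(*, private: bool, historically_embargoed: bool) -> str:
    """Return the log bucket setting a ProcessedS3Log row is expected to refer to."""
    if private:
        return settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME
    if historically_embargoed:
        return settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME
    return settings.DANDI_DANDISETS_LOG_BUCKET_NAME
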
51 changes: 37 additions & 14 deletions dandiapi/analytics/tasks/__init__.py
@@ -15,7 +15,7 @@

from dandiapi.analytics.models import ProcessedS3Log
from dandiapi.api.models.asset import AssetBlob
from dandiapi.api.storage import get_boto_client, get_storage
from dandiapi.api.storage import get_boto_client, get_private_storage, get_storage

if TYPE_CHECKING:
from collections.abc import Generator
@@ -24,43 +24,66 @@

# should be one of the DANDI_DANDISETS_*_LOG_BUCKET_NAME settings
LogBucket = str
# Log buckets actively used in the system
ACTIVE_LOG_BUCKETS = {
settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME,
}


def _bucket_objects_after(after: str | None) -> Generator[dict, None, None]:
s3 = get_boto_client(get_storage())
def _bucket_objects_after(bucket: str, after: str | None) -> Generator[dict, None, None]:
# Check that bucket name is valid
if bucket not in ACTIVE_LOG_BUCKETS:
raise ValueError(f'Non-log bucket: {bucket}')
private = bucket == settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME

s3 = get_boto_client(get_storage() if not private else get_private_storage())

kwargs = {}
if after:
kwargs['StartAfter'] = after

paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=settings.DANDI_DANDISETS_LOG_BUCKET_NAME, **kwargs):
for page in paginator.paginate(Bucket=bucket, **kwargs):
yield from page.get('Contents', [])


@shared_task(queue='s3-log-processing', soft_time_limit=60, time_limit=80)
def collect_s3_log_records_task() -> None:
def collect_s3_log_records_task(bucket: LogBucket) -> None:
"""Dispatch a task per S3 log file to process for download counts."""
after = ProcessedS3Log.objects.aggregate(last_log=Max('name'))['last_log']
# Check that bucket name is valid
if bucket not in ACTIVE_LOG_BUCKETS:
raise RuntimeError
private = bucket == settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME

for s3_log_object in _bucket_objects_after(after):
process_s3_log_file_task.delay(s3_log_object['Key'])
after = ProcessedS3Log.objects.filter(private=private).aggregate(last_log=Max('name'))[
'last_log'
]

for s3_log_object in _bucket_objects_after(bucket, after):
process_s3_log_file_task.delay(bucket, s3_log_object['Key'])


@shared_task(queue='s3-log-processing', soft_time_limit=120, time_limit=140)
def process_s3_log_file_task(s3_log_key: str) -> None:
def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:
"""
Process a single S3 log file for download counts.

Creates a ProcessedS3Log entry and updates the download counts for the relevant
asset blobs. Prevents duplicate processing with a unique constraint on the ProcessedS3Log name.
"""
# Check that bucket name is valid
if bucket not in ACTIVE_LOG_BUCKETS:
raise RuntimeError
private = bucket == settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME

# short circuit if the log file has already been processed. note that this doesn't guarantee
# exactly once processing, that's what the unique constraint on ProcessedS3Log is for.
if ProcessedS3Log.objects.filter(name=s3_log_key.split('/')[-1]).exists():
if ProcessedS3Log.objects.filter(name=s3_log_key.split('/')[-1], private=private).exists():
return

s3 = get_boto_client(get_storage())
data = s3.get_object(Bucket=settings.DANDI_DANDISETS_LOG_BUCKET_NAME, Key=s3_log_key)
s3 = get_boto_client(get_storage() if not private else get_private_storage())
data = s3.get_object(Bucket=bucket, Key=s3_log_key)
download_counts = Counter()

for log_entry in s3logparse.parse_log_lines(
@@ -71,14 +94,14 @@ def process_s3_log_file_task(s3_log_key: str) -> None:

with transaction.atomic():
try:
log = ProcessedS3Log(name=s3_log_key.split('/')[-1])
log = ProcessedS3Log(name=s3_log_key.split('/')[-1], private=private)
# disable constraint validation checking so duplicate errors can be detected and
# ignored. the rest of the full_clean errors should still be raised.
log.full_clean(validate_constraints=False)
log.save()
except IntegrityError as e:
if '_unique_name' in str(e):
logger.info('Already processed log file %s', s3_log_key)
logger.info('Already processed log file %s, private=%s', s3_log_key, private)
return

# we need to store all of the fully hydrated blob objects in memory in order to use
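
Both tasks now take the log bucket as their first argument, so callers are expected to fan out over ACTIVE_LOG_BUCKETS. An illustrative invocation (assumed usage, not shown in this hunk):

from dandiapi.analytics.tasks import ACTIVE_LOG_BUCKETS, collect_s3_log_records_task

# Queue one collection task per active log bucket (open and private).
for log_bucket in ACTIVE_LOG_BUCKETS:
    collect_s3_log_records_task.delay(log_bucket)

The scheduled task in dandiapi/api/tasks/scheduled.py below registers essentially this loop, though it lists the two bucket settings directly rather than reusing ACTIVE_LOG_BUCKETS.
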
27 changes: 17 additions & 10 deletions dandiapi/analytics/tests/test_download_counts.py
@@ -5,17 +5,24 @@

from dandiapi.analytics.models import ProcessedS3Log
from dandiapi.analytics.tasks import collect_s3_log_records_task, process_s3_log_file_task
from dandiapi.api.storage import create_s3_storage, get_boto_client
from dandiapi.api.storage import (
create_s3_storage,
get_boto_client,
get_private_storage,
get_storage,
)


@pytest.fixture
def s3_log_bucket():
# This file tests the processing of s3 logs, so we believe only one bucket needs to be covered here
return create_s3_storage(settings.DANDI_DANDISETS_LOG_BUCKET_NAME).bucket_name


@pytest.fixture
def s3_log_file(s3_log_bucket, asset_blob):
s3 = get_boto_client()
private = s3_log_bucket == settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME
s3 = get_boto_client(get_storage() if not private else get_private_storage())

log_file_name = '2019-02-06-00-00-38-5C5B0E0CA8F2B1B5'
s3.put_object(
@@ -45,34 +45,52 @@ def s3_log_file(s3_log_bucket, asset_blob):


@pytest.mark.django_db
def test_processing_s3_log_files(s3_log_file, asset_blob):
collect_s3_log_records_task()
def test_processing_s3_log_files(s3_log_bucket, s3_log_file, asset_blob):
collect_s3_log_records_task(s3_log_bucket)
asset_blob.refresh_from_db()

assert ProcessedS3Log.objects.count() == 1
assert asset_blob.download_count == 1


@pytest.mark.django_db
def test_processing_s3_log_files_idempotent(s3_log_file, asset_blob):
def test_processing_s3_log_files_idempotent(s3_log_bucket, s3_log_file, asset_blob):
# this tests that the outer task which collects the log files to process is
# idempotent, in other words, it uses StartAfter correctly.
collect_s3_log_records_task()
collect_s3_log_records_task(s3_log_bucket)
# run the task again, it should skip the existing log record
collect_s3_log_records_task()
collect_s3_log_records_task(s3_log_bucket)
asset_blob.refresh_from_db()

assert ProcessedS3Log.objects.count() == 1
assert asset_blob.download_count == 1


@pytest.mark.django_db
def test_processing_s3_log_file_task_idempotent(s3_log_file, asset_blob):
def test_processing_s3_log_file_task_idempotent(s3_log_bucket, s3_log_file, asset_blob):
# this tests that the inner task which processes a single log file is
# idempotent, utilizing the unique constraint on ProcessedS3Log correctly.
process_s3_log_file_task(s3_log_file)
process_s3_log_file_task(s3_log_bucket, s3_log_file)
# run the task again, it should ignore the new log
process_s3_log_file_task(s3_log_file)
process_s3_log_file_task(s3_log_bucket, s3_log_file)
asset_blob.refresh_from_db()

assert ProcessedS3Log.objects.count() == 1
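
The fixture comment above limits coverage to a single bucket. If both buckets ever need to be exercised, the bucket fixture could be parametrized so every test in this module runs against the open and the private log bucket; a hypothetical sketch (not in this PR), reusing the module's existing imports:

@pytest.fixture(
    params=[
        settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
        settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME,
    ]
)
def s3_log_bucket(request):
    return create_s3_storage(request.param).bucket_name
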
1 change: 1 addition & 0 deletions dandiapi/api/management/commands/cleanup_blobs.py
@@ -6,6 +6,7 @@

from dandiapi.api.models.upload import AssetBlob

# TODO: handle private bucket
BUCKET = settings.DANDI_DANDISETS_BUCKET_NAME
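
One possible shape for the TODO above (hypothetical, not decided in this PR): track both buckets so the cleanup command can cover open and private blobs.

BUCKETS = [
    settings.DANDI_DANDISETS_BUCKET_NAME,
    settings.DANDI_DANDISETS_PRIVATE_BUCKET_NAME,
]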


4 changes: 4 additions & 0 deletions dandiapi/api/manifests.py
@@ -19,6 +19,10 @@

def _s3_url(path: str) -> str:
"""Turn an object path into a fully qualified S3 URL."""
# TODO: determine which bucket name to pass in
# if embargoed:
# storage = create_s3_storage(settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME)
# else:
storage = create_s3_storage(settings.DANDI_DANDISETS_BUCKET_NAME)
signed_url = storage.url(path)
# Strip off the query parameters from the presigning, as they are different every time
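
A hypothetical completion of the TODO above, assuming the caller can tell the function which bucket the object lives in (the embargo bucket setting name comes from the TODO itself, not from code introduced in this diff):

def _s3_url(path: str, *, embargoed: bool = False) -> str:
    """Turn an object path into a fully qualified S3 URL on the appropriate bucket."""
    bucket_name = (
        settings.DANDI_DANDISETS_EMBARGO_BUCKET_NAME
        if embargoed
        else settings.DANDI_DANDISETS_BUCKET_NAME
    )
    storage = create_s3_storage(bucket_name)
    signed_url = storage.url(path)
    # Strip off the query parameters from the presigning, as they are different every time.
    return signed_url.split('?', 1)[0]
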
2 changes: 2 additions & 0 deletions dandiapi/api/models/asset.py
@@ -60,7 +60,9 @@ class AssetBlob(TimeStampedModel):
SHA256_REGEX = r'[0-9a-f]{64}'
ETAG_REGEX = r'[0-9a-f]{32}(-[1-9][0-9]*)?'

# TODO: do we need an indicator of embargo vs. private?
embargoed = models.BooleanField(default=False)
# TODO: storage and upload_to will be dependent on bucket
blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
blob_id = models.UUIDField(unique=True)
sha256 = models.CharField( # noqa: DJ001
2 changes: 2 additions & 0 deletions dandiapi/api/models/upload.py
@@ -20,6 +20,7 @@ class Upload(models.Model): # noqa: DJ008

dandiset = models.ForeignKey(Dandiset, related_name='uploads', on_delete=models.CASCADE)

# TODO: storage and upload_to will be dependent on bucket
blob = models.FileField(blank=True, storage=get_storage, upload_to=get_storage_prefix)
embargoed = models.BooleanField(default=False)

@@ -45,6 +46,7 @@ class Meta:
def object_key(upload_id):
upload_id = str(upload_id)
return (
# TODO: determine which bucket
f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}'
f'blobs/{upload_id[0:3]}/{upload_id[3:6]}/{upload_id}'
)
1 change: 1 addition & 0 deletions dandiapi/api/services/asset/__init__.py
@@ -173,6 +173,7 @@ def add_asset_to_version(
# embargoed blob results in that blob being unembargoed.
# NOTE: This only applies to asset blobs, as zarrs cannot belong to
# multiple dandisets at once.
# TODO: unsure if this logic would need to change? are there unintended side-effects?
if (
asset_blob is not None
and asset_blob.embargoed
1 change: 1 addition & 0 deletions dandiapi/api/services/embargo/__init__.py
@@ -33,6 +33,7 @@
@transaction.atomic()
def unembargo_dandiset(ds: Dandiset, user: User):
"""Unembargo a dandiset by copying all embargoed asset blobs to the public bucket."""
# TODO: Move embargoed dandiset to public bucket
logger.info('Unembargoing Dandiset %s', ds.identifier)
logger.info('\t%s assets', ds.draft_version.assets.count())

2 changes: 2 additions & 0 deletions dandiapi/api/services/embargo/utils.py
@@ -53,6 +53,7 @@ def newfn(*args, **kwargs):

@retry(times=3, exceptions=(Exception,))
def _delete_object_tags(client: S3Client, blob: str):
# TODO: determine which bucket name to pass in
client.delete_object_tagging(
Bucket=settings.DANDI_DANDISETS_BUCKET_NAME,
Key=blob,
@@ -62,6 +63,7 @@ def _delete_object_tags(client: S3Client, blob: str):
@retry(times=3, exceptions=(Exception,))
def _delete_zarr_object_tags(client: S3Client, zarr: str):
paginator = client.get_paginator('list_objects_v2')
# TODO: determine which bucket name to pass in
pages = paginator.paginate(
Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Prefix=zarr_s3_path(zarr_id=zarr)
)
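
One way the two bucket TODOs above could be resolved, sketched as an assumption rather than a decided design: thread the bucket through as a parameter instead of reading DANDI_DANDISETS_BUCKET_NAME at each call site.

@retry(times=3, exceptions=(Exception,))
def _delete_object_tags(client: S3Client, blob: str, bucket: str) -> None:
    # The caller decides whether `bucket` is the open or the private bucket name.
    client.delete_object_tagging(Bucket=bucket, Key=blob)
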
8 changes: 8 additions & 0 deletions dandiapi/api/storage.py
@@ -374,3 +374,11 @@ def get_storage() -> Storage:

def get_storage_prefix(instance: Any, filename: str) -> str:
return f'{settings.DANDI_DANDISETS_BUCKET_PREFIX}{filename}'


def get_private_storage() -> Storage:
return create_s3_storage(settings.DANDI_DANDISETS_PRIVATE_BUCKET_NAME)


def get_private_storage_prefix(instance: Any, filename: str) -> str:
return f'{settings.DANDI_DANDISETS_PRIVATE_BUCKET_PREFIX}{filename}'
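
The TODOs in asset.py and upload.py above ("storage and upload_to will be dependent on bucket") suggest blobs will eventually need the same kind of resolution. A possible direction, sketched as an assumption rather than a decided design:

from dandiapi.api.storage import (
    get_private_storage,
    get_private_storage_prefix,
    get_storage,
    get_storage_prefix,
)


def storage_for(*, private: bool):
    """Pick the private or open bucket storage for a blob."""
    return get_private_storage() if private else get_storage()


def storage_prefix_for(instance, filename: str, *, private: bool) -> str:
    """Pick the key prefix matching the chosen bucket."""
    prefix_fn = get_private_storage_prefix if private else get_storage_prefix
    return prefix_fn(instance, filename)
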
10 changes: 7 additions & 3 deletions dandiapi/api/tasks/scheduled.py
@@ -136,7 +136,7 @@ def garbage_collection() -> None:
def register_scheduled_tasks(sender: Celery, **kwargs):
"""Register tasks with a celery beat schedule."""
logger.info(
'Registering scheduled tasks for %s. ' 'DANDI_VALIDATION_JOB_INTERVAL is %s seconds.',
'Registering scheduled tasks for %s. DANDI_VALIDATION_JOB_INTERVAL is %s seconds.',
sender,
settings.DANDI_VALIDATION_JOB_INTERVAL,
)
@@ -157,8 +157,12 @@ def register_scheduled_tasks(sender: Celery, **kwargs):
# Refresh the materialized view used by asset search every 10 mins.
sender.add_periodic_task(timedelta(minutes=10), refresh_materialized_view_search.s())

# Process new S3 logs every hour
sender.add_periodic_task(timedelta(hours=1), collect_s3_log_records_task.s())
# Process new S3 logs every hour, from each bucket
for log_bucket in [
settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
settings.DANDI_DANDISETS_PRIVATE_LOG_BUCKET_NAME,
]:
sender.add_periodic_task(timedelta(hours=1), collect_s3_log_records_task.s(log_bucket))

# Run garbage collection once a day
# TODO: enable this once we're ready to run garbage collection automatically
2 changes: 2 additions & 0 deletions dandiapi/api/tests/test_audit.py
@@ -380,6 +380,7 @@ def test_audit_finalize_zarr(
boto = get_boto_client()
zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
for path in paths:
# TODO: test both buckets?
boto.put_object(
Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
)
@@ -421,6 +422,7 @@ def test_audit_delete_zarr_chunks(
boto = get_boto_client()
zarr_archive = ZarrArchive.objects.get(zarr_id=zarr.zarr_id)
for path in paths:
# TODO: test both buckets?
boto.put_object(
Bucket=settings.DANDI_DANDISETS_BUCKET_NAME, Key=zarr_archive.s3_path(path), Body=b'a'
)