diff --git a/brainscore_language/benchmarks/pereira2018/benchmark.py b/brainscore_language/benchmarks/pereira2018/benchmark.py index ac9d73ff..d8fdf8e9 100644 --- a/brainscore_language/benchmarks/pereira2018/benchmark.py +++ b/brainscore_language/benchmarks/pereira2018/benchmark.py @@ -12,13 +12,10 @@ def Pereira2018_243sentences(): return _Pereira2018ExperimentLinear(experiment='243sentences', ceiling_s3_kwargs=dict( - version_id='CHl_9aFHIWVnPW_njePfy28yzggKuUPw', sha1='5e23de899883828f9c886aec304bc5aa0f58f66c', raw_kwargs=dict( - version_id='uZye03ENmn.vKB5mARUGhcIY_DjShtPD', sha1='525a6ac8c14ad826c63fdd71faeefb8ba542d5ac', raw_kwargs=dict( - version_id='XVTo58Po5YrNjTuDIWrmfHI0nbN2MVZa', sha1='34ba453dc7e8a19aed18cc9bca160e97b4a80be5' ) ) @@ -27,13 +24,10 @@ def Pereira2018_243sentences(): def Pereira2018_384sentences(): return _Pereira2018ExperimentLinear(experiment='384sentences', ceiling_s3_kwargs=dict( - version_id='sjlnXr5wXUoGv6exoWu06C4kYI0KpZLk', sha1='fc895adc52fd79cea3040961d65d8f736a9d3e29', raw_kwargs=dict( - version_id='Hi74r9UKfpK0h0Bjf5DL.JgflGoaknrA', sha1='ce2044a7713426870a44131a99bfc63d8843dae0', raw_kwargs=dict( - version_id='m4dq_ouKWZkYtdyNPMSP0p6rqb7wcYpi', sha1='fe9fb24b34fd5602e18e34006ac5ccc7d4c825b8' ) ) @@ -76,8 +70,8 @@ def _load_data(self, experiment: str) -> NeuroidAssembly: data.attrs['identifier'] = f"{data.identifier}.{experiment}" return data - def _load_ceiling(self, identifier: str, version_id: str, sha1: str, assembly_prefix="ceiling_", raw_kwargs=None): - ceiling = load_from_s3(identifier, cls=Score, assembly_prefix=assembly_prefix, version_id=version_id, sha1=sha1) + def _load_ceiling(self, identifier: str, sha1: str, version_id: str = None, assembly_prefix="ceiling_", raw_kwargs=None): + ceiling = load_from_s3(identifier, sha1=sha1, cls=Score, assembly_prefix=assembly_prefix, version_id=version_id) if raw_kwargs: # recursively load raw attributes raw = self._load_ceiling(identifier=identifier, 
assembly_prefix=assembly_prefix + "raw_", **raw_kwargs) ceiling.attrs['raw'] = raw diff --git a/brainscore_language/data/blank2014/__init__.py b/brainscore_language/data/blank2014/__init__.py index efb80e73..da70e5b7 100644 --- a/brainscore_language/data/blank2014/__init__.py +++ b/brainscore_language/data/blank2014/__init__.py @@ -18,5 +18,4 @@ data_registry['Blank2014.fROI'] = lambda: load_from_s3( identifier="Blank2014.fROI", - version_id="qM.uLV8ltOHM297r2SaGteYMX4Vy.oHB", sha1="af1e868821b897cb1684e4c8dcd33977121ef552") diff --git a/brainscore_language/data/blank2014/data_packaging.py b/brainscore_language/data/blank2014/data_packaging.py index c04ceff5..79ce6aca 100644 --- a/brainscore_language/data/blank2014/data_packaging.py +++ b/brainscore_language/data/blank2014/data_packaging.py @@ -28,8 +28,7 @@ def upload_blank2014(): assembly = load_blank2014() upload_data_assembly(assembly, - assembly_identifier="Blank2014.fROI", - bucket_name="brainscore-language") + assembly_identifier="Blank2014.fROI") # This file requires nltk_contrib to be installed to run. 
nltk_contrib is not part of requirements.txt because this diff --git a/brainscore_language/data/fedorenko2016/__init__.py b/brainscore_language/data/fedorenko2016/__init__.py index d9d4880a..00de8ffa 100644 --- a/brainscore_language/data/fedorenko2016/__init__.py +++ b/brainscore_language/data/fedorenko2016/__init__.py @@ -19,5 +19,4 @@ data_registry['Fedorenko2016.language'] = lambda: load_from_s3( identifier="Fedorenko2016.language", - version_id="qvB7YZfEjbXEE64bODNLlQlZKWGpgPhy", sha1="2966b6d78e972a72068aa6907377483f427e8d9a") diff --git a/brainscore_language/data/fedorenko2016/data_packaging.py b/brainscore_language/data/fedorenko2016/data_packaging.py index 7f8f27a2..11b1dbc4 100644 --- a/brainscore_language/data/fedorenko2016/data_packaging.py +++ b/brainscore_language/data/fedorenko2016/data_packaging.py @@ -21,8 +21,7 @@ def upload_fedorenko2016(): assembly = load_fedorenko2016() upload_data_assembly(assembly, - assembly_identifier="Fedorenko2016.language", - bucket_name="brainscore-language") + assembly_identifier="Fedorenko2016.language") # adapted from diff --git a/brainscore_language/data/futrell2018/__init__.py b/brainscore_language/data/futrell2018/__init__.py index c68ebf6a..4f60dcaa 100644 --- a/brainscore_language/data/futrell2018/__init__.py +++ b/brainscore_language/data/futrell2018/__init__.py @@ -18,7 +18,6 @@ def load_assembly(): assembly = load_from_s3( identifier="Futrell2018", - version_id="MpR.gIXN8UrUnqwQyj.kCrh4VWrBvsGf", sha1="381ccc8038fbdb31235b5f3e1d350f359b5e287f") assembly.attrs['bibtex'] = BIBTEX return assembly diff --git a/brainscore_language/data/pereira2018/__init__.py b/brainscore_language/data/pereira2018/__init__.py index 47ea0f09..0a1b7cb3 100644 --- a/brainscore_language/data/pereira2018/__init__.py +++ b/brainscore_language/data/pereira2018/__init__.py @@ -19,9 +19,7 @@ data_registry['Pereira2018.language'] = lambda: load_from_s3( identifier="Pereira2018.language", - version_id="fq0gh.P7ThLu6DWUulho5W_F.YTEhDqJ", 
sha1="f8434b4022f5b2c862f0ff2854d5b3f5f2a7fb96") data_registry['Pereira2018.auditory'] = lambda: load_from_s3( identifier="Pereira2018.auditory", - version_id=".lCMuSrGBlsEgLtZDOApLlr3h2szCmoC", sha1="08e576bd3b8caf64850bb879abf07ae228ff1f5f") diff --git a/brainscore_language/data/tuckute2024/__init__.py b/brainscore_language/data/tuckute2024/__init__.py index 9b4a5d2c..b2bf878c 100644 --- a/brainscore_language/data/tuckute2024/__init__.py +++ b/brainscore_language/data/tuckute2024/__init__.py @@ -14,5 +14,4 @@ data_registry["Tuckute2024.language"] = lambda: load_from_s3( identifier="Tuckute2024.language", - version_id="BB.DbwqLB4OhDR64duqojNdL0CRd4RmG", sha1="5c8fc7f3e24cc1af5f5296459377b638b6492641") \ No newline at end of file diff --git a/brainscore_language/data/tuckute2024/data_packaging.py b/brainscore_language/data/tuckute2024/data_packaging.py index e90bce46..febb9438 100644 --- a/brainscore_language/data/tuckute2024/data_packaging.py +++ b/brainscore_language/data/tuckute2024/data_packaging.py @@ -4,16 +4,14 @@ import numpy as np import pandas as pd -from brainio.assemblies import NeuroidAssembly +from brainscore_core.supported_data_standards.brainio.assemblies import NeuroidAssembly from pathlib import Path from brainscore_language.utils.s3 import upload_data_assembly def upload_tuckute2024(): assembly = load_tuckute2024_5subj(source=os.path.join(Path(__file__).parent, 'brain-lang-data_participant_20230728.csv'), roi="lang_LH_netw") upload_data_assembly(assembly, - assembly_identifier=f"Tuckute2024.language", - bucket_name="brainscore-language" - ) + assembly_identifier="Tuckute2024.language") def groupby_coord(df: pd.DataFrame, coord_col: str = 'item_id', diff --git a/brainscore_language/data/tuckute2024/test.py b/brainscore_language/data/tuckute2024/test.py index ba846a6f..e501e64f 100644 --- a/brainscore_language/data/tuckute2024/test.py +++ b/brainscore_language/data/tuckute2024/test.py @@ -1,15 +1,11 @@ import numpy as np from brainscore_language 
import load_dataset -from brainscore_language.data.tuckute2024.data_packaging import load_tuckute2024_5subj + class TestData: - def test_language(self, - load_from_cache: bool = False): - if load_from_cache: - assembly = load_dataset('tuckute2024_5subj_lang_LH_netw') - else: - assembly = load_tuckute2024_5subj() + def test_language(self): + assembly = load_dataset('Tuckute2024.language') assert assembly.dims == ('presentation', 'neuroid') assert assembly.shape == (1000, 1) @@ -19,6 +15,3 @@ def test_language(self, assert np.unique(assembly.stimulus_id.values).shape[0] == 1000 assert assembly.neuroid_id.values == [1] assert 1.28 < np.max(assembly.data) < 1.29 - - - diff --git a/brainscore_language/utils/s3.py b/brainscore_language/utils/s3.py index ec08e98a..d3f7035b 100644 --- a/brainscore_language/utils/s3.py +++ b/brainscore_language/utils/s3.py @@ -8,8 +8,15 @@ _logger = logging.getLogger(__name__) +# S3 bucket configuration, following the same pattern as brainscore_vision. +# The bucket path "brainscore-storage/brainscore-language" means: +# - actual S3 bucket: brainscore-storage +# - key prefix: brainscore-language/ +_BUCKET = "brainscore-storage" +_FOLDER = "brainscore-language" -def upload_data_assembly(assembly, assembly_identifier, bucket_name="brainscore-language", assembly_prefix="assy_"): + +def upload_data_assembly(assembly, assembly_identifier, bucket_name=_BUCKET, assembly_prefix="assy_"): # adapted from # https://github.com/mschrimpf/brainio/blob/8a40a3558d0b86072b9e221808f19005c7cb8c17/brainio/packaging.py#L217 @@ -19,21 +26,22 @@ def upload_data_assembly(assembly, assembly_identifier, bucket_name="brainscore- assembly_store_identifier = assembly_prefix + assembly_identifier.replace(".", "_") netcdf_file_name = assembly_store_identifier + ".nc" target_netcdf_path = Path(fetch.get_local_data_path()) / assembly_store_identifier / netcdf_file_name - s3_key = netcdf_file_name + s3_key = f"{_FOLDER}/{netcdf_file_name}" # write to disk and upload 
netcdf_kf_sha1 = write_netcdf(assembly, target_netcdf_path) response = upload_to_s3(target_netcdf_path, bucket_name, s3_key) _logger.debug(f"Uploaded {assembly_store_identifier} to S3 " - f"with key={s3_key}, sha1={netcdf_kf_sha1}, version_id={response['VersionId']}: {response}") + f"with key={s3_key}, sha1={netcdf_kf_sha1}, version_id={response.get('VersionId')}: {response}") response['sha1'] = netcdf_kf_sha1 return response -def load_from_s3(identifier, version_id, sha1, assembly_prefix="assy_", cls=NeuroidAssembly) -> DataAssembly: +def load_from_s3(identifier, sha1, version_id=None, assembly_prefix="assy_", cls=NeuroidAssembly) -> DataAssembly: filename = f"{assembly_prefix}{identifier.replace('.', '_')}.nc" + remote_path = f"{_FOLDER}/{filename}" file_path = fetch_file(location_type="S3", - location=f"https://brainscore-language.s3.amazonaws.com/{filename}", + location=f"https://{_BUCKET}.s3.amazonaws.com/{remote_path}", version_id=version_id, sha1=sha1) loader = AssemblyLoader(cls=cls, file_path=file_path)