diff --git a/.travis.yml b/.travis.yml
index 41ed926..ade14e8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,8 @@
 language: python
-python:
-- 3.6
+env:
+  global:
+  - secure: askFJXE0Ipfz2kacrnR2Jb5FSj1QSKlGBas0bMayjryOOoGbX7B4qUfo4yus4t2W5LOmWrWejkHd7dR/z8Umr7Q3V4pLYNOxVNR7+hqwtlDhsCoxMfxfU4RPmn9CQ9ZSNAbLLuaKYqYaUPsVBY5gdcIfe5edwQuZqnwPd1zo+IlCjAWvBARPKKu5zGxMKIN6PimicHz3IeDGWFDGvAMlRk2kZjSmL72hSmnOE4ysb2gtA/CRGFnbW9Ao09TTeJtZJ825p1R1MSB0o5ktQoXHln3XcbaEQOFIoWugh1RMpbJxtAozCkfBAcVyRsTH+VUM2lzdOy6xFIHi7+/uaT9F2kfPcffCpC/1B7GJmgVrdDuW9O+7CgoAr7MbH08moP/tTfcywxYWbkCLqXNFZpxeU1KTuqW0aG62+tbe5GCEi0AnmlJ3f0p1We/Ef8fPA+TYTPR0gth9FnnD10KNcaOKx7QXLEjczSkCNiQ5IlmPC7Gou/kwGf/GD5oZCz2BqgALdTs+PqLyAnV5QeduXl3amgX2ATJTDFskytF8chN1bldo9fGKC1WUOU7ICrWJK7bwywBZfftHbONGsgkhp7UvtyHcOH8Rrj62SjCtCOnDRs2VF5YDvS28w52E+bWRP1UHpRA9gCf+tyNCPBA74CBj9aGE+A8+0++pU41ny4J7a3M=
+  - secure: HU8j4oFWR+FEfjz7syhUH8NYF0v6+Qbm+EtCPyyPpqk7qEWSSOezve4FDFQpAHjzOXA03bqLLJJ7sRU8845fda8gDesZvGJ8j0YgqGz/nudTeKXT+6GKZEuk589xAV01lcWacocPNiteFWWqwdBnDer2kwC6xNILNRJa+aOvDC7re4OVrOy+ZuICazb/EA9SIgaCSqYWqZIVcfaIuSmL3JFjdNQXRGYEqIa0yAN37u63JQsWplxZ0lxbBjqEAVqSE6/6S305Mlhv0XFyPL3bRGwGnm1gv9cmmbgpl3weHIxRa5uCaGK9++TUQqo+k+mSTjR912jUHdqdFAKJoiHfXsJ/SG0aFxe1jMTffHORdXYVeW4icRbWFcA5eD7EaFHiwuUBx/JuCQmGCEBymTTdh3NGCvXj/qFIUH6magWID3K3QYuTlouiEsj/YPTZY210YrDXAnfV/0NmwB6YGv5jMXEeFvJz5mxUiGrsut3+rk26N4dHObPZLJ6bBb2r+JknzmjN0Oigd7ly8/HC+47zHqEeg17cBc4UNS5VeQ1dC0CK+dvNcTcZryhGgFwB3YRuORsNb6lx/y38PPH8aH0KcKC9wsMwvEHqptAhjM7HZ00d1Bfc4Tn3qR/uPuN41H3dQfAeYvODym7j/1ICQqU0nxYso3vx0awLJQ3ybg6aTH0=
 matrix:
   include:
   - python: 3.7
@@ -9,8 +11,4 @@ matrix:
 install:
 - pip install -e .
 script:
-- pytest
-env:
-  global:
-  - secure: askFJXE0Ipfz2kacrnR2Jb5FSj1QSKlGBas0bMayjryOOoGbX7B4qUfo4yus4t2W5LOmWrWejkHd7dR/z8Umr7Q3V4pLYNOxVNR7+hqwtlDhsCoxMfxfU4RPmn9CQ9ZSNAbLLuaKYqYaUPsVBY5gdcIfe5edwQuZqnwPd1zo+IlCjAWvBARPKKu5zGxMKIN6PimicHz3IeDGWFDGvAMlRk2kZjSmL72hSmnOE4ysb2gtA/CRGFnbW9Ao09TTeJtZJ825p1R1MSB0o5ktQoXHln3XcbaEQOFIoWugh1RMpbJxtAozCkfBAcVyRsTH+VUM2lzdOy6xFIHi7+/uaT9F2kfPcffCpC/1B7GJmgVrdDuW9O+7CgoAr7MbH08moP/tTfcywxYWbkCLqXNFZpxeU1KTuqW0aG62+tbe5GCEi0AnmlJ3f0p1We/Ef8fPA+TYTPR0gth9FnnD10KNcaOKx7QXLEjczSkCNiQ5IlmPC7Gou/kwGf/GD5oZCz2BqgALdTs+PqLyAnV5QeduXl3amgX2ATJTDFskytF8chN1bldo9fGKC1WUOU7ICrWJK7bwywBZfftHbONGsgkhp7UvtyHcOH8Rrj62SjCtCOnDRs2VF5YDvS28w52E+bWRP1UHpRA9gCf+tyNCPBA74CBj9aGE+A8+0++pU41ny4J7a3M=
-  - secure: HU8j4oFWR+FEfjz7syhUH8NYF0v6+Qbm+EtCPyyPpqk7qEWSSOezve4FDFQpAHjzOXA03bqLLJJ7sRU8845fda8gDesZvGJ8j0YgqGz/nudTeKXT+6GKZEuk589xAV01lcWacocPNiteFWWqwdBnDer2kwC6xNILNRJa+aOvDC7re4OVrOy+ZuICazb/EA9SIgaCSqYWqZIVcfaIuSmL3JFjdNQXRGYEqIa0yAN37u63JQsWplxZ0lxbBjqEAVqSE6/6S305Mlhv0XFyPL3bRGwGnm1gv9cmmbgpl3weHIxRa5uCaGK9++TUQqo+k+mSTjR912jUHdqdFAKJoiHfXsJ/SG0aFxe1jMTffHORdXYVeW4icRbWFcA5eD7EaFHiwuUBx/JuCQmGCEBymTTdh3NGCvXj/qFIUH6magWID3K3QYuTlouiEsj/YPTZY210YrDXAnfV/0NmwB6YGv5jMXEeFvJz5mxUiGrsut3+rk26N4dHObPZLJ6bBb2r+JknzmjN0Oigd7ly8/HC+47zHqEeg17cBc4UNS5VeQ1dC0CK+dvNcTcZryhGgFwB3YRuORsNb6lx/y38PPH8aH0KcKC9wsMwvEHqptAhjM7HZ00d1Bfc4Tn3qR/uPuN41H3dQfAeYvODym7j/1ICQqU0nxYso3vx0awLJQ3ybg6aTH0=
+- if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then pytest; else pytest -m "not private_access"; fi
diff --git a/brainio_contrib/packaging.py b/brainio_contrib/packaging.py
index 0b7f894..b56b5e9 100644
--- a/brainio_contrib/packaging.py
+++ b/brainio_contrib/packaging.py
@@ -72,7 +72,7 @@ def add_stimulus_set_metadata_and_lookup_to_db(proto_stimulus_set, stimulus_set_
     return stim_set_model
 
 
-def package_stimulus_set(proto_stimulus_set, stimulus_set_name, bucket_name="brainio-dicarlo"):
+def package_stimulus_set(proto_stimulus_set, stimulus_set_name, bucket_name="brainio-contrib"):
     """
     Package a set of images along with their metadata for the BrainIO system.
     :param proto_stimulus_set: A pandas DataFrame containing one row for each image, and the columns ['image_current_local_file_path', 'image_id', 'image_path_within_store'] and columns for all stimulus-set-specific metadata
@@ -115,7 +115,7 @@ def add_data_assembly_lookup_to_db(assembly_name, stim_set_model, bucket_name, n
 
 
 def package_data_assembly(proto_data_assembly, data_assembly_name, stimulus_set_name,
-                          assembly_class="NeuronRecordingAssembly", bucket_name="brainio-dicarlo"):
+                          assembly_class="NeuronRecordingAssembly", bucket_name="brainio-contrib"):
     """
     Package a set of data along with its metadata for the BrainIO system.
     :param proto_data_assembly: An xarray DataArray containing experimental measurements and all related metadata.
diff --git a/environment.yml b/environment.yml
index 80c5136..0c1c52c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -28,3 +28,4 @@ dependencies:
 - pip:
   - dataset
   - ipyvolume
+  - wget
diff --git a/mkgu_packaging/cifar/__init__.py b/mkgu_packaging/cifar/__init__.py
new file mode 100644
index 0000000..40567c3
--- /dev/null
+++ b/mkgu_packaging/cifar/__init__.py
@@ -0,0 +1,28 @@
+import os
+import tarfile
+
+import wget
+
+
+def download_data(out_dir, remote_url, local_untarred_dir=None):
+    filepath = os.path.join(out_dir, os.path.basename(remote_url))
+    if not os.path.isfile(filepath):
+        wget.download(remote_url, out=filepath)
+    if local_untarred_dir and not os.path.isdir(os.path.join(out_dir, local_untarred_dir)):
+        untar(filepath, out_dir=out_dir)
+
+
+def untar(filepath, out_dir='.'):
+    tar = tarfile.open(filepath, "r:gz")
+    tar.extractall(path=out_dir)
+
+
+def main():
+    from mkgu_packaging.cifar.cifar10 import main as main10
+    from mkgu_packaging.cifar.cifar100 import main as main100
+    main10()
+    main100()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mkgu_packaging/cifar/cifar10.py b/mkgu_packaging/cifar/cifar10.py
new file mode 100644
index 0000000..4e70aeb
--- /dev/null
+++ b/mkgu_packaging/cifar/cifar10.py
@@ -0,0 +1,82 @@
+import os
+import pickle
+
+import numpy as np
+import pandas as pd
+from PIL import Image
+from result_caching import store
+from tqdm import tqdm
+
+from brainio_base.stimuli import StimulusSet
+from brainio_collection.knownfile import KnownFile as kf
+from brainio_contrib.packaging import package_stimulus_set
+from mkgu_packaging.cifar import download_data
+
+
+def collect_stimuli_batch(pickle_file, stimuli_type, label_names, batch=1, write_image=True):
+    with open(pickle_file, 'rb') as f:
+        data = pickle.load(f, encoding='bytes')
+    images = np.asarray(data[b'data'].T).astype('uint8')
+    labels = data[b'labels']
+    label_names = [label_names[label_index] for label_index in labels]
+    stimuli = []
+    stimuli_dir = os.path.join(os.path.dirname(pickle_file), stimuli_type)
+    os.makedirs(stimuli_dir, exist_ok=True)
+    for i in tqdm(range(images.shape[1]), desc=f'{stimuli_type}: {batch}'):
+        image_file_name = f'{stimuli_type}-{(batch - 1) * 10000 + i}.png'
+        image_file_path = os.path.join(stimuli_dir, image_file_name)
+        if write_image:
+            image = Image.fromarray(images[:, i].reshape(3, 32, 32).transpose([1, 2, 0]))
+            image.save(image_file_path)
+        im_kf = kf(image_file_path)
+        stimuli.append({
+            'image_id': im_kf.sha1,
+            'image_file_name': image_file_name,
+            'image_path_within_store': image_file_name,
+            'type': stimuli_type,
+            'batch': batch,
+            'image_file_sha1': im_kf.sha1,
+            'image_current_local_file_path': image_file_path,
+            'label': labels[i],
+            'label_name': label_names[i],
+        })
+    stimuli = pd.DataFrame(stimuli)
+    assert len(stimuli) == 10000
+    assert len(np.unique(stimuli["image_id"])) == len(stimuli)
+    return stimuli
+
+
+@store(identifier_ignore=['write_image'])
+def collect_stimuli(stimuli_directory, write_image=True):
+    batches_path = os.path.join(stimuli_directory, 'cifar-10-batches-py')
+    with open(os.path.join(batches_path, 'batches.meta'), 'rb') as f:
+        label_names = pickle.load(f)['label_names']
+    test = collect_stimuli_batch(os.path.join(batches_path, 'test_batch'), stimuli_type='test',
+                                 label_names=label_names, write_image=write_image)
+    train = None
+    for batch in [1, 2, 3, 4, 5]:
+        train_batch = collect_stimuli_batch(os.path.join(batches_path, f'data_batch_{batch}'), stimuli_type='train',
+                                            batch=batch, label_names=label_names,
+                                            write_image=write_image)
+        train = pd.concat([train, train_batch]) if train is not None else train_batch
+    stimuli = pd.concat([train, test])
+    assert len(stimuli) == 60000
+    assert len(stimuli[stimuli['type'] == 'test']) == 10000
+    assert len(stimuli[stimuli['type'] == 'train']) == 50000
+    return stimuli
+
+
+def main():
+    stimuli_dir = os.path.dirname(__file__)
+    download_data(out_dir=stimuli_dir, remote_url='https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz',
+                  local_untarred_dir='cifar-10-batches-py')
+    stimuli = collect_stimuli(stimuli_dir, write_image=True)
+    stimuli = StimulusSet(stimuli)
+    stimuli.name = "cifar10"
+
+    print("packaging")
+    package_stimulus_set(stimuli, stimulus_set_name=stimuli.name, bucket_name="brainio-contrib")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/mkgu_packaging/cifar/cifar100.py b/mkgu_packaging/cifar/cifar100.py
new file mode 100644
index 0000000..ab1d2fa
--- /dev/null
+++ b/mkgu_packaging/cifar/cifar100.py
@@ -0,0 +1,84 @@
+import os
+import pickle
+
+import numpy as np
+import pandas as pd
+from PIL import Image
+from result_caching import store
+from tqdm import tqdm
+
+from brainio_base.stimuli import StimulusSet
+from brainio_collection.knownfile import KnownFile as kf
+from brainio_contrib.packaging import package_stimulus_set
+from mkgu_packaging.cifar import download_data
+
+
+def collect_stimuli_batch(pickle_file, stimuli_type, coarse_label_names, fine_label_names, write_image=True):
+    with open(pickle_file, 'rb') as f:
+        data = pickle.load(f, encoding='bytes')
+    images = np.asarray(data[b'data'].T).astype('uint8')
+    coarse_labels, fine_labels = data[b'coarse_labels'], data[b'fine_labels']
+    coarse_label_names = [coarse_label_names[label_index] for label_index in coarse_labels]
+    fine_label_names = [fine_label_names[label_index] for label_index in fine_labels]
+    stimuli = []
+    stimuli_dir = os.path.join(os.path.dirname(pickle_file), stimuli_type + '-images')
+    os.makedirs(stimuli_dir, exist_ok=True)
+    for i in tqdm(range(images.shape[1]), desc=stimuli_type):
+        image_file_name = f'{stimuli_type}-{i}.png'
+        image_file_path = os.path.join(stimuli_dir, image_file_name)
+        if write_image:
+            image = Image.fromarray(images[:, i].reshape(3, 32, 32).transpose([1, 2, 0]))
+            image.save(image_file_path)
+        im_kf = kf(image_file_path)
+        stimuli.append({
+            'image_id': im_kf.sha1,
+            'image_file_name': image_file_name,
+            'image_path_within_store': image_file_name,
+            'type': stimuli_type,
+            'image_file_sha1': im_kf.sha1,
+            'image_current_local_file_path': image_file_path,
+            'coarse_label': coarse_labels[i],
+            'fine_label': fine_labels[i],
+            'coarse_label_name': coarse_label_names[i],
+            'fine_label_name': fine_label_names[i],
+        })
+    stimuli = pd.DataFrame(stimuli)
+    assert len(stimuli) == (10000 if stimuli_type == 'test' else 50000)
+    # no test for len(np.unique(image_id)) == len(stimuli). there are duplicates.
+    return stimuli
+
+
+@store(identifier_ignore=['write_image'])
+def collect_stimuli(stimuli_directory, write_image=True):
+    data_path = os.path.join(stimuli_directory, 'cifar-100-python')
+    with open(os.path.join(data_path, 'meta'), 'rb') as f:
+        meta = pickle.load(f)
+    coarse_label_names = meta['coarse_label_names']
+    fine_label_names = meta['fine_label_names']
+    test = collect_stimuli_batch(os.path.join(data_path, 'test'), stimuli_type='test',
+                                 coarse_label_names=coarse_label_names, fine_label_names=fine_label_names,
+                                 write_image=write_image)
+    train = collect_stimuli_batch(os.path.join(data_path, 'train'), stimuli_type='train',
+                                  coarse_label_names=coarse_label_names, fine_label_names=fine_label_names,
+                                  write_image=write_image)
+    stimuli = pd.concat([train, test])
+    assert len(stimuli) == 60000
+    assert len(stimuli[stimuli['type'] == 'test']) == 10000
+    assert len(stimuli[stimuli['type'] == 'train']) == 50000
+    return stimuli
+
+
+def main():
+    stimuli_dir = os.path.dirname(__file__)
+    download_data(out_dir=stimuli_dir, remote_url='https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz',
+                  local_untarred_dir='cifar-100-python')
+    stimuli = collect_stimuli(stimuli_dir, write_image=True)
+    stimuli = StimulusSet(stimuli)
+    stimuli.name = "cifar100"
+
+    print("packaging")
+    package_stimulus_set(stimuli, stimulus_set_name=stimuli.name, bucket_name="brainio-contrib")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/setup.py b/setup.py
index 9024a5e..6eec250 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,7 @@
 requirements = [
     "brainio_base @ git+https://github.com/brain-score/brainio_base",
     "brainio_collection @ git+https://github.com/brain-score/brainio_collection",
+    "wget",
     # test_requirements
     "pytest",
 ]
diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 0000000..34e6595
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,15 @@
+# Unit Tests
+## Markers
+Unit tests have various markers that denote possible issues in the travis build:
+
+* **private_access**: tests that require access to a private resource, such as assemblies on S3 (travis pull request builds cannot have private access)
+
+Use the following syntax to mark a test:
+```
+@pytest.mark.private_access
+def test_something(...):
+    assert False
+```
+
+To skip a specific marker, run e.g. `pytest -m "not private_access"`.
+To skip multiple markers, run e.g. `pytest -m "not private_access and not memory_intense"`.
diff --git a/tests/test_packaging.py b/tests/test_packaging.py
index ccf3588..19fd634 100644
--- a/tests/test_packaging.py
+++ b/tests/test_packaging.py
@@ -88,6 +88,7 @@ def test_add_stimulus_set_metadata_and_lookup_to_db(transaction):
     assert len(pw_query) == 25
 
 
+@pytest.mark.private_access
 def test_package_stimulus_set(transaction):
     proto = prep_proto_stim()
     stim_set_name = "dicarlo.test." + now()