Commit 2b3a10e: Initial commit (0 parents)

22 files changed: +2011 -0 lines

.travis.yml

Lines changed: 23 additions & 0 deletions
```yaml
language: python
matrix:
  include:
    - python: 2.7
    - python: 3.6
  allow_failures:
    - python: 3.6
install:
  - pip install --upgrade pip
  - pip install -r requirements.txt
  - pip install flake8
before_script:
  # stop the build if there are Python syntax errors or undefined names
  - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics
  # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
  - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
script: true # pytest
notifications:
  email: false
  slack:
    secure: kDWVy90sDY+o3g0/ZTGX2D+PTbzhtd74Whe1AJHhcUDobTUzkch8GtY9eZxybZk4nga9lQxL6YeJ72SfBBEPaLzXcUMe0YcNaBydkQHcipKZn+Vcb8kf2FiZC6YwsUYfTvvH9MPLbkZOZvsNyd0h85z+hYMB8jHsq6Yn5gf79BA=
    on_failure: always
    on_success: change
```

LICENSE

Lines changed: 39 additions & 0 deletions
```text
Distributed-CellProfiler is distributed under the following BSD-style license:

Copyright © 2020 Broad Institute, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

3. Neither the name of the Broad Institute, Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED “AS IS.” BROAD MAKES NO EXPRESS OR IMPLIED
REPRESENTATIONS OR WARRANTIES OF ANY KIND REGARDING THE SOFTWARE AND
COPYRIGHT, INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, CONFORMITY WITH ANY
DOCUMENTATION, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER
DEFECTS, WHETHER OR NOT DISCOVERABLE. IN NO EVENT SHALL BROAD, THE
COPYRIGHT HOLDERS, OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF, HAVE REASON TO KNOW, OR IN
FACT SHALL KNOW OF THE POSSIBILITY OF SUCH DAMAGE.

If, by operation of law or otherwise, any of the aforementioned
warranty disclaimers are determined inapplicable, your sole remedy,
regardless of the form of action, including, but not limited to,
negligence and strict liability, shall be replacement of the software
with an updated version if one exists.
```

README.md

Lines changed: 67 additions & 0 deletions
```markdown
# Distributed-CellProfiler
Run encapsulated docker containers with CellProfiler in the Amazon Web Services infrastructure.

This code is an example of how to use AWS distributed infrastructure for running CellProfiler.
The configuration of the AWS resources is done using fabric. The worker is written in Python
and is encapsulated in a docker container. Four AWS components are needed to run
distributed jobs:

1. An SQS queue
2. An ECS cluster
3. An S3 bucket
4. A spot fleet of EC2 instances

All of them can be managed through the AWS Management Console. However, this code helps you get
started quickly and run a job autonomously if all the configuration is correct. The code includes
a fabric script that links all these components and prepares the infrastructure to run a distributed
job. When the job is completed, the code can also stop resources and clean up components.

## Running the code

### Step 1
Edit the config.py file with all the relevant information for your job. Then create
the basic AWS resources by running the following script:

    $ python run.py setup

This script initializes the resources in AWS. Note that the docker registry is built separately,
and you can modify the worker code to build your own. Any time you modify the worker code, you need
to update the docker registry using the Makefile script inside the worker directory.

### Step 2
After the first script runs successfully, the job can be submitted to AWS using EITHER of the
following commands:

    $ python run.py submitJob files/exampleJob.json

OR

    $ python run_batch_general.py

Running either script uploads the tasks that are configured in the JSON file. This assumes that your
data is stored in S3 and that the JSON file has the paths to find input and output directories. You have to
customize the exampleJob.json file or the run_batch_general file with paths that make sense for your project.
The tasks that compose your job are CP groups, and each one will be run in parallel. You need to define each
task in your input file to guide the parallelization.

### Step 3
After submitting the job to the queue, we can add computing power to process all tasks in AWS. This
code starts a fleet of spot EC2 instances which will run the worker code. The worker code is encapsulated
in docker containers, and the code uses ECS services to inject them into EC2. All of this is automated
with the following command:

    $ python run.py startCluster files/exampleFleet.json

After the cluster is ready, the code informs you that everything is set up, and saves the spot fleet identifier
in a file for further reference.

### Step 4
When the cluster is up and running, you can monitor progress using the following command:

    $ python run.py monitor files/APP_NAMESpotFleetRequestId.json

The file APP_NAMESpotFleetRequestId.json is created after the cluster is set up in step 3. It is
important to keep this monitor running if you want to automatically shut down computing resources
when there are no more tasks in the queue (recommended).

See the wiki for more information about each step of the process.
```
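The submission flow in Step 2 can be sketched in a few lines: each entry under `groups` in the job JSON becomes one queue message that pairs the group's metadata with the shared job settings. This is a minimal illustration of the idea, assuming a job file shaped like files/exampleJob.json; the authoritative submitJob logic lives in run.py.

```python
import json

def build_task_messages(job):
    """Pair each metadata group with the shared job settings.

    Each returned string describes one CellProfiler group, which a
    worker can process independently of the others.
    """
    shared = {k: v for k, v in job.items()
              if k != "groups" and not k.startswith("_comment")}
    return [json.dumps(dict(shared, **group)) for group in job["groups"]]

job = {
    "pipeline": "projects/analysis.cppipe",
    "output": "projects/output/",
    "groups": [
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=1"},
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=2"},
    ],
}
for message in build_task_messages(job):
    print(message)
```

Because every message carries the full job context, any idle worker can pick up any task, which is what makes the queue-based parallelization in Step 3 possible.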

config.py

Lines changed: 44 additions & 0 deletions
```python
# Constants (User configurable)

APP_NAME = 'DistributedCP'  # Used to generate derivative names unique to the application.

# DOCKER REGISTRY INFORMATION:
DOCKERHUB_TAG = 'cellprofiler/distributed-cellprofiler:2.0.0_4.0.6'

# AWS GENERAL SETTINGS:
AWS_REGION = 'us-east-1'
AWS_PROFILE = 'default'  # The same profile used by your AWS CLI installation
SSH_KEY_NAME = 'your-key-file.pem'  # Expected to be in ~/.ssh
AWS_BUCKET = 'your-bucket-name'

# EC2 AND ECS INFORMATION:
ECS_CLUSTER = 'default'
CLUSTER_MACHINES = 3
TASKS_PER_MACHINE = 1
MACHINE_TYPE = ['m4.xlarge']
MACHINE_PRICE = 0.10
EBS_VOL_SIZE = 30  # In GB. Minimum allowed is 22.
DOWNLOAD_FILES = 'False'

# DOCKER INSTANCE RUNNING ENVIRONMENT:
DOCKER_CORES = 1  # Number of CellProfiler processes to run inside a docker container
CPU_SHARES = DOCKER_CORES * 1024  # ECS computing units assigned to each docker container (1024 units = 1 core)
MEMORY = 4096  # Memory assigned to the docker container in MB
SECONDS_TO_START = 0*60  # Wait before the next CP process is initiated to avoid memory collisions

# SQS QUEUE INFORMATION:
SQS_QUEUE_NAME = APP_NAME + 'Queue'
SQS_MESSAGE_VISIBILITY = 1*60  # Timeout (secs) for messages in flight (average time to be processed)
SQS_DEAD_LETTER_QUEUE = 'arn:aws:sqs:some-region:111111100000:DeadMessages'

# LOG GROUP INFORMATION:
LOG_GROUP_NAME = APP_NAME

# REDUNDANCY CHECKS
CHECK_IF_DONE_BOOL = 'True'  # True or False: should it check if there are a certain number of non-empty files and delete the job if yes?
EXPECTED_NUMBER_FILES = 7  # What is the number of files that trigger skipping a job?
MIN_FILE_SIZE_BYTES = 1  # What is the minimal number of bytes an object should be to "count"?
NECESSARY_STRING = ''  # Is there any string that should be in the file name to "count"?

# PLUGINS
USE_PLUGINS = 'True'
```
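Several of these settings are derived from one another, so it can be worth sanity-checking them before running `python run.py setup`. The snippet below is a standalone sketch (not part of the codebase) that restates the constraints documented in the comments above:

```python
# Hypothetical pre-flight check for config.py values, based only on the
# constraints stated in the file's own comments.
APP_NAME = 'DistributedCP'
DOCKER_CORES = 1
CPU_SHARES = DOCKER_CORES * 1024   # ECS allocates 1024 CPU units per core
MEMORY = 4096                      # MB per container
EBS_VOL_SIZE = 30                  # GB

# Derivative names are all built from APP_NAME, which is why it must be
# unique per application: two apps sharing a name would share a queue.
SQS_QUEUE_NAME = APP_NAME + 'Queue'
LOG_GROUP_NAME = APP_NAME

assert EBS_VOL_SIZE >= 22, "config.py notes 22 GB is the minimum allowed"
assert CPU_SHARES % 1024 == 0, "CPU_SHARES should be a whole number of cores"
print(SQS_QUEUE_NAME, LOG_GROUP_NAME, CPU_SHARES)
```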

files/ManualMetadata.py

Lines changed: 29 additions & 0 deletions
```python
'''A script to create a list of all the metadata combinations present in a given CSV.

This is designed to be called from the command line with
$ python ManualMetadata.py pathtocsv/csvfile.csv "['Metadata_Metadata1','Metadata_Metadata2']"
'''
from __future__ import print_function

import ast
import sys

import pandas as pd

csv = sys.argv[1]
metadatalist = ast.literal_eval(sys.argv[2])


def manualmetadata():
    incsv = pd.read_csv(csv)
    manmet = open(csv[:-4] + 'batch.txt', 'w')
    print(incsv.shape)
    done = []
    for i in range(incsv.shape[0]):
        metadatatext = '{"Metadata": "'
        for j in metadatalist:
            metadatatext += j + '=' + str(incsv[j][i]) + ','
        metadatatext = metadatatext[:-1] + '"}, \n'
        if metadatatext not in done:
            manmet.write(metadatatext)
            done.append(metadatatext)
    manmet.close()
    print(str(len(done)), 'batches found')


manualmetadata()
```
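To see concretely what this script emits, its inner formatting loop can be reproduced on a small in-memory table. This is a sketch with made-up rows (the real script reads the table from a CSV with pandas): duplicate metadata combinations collapse into a single batch line.

```python
# Hypothetical input rows standing in for a parsed CSV.
rows = [
    {"Metadata_Plate": "Plate1", "Metadata_Well": "A01"},
    {"Metadata_Plate": "Plate1", "Metadata_Well": "A01"},  # duplicate, skipped
    {"Metadata_Plate": "Plate1", "Metadata_Well": "A02"},
]
metadatalist = ["Metadata_Plate", "Metadata_Well"]

done = []
for row in rows:
    # Same line format the script writes to <csvname>batch.txt.
    text = '{"Metadata": "'
    text += ",".join("%s=%s" % (key, row[key]) for key in metadatalist)
    text += '"}, \n'
    if text not in done:
        done.append(text)

print(len(done), "batches found")
```

Each surviving line is exactly one entry for the `groups` array of a job file, which is how the CSV's metadata combinations become parallel tasks.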

files/batches.sh

Lines changed: 7 additions & 0 deletions
```shell
# Command to generate batches for two plates.
# It generates 384*9 tasks per plate, corresponding to 384 wells with 9 images each.
# An image is the unit of parallelization in this example.
#
# You need to install parallel to run this command.

parallel echo '{\"Metadata\": \"Metadata_Plate={1},Metadata_Well={2}{3},Metadata_Site={4}\"},' ::: Plate1 Plate2 ::: `echo {A..P}` ::: `seq -w 24` ::: `seq -w 9` | sort > batches.txt
```
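If GNU parallel is not available, the same Cartesian product of plates, well rows, well columns, and sites can be generated with a few lines of Python. This sketch mirrors the shell command above (2 plates x 16 rows x 24 zero-padded columns x 9 sites, sorted):

```python
from itertools import product

plates = ["Plate1", "Plate2"]
rows = [chr(c) for c in range(ord("A"), ord("P") + 1)]  # well rows A..P
cols = ["%02d" % c for c in range(1, 25)]               # zero-padded like `seq -w 24`
sites = [str(s) for s in range(1, 10)]

lines = sorted(
    '{"Metadata": "Metadata_Plate=%s,Metadata_Well=%s%s,Metadata_Site=%s"},'
    % (p, r, c, s)
    for p, r, c, s in product(plates, rows, cols, sites)
)

with open("batches.txt", "w") as f:
    f.write("\n".join(lines) + "\n")

print(len(lines))  # 6912 task lines: 2 plates x 384 wells x 9 sites
```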

files/exampleFleet_us-east-1.json

Lines changed: 45 additions & 0 deletions
```json
{
    "IamFleetRole": "arn:aws:iam::XXXXXXXXXXXXX:role/aws-ec2-spot-fleet-role",
    "AllocationStrategy": "lowestPrice",
    "TerminateInstancesWithExpiration": true,
    "LaunchSpecifications": [
        {
            "ImageId": "ami-fad25980",
            "KeyName": "your_key_file_name",
            "IamInstanceProfile": {
                "Arn": "arn:aws:iam::XXXXXXXXXXXX:instance-profile/ecsInstanceRole"
            },
            "BlockDeviceMappings": [
                {
                    "DeviceName": "/dev/xvda",
                    "Ebs": {
                        "DeleteOnTermination": true,
                        "VolumeType": "gp2",
                        "VolumeSize": 8,
                        "SnapshotId": "snap-04007a196c0f3f398"
                    }
                },
                {
                    "DeviceName": "/dev/xvdcz",
                    "Ebs": {
                        "DeleteOnTermination": true,
                        "VolumeType": "gp2"
                    }
                }
            ],
            "NetworkInterfaces": [
                {
                    "DeviceIndex": 0,
                    "SubnetId": "subnet-WWWWWWWW",
                    "DeleteOnTermination": true,
                    "AssociatePublicIpAddress": true,
                    "Groups": [
                        "sg-ZZZZZZZZZ"
                    ]
                }
            ]
        }
    ],
    "Type": "maintain"
}
```
files/exampleFleet_us-west-2.json

Lines changed: 45 additions & 0 deletions
```json
{
    "IamFleetRole": "arn:aws:iam::XXXXXXXXXXXXX:role/aws-ec2-spot-fleet-role",
    "AllocationStrategy": "lowestPrice",
    "TerminateInstancesWithExpiration": true,
    "LaunchSpecifications": [
        {
            "ImageId": "ami-c9c87cb1",
            "KeyName": "your_key_file_name",
            "IamInstanceProfile": {
                "Arn": "arn:aws:iam::XXXXXXXXXXXX:instance-profile/ecsInstanceRole"
            },
            "BlockDeviceMappings": [
                {
                    "DeviceName": "/dev/xvda",
                    "Ebs": {
                        "DeleteOnTermination": true,
                        "VolumeType": "gp2",
                        "VolumeSize": 8,
                        "SnapshotId": "snap-0b52be5bdbda1ac5f"
                    }
                },
                {
                    "DeviceName": "/dev/xvdcz",
                    "Ebs": {
                        "DeleteOnTermination": true,
                        "VolumeType": "gp2"
                    }
                }
            ],
            "NetworkInterfaces": [
                {
                    "DeviceIndex": 0,
                    "SubnetId": "subnet-WWWWWWWW",
                    "DeleteOnTermination": true,
                    "AssociatePublicIpAddress": true,
                    "Groups": [
                        "sg-ZZZZZZZZZ"
                    ]
                }
            ]
        }
    ],
    "Type": "maintain"
}
```
files/exampleJob.json

Lines changed: 14 additions & 0 deletions
```json
{
    "_comment1": "Paths in this file are relative to the root of your S3 bucket",
    "pipeline": "projects/analysis.cppipe",
    "data_file": "projects/list_of_images.csv",
    "input": "projects/input/",
    "output": "projects/output/",
    "output_structure": "Metadata_Plate-Metadata_Well-Metadata_Site",
    "_comment2": "The following groups are tasks, and each will be run in parallel",
    "groups": [
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=1"},
        {"Metadata": "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=2"}
    ]
}
```
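The `output_structure` field appears to template each task's output location from the metadata keys in its group. The sketch below shows one plausible reading of that mapping (an assumption for illustration; the authoritative substitution logic lives in the worker code):

```python
def output_folder(output_structure, group_metadata):
    """Hypothetical expansion of output_structure for one task.

    Parses "key1=value1,key2=value2" group metadata into a dict, then
    substitutes each metadata key named in the template with its value.
    """
    values = dict(pair.split("=") for pair in group_metadata.split(","))
    return "-".join(values[key] for key in output_structure.split("-"))

print(output_folder(
    "Metadata_Plate-Metadata_Well-Metadata_Site",
    "Metadata_Plate=Plate1,Metadata_Well=A01,Metadata_Site=1",
))
```

Under this reading, the first group in the example job would write its results under a folder named `Plate1-A01-1` inside the `output` prefix.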
files/requirements.txt

Lines changed: 1 addition & 0 deletions
```text
boto3>=1.0.0
```

python2worker/Dockerfile

Lines changed: 70 additions & 0 deletions
```dockerfile
#
#  - [ BROAD'16 ] -
#
# A docker instance for accessing AWS resources
# This wraps the cellprofiler docker registry
#

FROM cellprofiler/cellprofiler:3.1.9

# Install S3FS

RUN apt-get -y update && \
    apt-get -y upgrade && \
    apt-get -y install \
        automake \
        autotools-dev \
        g++ \
        git \
        libcurl4-gnutls-dev \
        libfuse-dev \
        libssl-dev \
        libxml2-dev \
        make pkg-config \
        sysstat \
        curl

WORKDIR /usr/local/src
RUN git clone https://github.com/s3fs-fuse/s3fs-fuse.git
WORKDIR /usr/local/src/s3fs-fuse
RUN ./autogen.sh
RUN ./configure
RUN make
RUN make install

# Install AWS CLI

RUN pip install awscli

# Install boto3

RUN pip install -U boto3

# Install watchtower for logging

RUN pip install watchtower==0.8.0

# Install pandas for optional file downloading

RUN pip install pandas==0.24.2

# SETUP NEW ENTRYPOINT

RUN mkdir -p /home/ubuntu/
WORKDIR /home/ubuntu
COPY cp-worker.py .
COPY instance-monitor.py .
COPY run-worker.sh .
RUN chmod 755 run-worker.sh

RUN git clone https://github.com/CellProfiler/CellProfiler-plugins.git
WORKDIR /home/ubuntu/CellProfiler-plugins
#RUN pip install -r requirements.txt

WORKDIR /home/ubuntu
ENTRYPOINT ["./run-worker.sh"]
CMD [""]
```
