aaronkollasch
diff --git a/‎INSTALL.md
Lines changed: 30 additions & 0 deletions b/‎INSTALL.md
Lines changed: 30 additions & 0 deletions
diff --git a/‎bin/calc_logprobs_seqs_fr
Lines changed: 5 additions & 0 deletions b/‎bin/calc_logprobs_seqs_fr
Lines changed: 5 additions & 0 deletions
diff --git a/‎bin/generate_sample_seqs_fr
Lines changed: 5 additions & 0 deletions b/‎bin/generate_sample_seqs_fr
Lines changed: 5 additions & 0 deletions
diff --git a/‎bin/run_autoregressive_fr
Lines changed: 5 additions & 0 deletions b/‎bin/run_autoregressive_fr
Lines changed: 5 additions & 0 deletions
diff --git a/‎linux_setup.sh
Lines changed: 48 additions & 0 deletions b/‎linux_setup.sh
Lines changed: 48 additions & 0 deletions
diff --git a/‎requirements.txt
Lines changed: 5 additions & 0 deletions b/‎requirements.txt
Lines changed: 5 additions & 0 deletions
diff --git a/‎requirements_gpu.txt
Lines changed: 5 additions & 0 deletions b/‎requirements_gpu.txt
Lines changed: 5 additions & 0 deletions
diff --git a/‎seqdesign_pt/autoregressive_model.py
Lines changed: 6 additions & 7 deletions b/‎seqdesign_pt/autoregressive_model.py
Lines changed: 6 additions & 7 deletions
diff --git a/‎seqdesign_pt/autoregressive_train.py
Lines changed: 28 additions & 28 deletions b/‎seqdesign_pt/autoregressive_train.py
Lines changed: 28 additions & 28 deletions
diff --git a/‎seqdesign_pt/aws_utils.py
Lines changed: 88 additions & 0 deletions b/‎seqdesign_pt/aws_utils.py
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,30 @@
+## Installation
+
+We recommend using SeqDesign with a GPU that supports CUDA, especially for training.
+If a GPU is available, install the [TensorFlow GPU dependencies](https://www.tensorflow.org/install/gpu), 
+then install the SeqDesign dependencies with:
+```shell script
+pip install -r requirements_gpu.txt
+```
+
+Using the [linux_setup.sh](linux_setup.sh) script, 
+installation on a fresh Ubuntu 18.04 LTS machine took 5 minutes.
+
+If no GPU is available, use:  
+```shell script
+pip install -r requirements.txt
+```
+
+Then install SeqDesign:
+```shell script
+python setup.py install
+```
+
+### Software and versions tested:
+- python - 3.7  
+- tensorflow - 1.15  
+- numpy - 1.15  
+- scipy - 0.19  
+- sklearn - 0.18  
+
+Tested on Ubuntu 18.04 LTS
@@ -0,0 +1,5 @@
+#!/usr/bin/env python
+from seqdesign_pt.scripts.calc_logprobs_seqs_fr import main
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+# Command-line launcher: delegates to seqdesign_pt.scripts.generate_sample_seqs_fr.main().
+from seqdesign_pt.scripts.generate_sample_seqs_fr import main
+
+# Only run when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+# Command-line launcher: delegates to seqdesign_pt.scripts.run_autoregressive_fr.main().
+from seqdesign_pt.scripts.run_autoregressive_fr import main
+
+# Only run when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Example installation script of SeqDesign for Tensorflow-GPU from scratch
+# Tested on Ubuntu 18.04 LTS, runtime ~5 minutes including a reboot.
+# Miniconda and Tensorflow 1.12 are installed here, but a working Tensorflow 1 environment can substitute.
+# Before running this script, first run `git clone -b v3 https://github.com/aaronkollasch/seqdesign-pytorch.git`
+# and then `cd seqdesign-pytorch`
+# If NVIDIA drivers have not been installed before, this script must be run twice, rebooting the system in between.
+
+# Stage 1: if the NVIDIA driver's proc entry is missing, install the driver and stop.
+if [ ! -f "/proc/driver/nvidia/version" ]; then
+  echo "NVIDIA driver not found; installing."
+  sudo apt update
+  sudo apt install -y --no-install-recommends nvidia-driver-430
+  echo "
+NVIDIA drivers installed.
+Please reboot your system, then run linux_setup.sh a second time."
+  # Stop here: the remaining steps are meant to run after the reboot, on the second invocation.
+  exit
+fi
+
+# set up conda and the SeqDesign environment
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+# -b: batch (non-interactive) install; -p: install prefix
+sh Miniconda3-latest-Linux-x86_64.sh -b -p "$HOME"/miniconda3
+rm Miniconda3-latest-Linux-x86_64.sh
+"$HOME"/miniconda3/bin/conda init
+"$HOME"/miniconda3/bin/conda create -n seqdesign -y -c pytorch python=3.7 pytorch "tensorflow-gpu>=1.12,<2" scipy scikit-learn gitpython
+# The environment's python is called by absolute path because the env is not activated in this shell.
+"$HOME"/miniconda3/envs/seqdesign/bin/python -c "import torch; print(torch.cuda.is_available()); print([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())])"  # test GPU install
+"$HOME"/miniconda3/envs/seqdesign/bin/python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())"  # test GPU install
+
+# download SeqDesign code:
+# git clone -b v3 https://github.com/aaronkollasch/seqdesign-pytorch.git
+# cd seqdesign-pytorch || exit
+"$HOME"/miniconda3/envs/seqdesign/bin/python setup.py install  # use setup.py develop if you want to modify the code files
+
+# download demo/example data
+# (abort if the examples directory is missing rather than running the download elsewhere)
+cd examples || exit
+./download_example_data.sh
+
+echo "
+SeqDesign installed.
+Run 'source ~/.bashrc; conda activate seqdesign' before using."
+
+# # to run training demo:
+# ./demo_train.sh
+
+# # to run calc_logprobs using trained weights:
+# ./demo_calc_logprobs.sh
+
+# # to generate sequences:
+# ./demo_generate.sh
@@ -0,0 +1,5 @@
+tensorflow>=1.12,<2
+scipy
+numpy
+scikit-learn<0.22
+gitpython
@@ -0,0 +1,5 @@
+tensorflow-gpu>=1.12,<2
+scipy
+numpy
+scikit-learn<0.22
+gitpython
@@ -7,8 +7,8 @@
 import torch.nn.functional as F
 
 import layers
-from utils import recursive_update
-from functions import nonlinearity, comb_losses, clamp
+from seqdesign_pt.utils import recursive_update
+from seqdesign_pt.functions import nonlinearity, comb_losses, clamp
 
 
 class Autoregressive(nn.Module):
@@ -45,7 +45,7 @@ def __init__(
                 "num_dilation_blocks": 6,
                 "num_layers": 9,
                 "dilation_schedule": None,
-                "transformer": False,  # TODO transformer
+                "transformer": False,
                 "inverse_temperature": False,
                 "dropout_loc": "inter",  # options = "final", "inter", "gaussian"
                 "dropout_p": 0.5,  # probability of zeroing out value, not the keep probability
@@ -410,7 +410,7 @@ def __init__(
                 "transformer": False,
                 "inverse_temperature": False,
                 "positional_embedding": True,
-                "skip_connections": False,  # TODO test effect of skip connections
+                "skip_connections": False,
                 "pos_emb_max_len": 400,
                 "pos_emb_step": 5,
                 "config": "updated",
@@ -477,7 +477,6 @@ def __init__(
         self.encoder.emb_log_sigma_one = nn.Linear(enc_params['channels'], enc_params['embedding_nnet_size'])
         self.encoder.emb_mu_out = nn.Linear(enc_params['embedding_nnet_size'], enc_params['latent_size'])
         self.encoder.emb_log_sigma_out = nn.Linear(enc_params['embedding_nnet_size'], enc_params['latent_size'])
-        # TODO try adding flow
 
         # initialize decoder modules
         dec_params = self.hyperparams['decoder']
@@ -617,7 +616,7 @@ def sampler(self, mu, log_sigma, stddev=1.):
         eps = torch.zeros_like(log_sigma).normal_(std=stddev)
         return mu + log_sigma.exp() * eps
 
-    def generate(self, mode=True):  # TODO implement fast generation
+    def generate(self, mode=True):
         for module in self.decoder.dilation_blocks():
             if hasattr(module, "generate") and callable(module.generate):
                 module.generate(mode)
@@ -637,7 +636,7 @@ def encode(self, inputs, input_masks):
         enc_params = self.hyperparams['encoder']
         nonlin = nonlinearity(enc_params['embedding_nnet_nonlinearity'])
 
-        up_val_1d = self.encoder.start_conv(inputs)  # TODO use special input for encoder
+        up_val_1d = self.encoder.start_conv(inputs)
         for convnet in self.encoder.dilation_blocks:
             up_val_1d = convnet(up_val_1d, input_masks)
 
 
@@ -8,7 +8,7 @@
 import torch.nn.functional as F
 import torch.utils.data
 
-from model_logging import Logger
+from seqdesign_pt.model_logging import Logger
 
 
 class AutoregressiveTrainer:
@@ -172,32 +172,32 @@ def train(self, steps=1e8):
 
     def validate(self, batch_size=48):
         return 0.0, 0.0
-        self.model.eval()
-        with torch.no_grad():
-            (
-                prot_decoder_input_f, prot_decoder_output_f, prot_mask_decoder,
-                prot_decoder_input_r, prot_decoder_output_r,
-                n_eff
-            ) = self.loader.dataset.generate_test_data(self, batch_size, matching=True)  # TODO write generate_test_data
-            if self.run_fr:
-                output_logits_f, output_logits_r = self.model(
-                    prot_decoder_input_f, prot_mask_decoder, prot_decoder_input_r, prot_mask_decoder)
-                output_logits = torch.cat((output_logits_f, output_logits_r), dim=0)
-                target_seqs = torch.cat((prot_decoder_output_f, prot_decoder_output_r), dim=0)
-                mask = torch.cat((prot_mask_decoder, prot_mask_decoder), dim=0)
-            else:
-                output_logits = self.model(prot_decoder_input_f, prot_mask_decoder)
-                target_seqs = prot_decoder_output_f
-                mask = prot_mask_decoder
-
-            cross_entropy = F.cross_entropy(output_logits, target_seqs.argmax(1), reduction='none')
-            cross_entropy = cross_entropy * mask.squeeze(1)
-            reconstruction_per_seq = cross_entropy.sum([1, 2]) / mask.sum([1, 2, 3])
-            reconstruction_loss = reconstruction_per_seq.mean()
-            accuracy_per_seq = target_seqs[output_logits.argmax(1, keepdim=True)].sum([1, 2]) / mask.sum([1, 2, 3])
-            avg_accuracy = accuracy_per_seq.mean()
-        self.model.train()
-        return reconstruction_loss, avg_accuracy
+        # self.model.eval()
+        # with torch.no_grad():
+        #     (
+        #         prot_decoder_input_f, prot_decoder_output_f, prot_mask_decoder,
+        #         prot_decoder_input_r, prot_decoder_output_r,
+        #         n_eff
+        #     ) = self.loader.dataset.generate_test_data(self, batch_size, matching=True)  # TODO write generate_test_data
+        #     if self.run_fr:
+        #         output_logits_f, output_logits_r = self.model(
+        #             prot_decoder_input_f, prot_mask_decoder, prot_decoder_input_r, prot_mask_decoder)
+        #         output_logits = torch.cat((output_logits_f, output_logits_r), dim=0)
+        #         target_seqs = torch.cat((prot_decoder_output_f, prot_decoder_output_r), dim=0)
+        #         mask = torch.cat((prot_mask_decoder, prot_mask_decoder), dim=0)
+        #     else:
+        #         output_logits = self.model(prot_decoder_input_f, prot_mask_decoder)
+        #         target_seqs = prot_decoder_output_f
+        #         mask = prot_mask_decoder
+        #
+        #     cross_entropy = F.cross_entropy(output_logits, target_seqs.argmax(1), reduction='none')
+        #     cross_entropy = cross_entropy * mask.squeeze(1)
+        #     reconstruction_per_seq = cross_entropy.sum([1, 2]) / mask.sum([1, 2, 3])
+        #     reconstruction_loss = reconstruction_per_seq.mean()
+        #     accuracy_per_seq = target_seqs[output_logits.argmax(1, keepdim=True)].sum([1, 2]) / mask.sum([1, 2, 3])
+        #     avg_accuracy = accuracy_per_seq.mean()
+        # self.model.train()
+        # return reconstruction_loss, avg_accuracy
 
     def test(self, data_loader, model_eval=True, num_samples=1, return_logits=False, return_ce=False):
         if model_eval:
@@ -349,7 +349,7 @@ def test(self, data_loader, model_eval=True, num_samples=1, return_logits=False,
             return output
 
     def save_state(self, last_batch=None):
-        snapshot = f"{self.params['snapshot_path']}/{self.params['snapshot_name']}/{self.model.step}.pth"
+        snapshot = f"{self.params['snapshot_path']}/{self.params['snapshot_name']}/{self.params['snapshot_name']}.ckpt-{self.model.step}.pth"
         revive_exec = f"{self.params['snapshot_path']}/revive_executable/{self.params['snapshot_name']}.sh"
         if not os.path.exists(os.path.dirname(snapshot)):
             os.makedirs(os.path.dirname(snapshot), exist_ok=True)
 
@@ -0,0 +1,88 @@
+import subprocess
+import re
+import os
+from seqdesign_pt.version import VERSION
+
+# Default base S3 URL used by AWSUtility (its `s3_base_path` default).
+S3_FOLDER_URL = "s3://markslab-private/seqdesign"
+
+# Use the AWS CLI at this fixed path when it exists on this machine;
+# otherwise fall back to the bare command name `aws`.
+if os.path.exists('/n/groups/marks/software/aws-cli/bin/aws'):
+    AWS_BIN = '/n/groups/marks/software/aws-cli/bin/aws'
+else:
+    AWS_BIN = 'aws'
+
+
+class AWSUtility:
+    def __init__(self, s3_project=VERSION, s3_base_path=S3_FOLDER_URL):
+        self.s3_base_path = s3_base_path
+        self.s3_project = s3_project
+
+    @staticmethod
+    def run_cmd(cmd):
+        try:
+            if cmd[0] not in ('aws', AWS_BIN):
+                cmd = [AWS_BIN] + cmd
+            else:
+                cmd[0] = AWS_BIN
+            pipes = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='UTF-8')
+            std_out, std_err = pipes.communicate()
+            if pipes.returncode != 0:
+                print(f"AWS CLI error: {pipes.returncode}")
+                print(std_err.strip())
+                return pipes.returncode, None, None
+            else:
+                return 0, std_out, std_err
+        except OSError:
+            print("AWS CLI not found.")
+            return 1, None, None
+
+    def s3_cp(self, local_file, s3_file, destination='s3'):
+        s3_file = f"{self.s3_base_path}/{self.s3_project}/{s3_file}"
+        if destination == 's3':
+            print("Copying file to AWS S3.")
+            src_file, dest_file = local_file, s3_file
+        else:
+            print("Copying file from AWS S3.")
+            src_file, dest_file = s3_file, local_file
+        cmd = ['s3', 'cp', src_file, dest_file]
+        code, std_out, std_err = self.run_cmd(cmd)
+        if code == 0:
+            print("Success.")
+
+    def s3_sync(self, local_folder, s3_folder, destination='s3', args=()):
+        local_folder = local_folder + ('' if local_folder.endswith('/') else '/')
+        s3_folder = s3_folder + ('' if s3_folder.endswith('/') else '/')
+        s3_folder = f"{self.s3_base_path}/{self.s3_project}/{s3_folder}"
+        if destination == 's3':
+            print("Syncing data to AWS S3.")
+            src_folder, dest_folder = local_folder, s3_folder
+        else:
+            print("Syncing data from AWS S3.")
+            src_folder, dest_folder = s3_folder, local_folder
+        cmd = ['s3', 'sync', src_folder, dest_folder, *args]
+        code, std_out, std_err = self.run_cmd(cmd)
+        if code == 0:
+            print("Success.")
+
+    def s3_get_file_grep(self, s3_folder, dest_folder, search_pattern):
+        s3_folder = s3_folder + ('' if s3_folder.endswith('/') else '/')
+        dest_folder = dest_folder + ('' if dest_folder.endswith('/') else '/')
+        s3_folder = f"{self.s3_base_path}/{self.s3_project}/{s3_folder}"
+        print(f"Finding files in {s3_folder} on AWS S3.")
+        cmd = ['s3', 'ls', s3_folder]
+        code, std_out, std_err = self.run_cmd(cmd)
+        if code != 0:
+            return False
+        filenames = re.findall(search_pattern, std_out)
+        if not filenames:
+            print("No files found.")
+            return False
+        print(f"Found: {filenames}")
+        for filename in filenames:
+            filename = f"{s3_folder}{filename}"
+            print(f"Copying file {filename} from AWS S3.")
+            cmd = ['s3', 'cp', filename, dest_folder]
+            code, std_out, std_err = self.run_cmd(cmd)
+            if code != 0:
+                return False
+            print("Success.")
+        return True