Skip to content

Commit d90ece3

Browse files
committed
Update to match tensorflow module interface
Adds a setup.py, binaries, scripts, and examples.
1 parent b1a85ed commit d90ece3

23 files changed

+857
-1537
lines changed

INSTALL.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
## Installation
2+
3+
We recommend using SeqDesign with a GPU that supports CUDA, especially for training.
4+
If a GPU is available, install the [TensorFlow GPU dependencies](https://www.tensorflow.org/install/gpu),
5+
then install the SeqDesign dependencies with:
6+
```shell script
7+
pip install -r requirements_gpu.txt
8+
```
9+
10+
Using the [linux_setup.sh](linux_setup.sh) script,
11+
installation on a fresh Ubuntu 18.04 LTS machine took 5 minutes.
12+
13+
If no GPU is available, use:
14+
```shell script
15+
pip install -r requirements.txt
16+
```
17+
18+
Then install SeqDesign:
19+
```shell script
20+
python setup.py install
21+
```
22+
23+
### Software used and versions tested:
24+
- python - 3.7
25+
- tensorflow - 1.15
26+
- numpy - 1.15
27+
- scipy - 0.19
28+
- sklearn - 0.18
29+
30+
Tested on Ubuntu 18.04 LTS

bin/calc_logprobs_seqs_fr

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env python3
"""Command-line launcher for ``seqdesign_pt.scripts.calc_logprobs_seqs_fr``.

The shebang requests ``python3`` explicitly (matching the other launchers in
``bin/``) because a bare ``python`` may resolve to Python 2, or to nothing,
on some systems.
"""
from seqdesign_pt.scripts.calc_logprobs_seqs_fr import main


if __name__ == "__main__":
    main()

bin/generate_sample_seqs_fr

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env python3
"""Command-line launcher for ``seqdesign_pt.scripts.generate_sample_seqs_fr``."""
from seqdesign_pt.scripts.generate_sample_seqs_fr import main as run_generate_sample_seqs


# Only delegate to the packaged entry point when executed as a script.
if __name__ == "__main__":
    run_generate_sample_seqs()

bin/run_autoregressive_fr

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/usr/bin/env python3
"""Command-line launcher for ``seqdesign_pt.scripts.run_autoregressive_fr``."""
from seqdesign_pt.scripts.run_autoregressive_fr import main as run_autoregressive


# Only delegate to the packaged entry point when executed as a script.
if __name__ == "__main__":
    run_autoregressive()

linux_setup.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/bin/bash
# Example from-scratch installation script for SeqDesign with GPU support.
# Tested on Ubuntu 18.04 LTS, runtime ~5 minutes including a reboot.
# Miniconda, PyTorch and TensorFlow 1.x (>=1.12,<2) are installed here,
# but an already-working environment providing them can substitute.
# Before running this script, first run `git clone -b v3 https://github.com/debbiemarkslab/SeqDesign.git`
# and then `cd SeqDesign`
# If NVIDIA drivers have not been installed before, this script must be run twice, rebooting the system in between.
#
# Every step that later steps depend on aborts the script on failure, so the
# final "SeqDesign installed." message is only printed after a real install.

if [ ! -f "/proc/driver/nvidia/version" ]; then
    echo "NVIDIA driver not found; installing."
    sudo apt update
    sudo apt install -y --no-install-recommends nvidia-driver-430 || exit 1
    echo "
NVIDIA drivers installed.
Please reboot your system, then run linux_setup.sh a second time."
    exit
fi

# set up conda and the SeqDesign environment
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh || exit 1
sh Miniconda3-latest-Linux-x86_64.sh -b -p "$HOME"/miniconda3 || exit 1
rm Miniconda3-latest-Linux-x86_64.sh
"$HOME"/miniconda3/bin/conda init
"$HOME"/miniconda3/bin/conda create -n seqdesign -y -c pytorch python=3.7 pytorch "tensorflow-gpu>=1.12,<2" scipy scikit-learn gitpython || exit 1

# The two GPU checks below are diagnostic only: they print what each framework
# can see and do not abort the installation if no GPU is detected.
"$HOME"/miniconda3/envs/seqdesign/bin/python -c "import torch; print(torch.cuda.is_available()); print([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())])"  # test GPU install
"$HOME"/miniconda3/envs/seqdesign/bin/python -c "from tensorflow.python.client import device_lib; print(device_lib.list_local_devices())"  # test GPU install

# download SeqDesign code:
# git clone -b v3 https://github.com/aaronkollasch/seqdesign-pytorch.git
# cd seqdesign-pytorch || exit
"$HOME"/miniconda3/envs/seqdesign/bin/python setup.py install || exit 1  # use setup.py develop if you want to modify the code files

# download demo/example data
cd examples || exit
./download_example_data.sh || exit 1

echo "
SeqDesign installed.
Run 'source ~/.bashrc; conda activate seqdesign' before using."

# # to run training demo:
# ./demo_train.sh

# # to run calc_logprobs using trained weights:
# ./demo_calc_logprobs.sh

# # to generate sequences:
# ./demo_generate.sh

requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
tensorflow>=1.12,<2
2+
scipy
3+
numpy
4+
scikit-learn<0.22
5+
gitpython

requirements_gpu.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
tensorflow-gpu>=1.12,<2
2+
scipy
3+
numpy
4+
scikit-learn<0.22
5+
gitpython

seqdesign_pt/autoregressive_model.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
import torch.nn.functional as F
88

99
import layers
10-
from utils import recursive_update
11-
from functions import nonlinearity, comb_losses, clamp
10+
from seqdesign_pt.utils import recursive_update
11+
from seqdesign_pt.functions import nonlinearity, comb_losses, clamp
1212

1313

1414
class Autoregressive(nn.Module):
@@ -45,7 +45,7 @@ def __init__(
4545
"num_dilation_blocks": 6,
4646
"num_layers": 9,
4747
"dilation_schedule": None,
48-
"transformer": False, # TODO transformer
48+
"transformer": False,
4949
"inverse_temperature": False,
5050
"dropout_loc": "inter", # options = "final", "inter", "gaussian"
5151
"dropout_p": 0.5, # probability of zeroing out value, not the keep probability
@@ -410,7 +410,7 @@ def __init__(
410410
"transformer": False,
411411
"inverse_temperature": False,
412412
"positional_embedding": True,
413-
"skip_connections": False, # TODO test effect of skip connections
413+
"skip_connections": False,
414414
"pos_emb_max_len": 400,
415415
"pos_emb_step": 5,
416416
"config": "updated",
@@ -477,7 +477,6 @@ def __init__(
477477
self.encoder.emb_log_sigma_one = nn.Linear(enc_params['channels'], enc_params['embedding_nnet_size'])
478478
self.encoder.emb_mu_out = nn.Linear(enc_params['embedding_nnet_size'], enc_params['latent_size'])
479479
self.encoder.emb_log_sigma_out = nn.Linear(enc_params['embedding_nnet_size'], enc_params['latent_size'])
480-
# TODO try adding flow
481480

482481
# initialize decoder modules
483482
dec_params = self.hyperparams['decoder']
@@ -617,7 +616,7 @@ def sampler(self, mu, log_sigma, stddev=1.):
617616
eps = torch.zeros_like(log_sigma).normal_(std=stddev)
618617
return mu + log_sigma.exp() * eps
619618

620-
def generate(self, mode=True): # TODO implement fast generation
619+
def generate(self, mode=True):
621620
for module in self.decoder.dilation_blocks():
622621
if hasattr(module, "generate") and callable(module.generate):
623622
module.generate(mode)
@@ -637,7 +636,7 @@ def encode(self, inputs, input_masks):
637636
enc_params = self.hyperparams['encoder']
638637
nonlin = nonlinearity(enc_params['embedding_nnet_nonlinearity'])
639638

640-
up_val_1d = self.encoder.start_conv(inputs) # TODO use special input for encoder
639+
up_val_1d = self.encoder.start_conv(inputs)
641640
for convnet in self.encoder.dilation_blocks:
642641
up_val_1d = convnet(up_val_1d, input_masks)
643642

seqdesign_pt/autoregressive_train.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import torch.nn.functional as F
99
import torch.utils.data
1010

11-
from model_logging import Logger
11+
from seqdesign_pt.model_logging import Logger
1212

1313

1414
class AutoregressiveTrainer:
@@ -172,32 +172,32 @@ def train(self, steps=1e8):
172172

173173
def validate(self, batch_size=48):
174174
return 0.0, 0.0
175-
self.model.eval()
176-
with torch.no_grad():
177-
(
178-
prot_decoder_input_f, prot_decoder_output_f, prot_mask_decoder,
179-
prot_decoder_input_r, prot_decoder_output_r,
180-
n_eff
181-
) = self.loader.dataset.generate_test_data(self, batch_size, matching=True) # TODO write generate_test_data
182-
if self.run_fr:
183-
output_logits_f, output_logits_r = self.model(
184-
prot_decoder_input_f, prot_mask_decoder, prot_decoder_input_r, prot_mask_decoder)
185-
output_logits = torch.cat((output_logits_f, output_logits_r), dim=0)
186-
target_seqs = torch.cat((prot_decoder_output_f, prot_decoder_output_r), dim=0)
187-
mask = torch.cat((prot_mask_decoder, prot_mask_decoder), dim=0)
188-
else:
189-
output_logits = self.model(prot_decoder_input_f, prot_mask_decoder)
190-
target_seqs = prot_decoder_output_f
191-
mask = prot_mask_decoder
192-
193-
cross_entropy = F.cross_entropy(output_logits, target_seqs.argmax(1), reduction='none')
194-
cross_entropy = cross_entropy * mask.squeeze(1)
195-
reconstruction_per_seq = cross_entropy.sum([1, 2]) / mask.sum([1, 2, 3])
196-
reconstruction_loss = reconstruction_per_seq.mean()
197-
accuracy_per_seq = target_seqs[output_logits.argmax(1, keepdim=True)].sum([1, 2]) / mask.sum([1, 2, 3])
198-
avg_accuracy = accuracy_per_seq.mean()
199-
self.model.train()
200-
return reconstruction_loss, avg_accuracy
175+
# self.model.eval()
176+
# with torch.no_grad():
177+
# (
178+
# prot_decoder_input_f, prot_decoder_output_f, prot_mask_decoder,
179+
# prot_decoder_input_r, prot_decoder_output_r,
180+
# n_eff
181+
# ) = self.loader.dataset.generate_test_data(self, batch_size, matching=True) # TODO write generate_test_data
182+
# if self.run_fr:
183+
# output_logits_f, output_logits_r = self.model(
184+
# prot_decoder_input_f, prot_mask_decoder, prot_decoder_input_r, prot_mask_decoder)
185+
# output_logits = torch.cat((output_logits_f, output_logits_r), dim=0)
186+
# target_seqs = torch.cat((prot_decoder_output_f, prot_decoder_output_r), dim=0)
187+
# mask = torch.cat((prot_mask_decoder, prot_mask_decoder), dim=0)
188+
# else:
189+
# output_logits = self.model(prot_decoder_input_f, prot_mask_decoder)
190+
# target_seqs = prot_decoder_output_f
191+
# mask = prot_mask_decoder
192+
#
193+
# cross_entropy = F.cross_entropy(output_logits, target_seqs.argmax(1), reduction='none')
194+
# cross_entropy = cross_entropy * mask.squeeze(1)
195+
# reconstruction_per_seq = cross_entropy.sum([1, 2]) / mask.sum([1, 2, 3])
196+
# reconstruction_loss = reconstruction_per_seq.mean()
197+
# accuracy_per_seq = target_seqs[output_logits.argmax(1, keepdim=True)].sum([1, 2]) / mask.sum([1, 2, 3])
198+
# avg_accuracy = accuracy_per_seq.mean()
199+
# self.model.train()
200+
# return reconstruction_loss, avg_accuracy
201201

202202
def test(self, data_loader, model_eval=True, num_samples=1, return_logits=False, return_ce=False):
203203
if model_eval:
@@ -349,7 +349,7 @@ def test(self, data_loader, model_eval=True, num_samples=1, return_logits=False,
349349
return output
350350

351351
def save_state(self, last_batch=None):
352-
snapshot = f"{self.params['snapshot_path']}/{self.params['snapshot_name']}/{self.model.step}.pth"
352+
snapshot = f"{self.params['snapshot_path']}/{self.params['snapshot_name']}/{self.params['snapshot_name']}.ckpt-{self.model.step}.pth"
353353
revive_exec = f"{self.params['snapshot_path']}/revive_executable/{self.params['snapshot_name']}.sh"
354354
if not os.path.exists(os.path.dirname(snapshot)):
355355
os.makedirs(os.path.dirname(snapshot), exist_ok=True)

seqdesign_pt/aws_utils.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import subprocess
2+
import re
3+
import os
4+
from seqdesign_pt.version import VERSION
5+
6+
# Root S3 location under which every project folder lives.
S3_FOLDER_URL = "s3://markslab-private/seqdesign"

# Use the site-specific AWS CLI install when it exists on this machine;
# otherwise rely on whatever `aws` executable is found on the PATH.
_CLUSTER_AWS_BIN = '/n/groups/marks/software/aws-cli/bin/aws'
AWS_BIN = _CLUSTER_AWS_BIN if os.path.exists(_CLUSTER_AWS_BIN) else 'aws'
12+
13+
14+
class AWSUtility:
    """Small wrapper around the AWS CLI for copying data to and from S3.

    Every S3 path given to a method is interpreted relative to
    ``{s3_base_path}/{s3_project}/``.
    """

    def __init__(self, s3_project=VERSION, s3_base_path=S3_FOLDER_URL):
        """Store the S3 base URL and the project sub-folder used to build paths."""
        self.s3_base_path = s3_base_path
        self.s3_project = s3_project

    @staticmethod
    def _with_trailing_slash(path):
        """Return ``path`` with exactly one '/' appended if it lacks one."""
        return path if path.endswith('/') else path + '/'

    def _s3_url(self, relative_path):
        """Build the full S3 URL for a path relative to the project folder."""
        return f"{self.s3_base_path}/{self.s3_project}/{relative_path}"

    @staticmethod
    def run_cmd(cmd):
        """Run an AWS CLI command and capture its output.

        Args:
            cmd: Sequence of argument strings. A leading ``'aws'`` (or
                ``AWS_BIN``) is optional; the executable actually run is
                always ``AWS_BIN``. The sequence is not modified.

        Returns:
            A ``(returncode, stdout, stderr)`` tuple. On success the return
            code is 0 and both streams are strings. On failure the streams are
            ``None``; the return code is the CLI's exit status, or 1 if the
            executable could not be started.
        """
        # Work on a copy so the caller's list is never mutated, and so an
        # empty command does not raise IndexError.
        args = list(cmd)
        if args and args[0] in ('aws', AWS_BIN):
            args = args[1:]
        full_cmd = [AWS_BIN, *args]

        try:
            proc = subprocess.run(
                full_cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                encoding='UTF-8',
            )
        except OSError:
            print("AWS CLI not found.")
            return 1, None, None

        if proc.returncode != 0:
            print(f"AWS CLI error: {proc.returncode}")
            print(proc.stderr.strip())
            return proc.returncode, None, None
        return 0, proc.stdout, proc.stderr

    def s3_cp(self, local_file, s3_file, destination='s3'):
        """Copy a single file between the local filesystem and S3.

        Args:
            local_file: Local file path.
            s3_file: S3 path relative to the project folder.
            destination: ``'s3'`` uploads ``local_file``; any other value
                downloads ``s3_file`` to ``local_file``.

        Returns:
            True if the AWS CLI reported success, False otherwise.
        """
        s3_file = self._s3_url(s3_file)
        if destination == 's3':
            print("Copying file to AWS S3.")
            src_file, dest_file = local_file, s3_file
        else:
            print("Copying file from AWS S3.")
            src_file, dest_file = s3_file, local_file
        code, _, _ = self.run_cmd(['s3', 'cp', src_file, dest_file])
        if code == 0:
            print("Success.")
        return code == 0

    def s3_sync(self, local_folder, s3_folder, destination='s3', args=()):
        """Synchronize a folder between the local filesystem and S3.

        Args:
            local_folder: Local directory path (a trailing '/' is added if missing).
            s3_folder: S3 folder relative to the project folder
                (a trailing '/' is added if missing).
            destination: ``'s3'`` syncs local -> S3; any other value syncs
                S3 -> local.
            args: Extra command-line arguments appended to ``aws s3 sync``.

        Returns:
            True if the AWS CLI reported success, False otherwise.
        """
        local_folder = self._with_trailing_slash(local_folder)
        s3_folder = self._s3_url(self._with_trailing_slash(s3_folder))
        if destination == 's3':
            print("Syncing data to AWS S3.")
            src_folder, dest_folder = local_folder, s3_folder
        else:
            print("Syncing data from AWS S3.")
            src_folder, dest_folder = s3_folder, local_folder
        code, _, _ = self.run_cmd(['s3', 'sync', src_folder, dest_folder, *args])
        if code == 0:
            print("Success.")
        return code == 0

    def s3_get_file_grep(self, s3_folder, dest_folder, search_pattern):
        """Download every file in an S3 folder whose listing matches a regex.

        The output of ``aws s3 ls`` on the folder is searched with
        ``re.findall(search_pattern, ...)``; each match is used as a file name
        inside ``s3_folder``, so the pattern should yield plain strings
        (i.e. contain at most one capturing group).

        Args:
            s3_folder: S3 folder relative to the project folder.
            dest_folder: Local directory to copy the matching files into.
            search_pattern: Regular expression applied to the folder listing.

        Returns:
            True if at least one file matched and all copies succeeded,
            False if listing failed, nothing matched, or any copy failed.
        """
        s3_folder = self._s3_url(self._with_trailing_slash(s3_folder))
        dest_folder = self._with_trailing_slash(dest_folder)

        print(f"Finding files in {s3_folder} on AWS S3.")
        code, std_out, _ = self.run_cmd(['s3', 'ls', s3_folder])
        if code != 0:
            return False

        filenames = re.findall(search_pattern, std_out)
        if not filenames:
            print("No files found.")
            return False
        print(f"Found: {filenames}")

        for filename in filenames:
            filename = f"{s3_folder}{filename}"
            print(f"Copying file {filename} from AWS S3.")
            code, _, _ = self.run_cmd(['s3', 'cp', filename, dest_folder])
            if code != 0:
                # Stop at the first failed copy, as the listing may be stale.
                return False
        print("Success.")
        return True

0 commit comments

Comments
 (0)