Update job generator

jonnor · jonnor · commit bd44b50f65a6 · 2019-04-07T15:56:54.000+02:00
diff --git a/README.md b/README.md
@@ -35,42 +35,3 @@ Evaluate the resulting models
 
     python3 test.py
 
-
-## Run experiments using Docker, Kubernetes and Google Cloud
-
-Create project in Google Cloud
-
-Install locally
-
-    Docker
-    google-cloud-sdk
-    kubectl
-
-Create Kubernetes cluster
-
-    gcloud container clusters create cluster --scopes storage-full --machine-type n1-highcpu-2 --num-nodes 10 \
-        --create-subnetwork name=my-subnet-0 \
-        --enable-ip-alias \
-        --enable-private-nodes \
-        --master-ipv4-cidr 172.16.0.0/28 \
-        --no-enable-basic-auth \
-        --no-issue-client-certificate \
-        --no-enable-master-authorized-networks
-
-    gcloud container clusters get-credentials cluster
-    kubectl get nodes
-
-Build Docker images and push to GKE
-
-    export PROJECT_ID="$(gcloud config get-value project -q)"
-    docker build -t gcr.io/${PROJECT_ID}/base:15 -f Dockerfile .
-    docker push gcr.io/${PROJECT_ID}/base
-
-Generate Kubernetes jobs and start them
-
-    python3 microesc/jobs.py experiments/sbcnn16k30.yaml
-    kubectl create -f data/jobs/
-
-Delete jobs
-
-    kubectl delete jobs `kubectl get jobs -o custom-columns=:.metadata.name`
diff --git a/experiments/ldcnn20k60.yaml b/experiments/ldcnn20k60.yaml
@@ -5,13 +5,11 @@ fmax: 11025
 n_fft: 1024
 hop_length: 512
 augmentations: 12
+augment: 1
 frames: 31
 batch: 400
 epochs: 50
 train_samples: 30000
 val_samples: 5000
-augment: 1
 voting: 'mean'
 voting_overlap: 0.5
-pool: '3x2'
-kernel: '5x5'
diff --git a/microesc/common.py b/microesc/common.py
@@ -30,8 +30,10 @@ def add_arguments(parser):
 
 def load_experiment(folder, name):
     path = os.path.join(folder, name+'.yaml')
+    return load_settings_path(path)
 
+def load_settings_path(path):
     with open(path, 'r') as config_file:
         settings = yaml.load(config_file.read())    
-    
+   
     return settings
diff --git a/microesc/jobs.py b/microesc/jobs.py
@@ -1,112 +1,79 @@
 
+import sys
 import os.path
-import uuid
+import subprocess
 import datetime
-import sys
+import uuid
 
-from . import common
-
-
-template = """
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: mesc-{kind}-{name}
-  labels:
-    jobgroup: microesc-{kind}
-spec:
-  template:
-    metadata:
-      name: microesc-{kind}
-      labels:
-        jobgroup: microesc-{kind}
-    spec:
-      containers:
-      - name: jobrunner
-        image: {image}
-        command: {command}
-        securityContext:
-          privileged: true
-          capabilities:
-            add:
-              - SYS_ADMIN
-        lifecycle:
-          postStart:
-            exec:
-              command: ["gcsfuse", "-o", "nonempty", "--implicit-dirs", {bucket}, {mountpoint}]
-          preStop:
-            exec:
-              command: ["fusermount", "-u", {mountpoint}]
-        resources:
-          requests:
-            cpu: "1.3"
-      restartPolicy: Never
-"""
-
-
-def array_str(a):
-    m  = ', '.join([ '"{}"'.format(p) for p in a ])
-    return '[ {} ]'.format(m)
-
-def render_job(image, script, args, mountpoint, bucket):
-    cmd = ["python3", "{}.py".format(script) ]
-
-    for k, v in args.items():
-        cmd += [ '--{}'.format(k), str(v) ]
-
-    p = dict(
-        image=image,
-        kind=script,
-        name=args['name'],
-        command=array_str(cmd),
-        bucket=bucket,
-        mountpoint=mountpoint,
-    )
-    s = template.format(**p)
-    return s
-
-def generate_train_jobs(settings, jobs_dir, image, experiment, out_dir, mountpoint, bucket):
-
-    t = datetime.datetime.now().strftime('%Y%m%d-%H%M') 
-    u = str(uuid.uuid4())[0:4]
-    name = "-".join([experiment, t, u])
+import pandas
+import numpy
 
-    folds = list(range(0, 9))
-  
-    for fold in folds:
-        args = {
-            'experiment': experiment,
-            'models': out_dir,
-            'fold': fold,
-            'name': name+'-fold{}'.format(fold),
-        }
-
-        s = render_job(image, 'train', args, mountpoint, bucket)
-
-        job_filename = "train-{}.yaml".format(fold)
-        out_path = os.path.join(jobs_dir, job_filename)
-        with open(out_path, 'w') as out:
-            out.write(s)
+from microesc import common
+
+def arglist(options):  # options: mapping of option name -> value
+    args = [ "--{}={}".format(k, v) for k, v in options.items() ]  # one "--name=value" string per option
+    return args
+
+def command_for_job(options):  # returns the argument list for one train.py invocation
+    args = [
+        'python3', 'train.py'
+    ]
+    args += arglist(options)  # append the options as "--name=value" arguments
+    return args
+
+def generate_train_jobs(experiments, settings_path, folds, overrides):
+
+    timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M') 
+    unique = str(uuid.uuid4())[0:4]   
+    def name(experiment, fold):
+        name = "-".join([experiment, timestamp, unique])
+        return name+'-fold{}'.format(fold)
+
+    def job(exname, experiment):
+
+        for fold in folds:
+            n = name(exname, fold)
+            
+            options = {
+                'name': n,
+                'fold': fold,
+                'settings': settings_path,
+            }
+            for k, v in experiment.items():
+                # overrides per experiment
+                options[k] = v
+
+            for k, v in overrides.items():
+                options[k] = v
+
+            cmd = command_for_job(options)
+            return cmd
+
+    # FIXME: better job name
+    jobs = [ job(str(idx), ex) for idx, ex in experiments.iterrows() ] 
+    return jobs
 
 def parse(args):
 
     import argparse
 
     parser = argparse.ArgumentParser(description='Generate jobs')
 
-    common.add_arguments(parser)
+    #common.add_arguments(parser)
 
     a = parser.add_argument
 
+    a('--models', default='models.csv',
+        help='%(default)s')
+    a('--settings', default='experiments/ldcnn20k60.yaml',
+        help='%(default)s')
+
 
     a('--jobs', dest='jobs_dir', default='./data/jobs',
         help='%(default)s')
 
-    a('--bucket', type=str, default='jonnor-micro-esc',
-        help='GCS bucket to write to. Default: %(default)s')
-
-    a('--image', type=str, default='gcr.io/masterthesis-231919/base:21',
-        help='Docker image to use')
+    a('--check', action='store_true',
+        help='Only run a pre-flight check')
     
     parsed = parser.parse_args(args)
 
@@ -115,16 +82,19 @@ def parse(args):
 def main():
     args = parse(sys.argv[1:])
 
-    mountpoint = '/mnt/bucket'
-    storage_dir = mountpoint+'/models'
+    models = pandas.read_csv(args.models)
+    settings = common.load_settings_path(args.settings)
 
-    name = args.experiment
-    settings = common.load_experiment(args.experiments_dir, name)
-
-    out = os.path.join(args.jobs_dir, name)
-    common.ensure_directories(out)
+    overrides = {}
+    folds = list(range(0, 9))
+    if args.check:
+        folds = (1,)
+        overrides['train_samples'] = settings['batch']*1
+        overrides['val_samples'] = settings['batch']*1
 
-    generate_train_jobs(settings, out, args.image, name, storage_dir, mountpoint, args.bucket)
-    print('wrote to', out)
+    cmds = generate_train_jobs(models, args.settings, folds, overrides)
 
+    print('\n'.join(" ".join(cmd) for cmd in cmds))
 
+if __name__ == '__main__':
+    main()
diff --git a/models.csv b/models.csv
@@ -1,33 +1,7 @@
-id,nick,conv_block,kernel_size,downsample,downsample_type,filters,    (ram_use,flash_use,maccs) (val_acc_avg, val_acc_std, test_acc_avg, test_acc_std)  (inference_time)
-0,SB-CNN,conv,5x5,3x2,maxpool,24
-1,Stride,conv,5x5,2x2,stride,
-2,DepthwiseSep,dw,5x5,2x2,stride,
-3,MobileNet,dw_pw,5x5,2x2,stride,
-4,MobileNetV2,pw_dw_pw,5x5,2x2,stride,
-
-5x9x
-
-# SpatiallySeparable. pw_sdw_pw
-
-Find out effect of better convolutional blocks on accuracy vs inference time.
-(and striding)
-(wide versus deep)
-(different voting overlaps)
-
-Stride in Keras/Tensorflow must be uniform.
-
-first all with 5x5 kernel, 2 intermediate blocks.
-Then can try 3x3 kernel, 3 intermediate blocks
-
-Use same learning rate for all.
-
-Adjust number of convolutions to make MACC approximately equal within groups.
-Ref Google paper keyword spotting. tstride/fstride?
-
-Should have a preflight check. Runs all models, in parallell, 1 epoch, 1 fold, 1/10 the samples.
-
-Then can test reporting tools based on that.
-
-Plot training curves together.
-
+conv_block,conv_size,downsample_size,downsample_type,filters
+conv,5x5,3x2,maxpool,24
+conv,5x5,2x2,stride,24
+dw,5x5,2x2,stride,24
+dw_pw,5x5,2x2,stride,24
+pw_dw_pw,5x5,2x2,stride,24
 
diff --git a/report/report.md b/report/report.md
@@ -715,6 +715,22 @@ The SB-CNN model was used as a base, with 30 mels bands. ST FP-SENSING1 function
 # Methods
 
 
+## Blabla
+<!---
+Find out effect of better convolutional blocks on accuracy vs inference time.
+(and striding)
+(wide versus deep)
+(different voting overlaps)
+-->
+
+Stride in Keras/Tensorflow must be uniform.
+
+First, all models use a 5x5 kernel and 2 intermediate blocks.
+Then a 3x3 kernel with 3 intermediate blocks can be tried.
+
+Adjust number of convolutions to make MACC approximately equal within groups.
+Ref Google paper keyword spotting. tstride/fstride?
+
 
 
 ## Model pipeline
diff --git a/run.py b/run.py