Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions tomo_challenge/classifiers/GPz_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
""" This is an example tomographic bin generator using the code GPz

Every classifier module needs to:
- have construction of the type
__init__ (self, bands, options) (see examples below)
- implement two functions:
train (self, training_data,training_z)
apply (self, data).
   - define valid_options class variable.

See Classifier Documentation below.
"""

# The only extra code it needs is GPz, which can be accessed at
#pip3 install --upgrade 'git+https://github.com/OxfordML/GPz#egg=GPz'
# This is unfortunately only in python2.7 at the moment...
# It also calls two python2 scripts (GPz is in python2), classifier_train_GPz.py and classifier_predict_GPz.py
# Train requires file_prefix to tell it where you put these files

## Options:
# bins - number of bins
# edge_strictness - how close to the edges of the redshift bins relative to the uncertainty on the redshift permitted (higher is stricter)
# extrapolate_threshold - how much extrapolation is permitted (lower is stricter). This is probably not hugely valuable here, but might be if the test and training data were different.

from .base import Tomographer
import numpy as np

import subprocess


class GPzBinning(Tomographer):
    """GPz tomographic bin classifier.

    Trains a GPz photo-z model via external python2 helper scripts
    (classifier_train_GPz.py / classifier_predict_GPz.py), then assigns
    each galaxy to a tomographic bin based on its predicted redshift and
    the associated uncertainties.  Data is exchanged with the helper
    scripts through CSV files in the working directory.
    """

    # Valid option keys -- meanings documented in __init__.
    valid_options = ['bins', 'edge_strictness', 'extrapolate_threshold']
    # This setting means arrays will be sent to train and apply instead
    # of dictionaries.
    wants_arrays = True

    def __init__(self, bands, options):
        """Constructor

        Parameters:
        -----------
        bands: str
            string containing valid bands, like 'riz' or 'griz'
        options: dict
            options come through here. Valid keys are listed as valid_options
            class variable.

        Note:
        -----
        Valid options are:
            'bins' - number of tomographic bins
            'edge_strictness' [default=0.0] - how close to the bin edges a
                galaxy's redshift may lie relative to its uncertainty
                (higher is stricter)
            'extrapolate_threshold' [default=1.0] - how much extrapolation
                is permitted (lower is stricter)
        """
        self.bands = bands
        self.opt = options

    def train(self, training_data, training_z, file_prefix):
        """Train the GPz model and compute the tomographic bin edges.

        Parameters:
        -----------
        training_data: numpy array, size Ngalaxies x Nbands
            training photometry, one row per galaxy
        training_z: numpy array, size Ngalaxies
            true redshifts of the training galaxies
        file_prefix: str
            path prefix telling us where classifier_train_GPz.py lives

        Raises:
        -------
        subprocess.CalledProcessError
            if the external python2 training script fails
        """
        # Hand the data to the (python2) GPz training script via CSV files;
        # it writes the trained model to 'model.obj' for apply() to use.
        np.savetxt('train_data.csv', training_data)
        np.savetxt('training_z.csv', training_z)

        # check=True: fail loudly if the helper script errors out, rather
        # than silently continuing with a stale or missing model file.
        subprocess.run(["python2", file_prefix + "classifier_train_GPz.py"],
                       check=True)

        # Find the edges that split the training redshifts into n_bin bins
        # of equal number counts in each.
        n_bin = self.opt['bins']
        print("Finding bins for training data")
        percentiles = np.linspace(0, 100, n_bin + 1)
        z_edges = np.percentile(training_z, percentiles)

        self.z_edges = z_edges

    def apply(self, testing_data, file_prefix):
        """Applies training to the data.

        Parameters:
        -----------
        testing_data: numpy array, size Ngalaxies x Nbands
            testing data, each row is a galaxy, each column is a band as per
            band defined above
        file_prefix: str
            path prefix telling us where classifier_predict_GPz.py lives

        Returns:
        --------
        tomographic_selections: numpy array, size Ngalaxies
            tomographic bin number for each galaxy; -1 marks galaxies that
            are not confidently assigned to any bin

        Raises:
        -------
        subprocess.CalledProcessError
            if the external python2 prediction script fails
        """
        # Hand the data to the (python2) GPz prediction script via CSV.
        np.savetxt('test_data.csv', testing_data)

        # check=True: do not read predictions if the script failed.
        subprocess.run(["python2", file_prefix + "classifier_predict_GPz.py"],
                       check=True)

        # Columns written by classifier_predict_GPz.py:
        # mean, variance, model variance, noise variance.
        # atleast_2d guards the single-galaxy case, where genfromtxt
        # would otherwise return a 1-D array and break column indexing.
        data = np.atleast_2d(np.genfromtxt('prediction_data.csv'))
        mu = data[:, 0]
        sigma = data[:, 1]   # predictive variance (despite the name)
        modelV = data[:, 2]  # model-variance component (extrapolation signal)
        noiseV = data[:, 3]  # noise-variance component (unused here)

        z_edges = self.z_edges
        n_bin = self.opt['bins']
        edge_strictness = self.opt['edge_strictness']
        extrapolate_threshold = self.opt['extrapolate_threshold']

        # Start everything unassigned; a galaxy gets bin j only if its
        # (mean +/- edge_strictness * std) interval lies strictly inside
        # the bin's redshift edges.
        std = sigma ** 0.5
        tomo_bin = np.full(mu.shape, -1.0)
        for j in range(n_bin):
            inside = ((mu - edge_strictness * std > z_edges[j]) &
                      (mu + edge_strictness * std < z_edges[j + 1]))
            tomo_bin[inside] = j

        # Reject galaxies where the model variance dominates: the model is
        # extrapolating there and the prediction should not be trusted.
        tomo_bin[modelV > extrapolate_threshold * sigma] = -1.0

        return tomo_bin

60 changes: 60 additions & 0 deletions tomo_challenge/classifiers/classifier_predict_GPz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# NOTE: this script must run under python2, because GPz itself is
# python2-only.  It is invoked as a subprocess by GPz_classifier.py,
# which exchanges data with it through CSV files in the working directory.
#
# All model/training options live in classifier_train_GPz.py: the trained
# model (including its configuration) is loaded whole from the pickle, so
# no option variables are needed here.
import pickle

import GPz
import numpy as np

########### Start of script ###########

# Load the model trained by classifier_train_GPz.py.
# 'rb' is required: pickle data is binary, and a context manager
# guarantees the file handle is closed.
with open('model.obj', 'rb') as file_pi:
    model = pickle.load(file_pi)

X_test = np.genfromtxt('test_data.csv')
n_test, d = X_test.shape

########### NOTE ###########
# You can train the model again, even using different data, by executing:
# model.train(model, X, Y, options)

# Use the model to generate predictions for the test set.
# predict() returns per-galaxy columns indexed as data[k][i][0]:
# mean, variance, model variance, noise variance.
data = model.predict(X_test[:, :].copy())

predictions = np.zeros([n_test, 4])
for i in range(n_test):
    predictions[i, 0] = data[0][i][0]  # mu
    predictions[i, 1] = data[1][i][0]  # variance
    predictions[i, 2] = data[2][i][0]  # model variance
    predictions[i, 3] = data[3][i][0]  # noise variance

# Hand the predictions back to GPz_classifier.apply().
np.savetxt('prediction_data.csv', predictions)

# Parenthesised form is valid under both python2 and python3.
print('predictions made')
62 changes: 62 additions & 0 deletions tomo_challenge/classifiers/classifier_train_GPz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# NOTE: this script must run under python2, because GPz itself is
# python2-only.  It is invoked as a subprocess by GPz_classifier.py,
# which exchanges data with it through CSV files in the working directory.
import pickle

import GPz
import numpy as np

########### Model options ###############

method = 'VC'            # select method, options = GL, VL, GD, VD, GC and VC [required]
m = 100                  # number of basis functions to use [required]
joint = True             # jointly learn a prior linear mean function [default=True]
heteroscedastic = True   # learn a heteroscedastic noise process; set to False
                         # if interested only in point estimates
csl_method = 'normal'    # cost-sensitive learning option [default='normal']:
                         # 'balanced':   weigh rare samples more heavily during training
                         # 'normalized': assigns an error cost for each sample = 1/(z+1)
                         # 'normal':     no weights assigned, all samples equally important
decorrelate = True       # preprocess the data using PCA [default=False]

########### Training options ###########

maxIter = 500      # maximum number of iterations [default=200]
maxAttempts = 50   # maximum iterations to attempt if there is no progress
                   # on the validation set [default=infinity]
trainSplit = 0.5   # fraction of data to use for training
validSplit = 0.5   # fraction of data to use for validation
testSplit = 0.0    # fraction of data to use for testing

########### Start of script ###########

# Input files written by GPzBinning.train().
X_train = np.genfromtxt('train_data.csv')
n_train, d = X_train.shape
Y_train = np.genfromtxt('training_z.csv').reshape(-1, 1)

# Sample training, validation and testing sets from the data.
training, validation, testing = GPz.sample(n_train, trainSplit, validSplit, testSplit)

# Get the weights for cost-sensitive learning.
omega = GPz.getOmega(Y_train, method=csl_method)

# Initialize the model.
model = GPz.GP(m, method=method, joint=joint, heteroscedastic=heteroscedastic,
               decorrelate=decorrelate)

# Train the model.
model.train(X_train.copy(), Y_train.copy(), omega=omega, training=training,
            validation=validation, maxIter=maxIter, maxAttempts=maxAttempts)

# Persist the trained model for classifier_predict_GPz.py.
# 'wb' + a context manager: pickle data is binary, and the file must be
# flushed and closed before the parent process reads it back.
with open('model.obj', 'wb') as file_pi:
    pickle.dump(model, file_pi)