diff --git a/tomo_challenge/classifiers/GPz_classifier.py b/tomo_challenge/classifiers/GPz_classifier.py
new file mode 100644
index 00000000..47d370a7
--- /dev/null
+++ b/tomo_challenge/classifiers/GPz_classifier.py
@@ -0,0 +1,158 @@
+""" This is an example tomographic bin generator using the code GPz
+
+Every classifier module needs to:
+ - have construction of the type
+       __init__ (self, bands, options) (see examples below)
+ - implement two functions:
+       train (self, training_data, training_z)
+       apply (self, data).
+ - define valid_options class varible.
+
+See Classifier Documentation below.
+"""
+
+# The only extra code it needs is GPz, which can be accessed at
+# pip3 install --upgrade 'git+https://github.com/OxfordML/GPz#egg=GPz'
+# This is unfortunately only in python2.7 at the moment...
+# It also calls two python2 scripts (GPz is in python2),
+# classifier_train_GPz.py and classifier_predict_GPz.py.
+# Train requires file_prefix to tell it where you put these files.
+
+## Options:
+# bins - number of bins
+# edge_strictness - how close to the edges of the redshift bins relative to
+#     the uncertainty on the redshift permitted (higher is stricter)
+# extrapolate_threshold - how much extrapolation is permitted (lower is
+#     stricter). This is probably not hugely valuable here, but might be if
+#     the test and training data were different.
+
+from .base import Tomographer
+import numpy as np
+
+import subprocess
+
+
+class GPzBinning(Tomographer):
+    """ GPz Classifier """
+
+    # valid parameter -- see below
+    valid_options = ['bins', 'edge_strictness', 'extrapolate_threshold']
+    # this setting means arrays will be sent to train and apply instead
+    # of dictionaries
+    wants_arrays = True
+
+    def __init__(self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+            string containg valid bands, like 'riz' or 'griz'
+        options: dict
+            options come through here. Valid keys are listed as
+            valid_options class variable.
+
+        Note:
+        -----
+        Valid options are:
+            'bins' - number of tomographic bins
+            'edge_strictness' [default=0.0] Essentially how big error bars
+                can be compared to the bin edges
+            'extrapolate_threshold' [default=1.0] Essentially how much
+                extrapolation should be allowed
+        """
+        self.bands = bands
+        self.opt = options
+
+    def train(self, training_data, training_z, file_prefix):
+        """Train GPz (via the external python2 script) and compute bin edges.
+
+        Writes the training set to CSV, runs classifier_train_GPz.py (which
+        pickles the fitted model to model.obj), then derives equal-count
+        redshift bin edges from training_z and stores them on self.
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxies x Nbands
+        training_z: numpy array, size Ngalaxies
+            true redshifts for the training sample
+        file_prefix: str
+            directory prefix where the python2 helper scripts live
+        """
+        X_train = training_data
+        n_train, d = X_train.shape
+
+        # Hand the data to the python2 side via flat files
+        np.savetxt('train_data.csv', training_data)
+        np.savetxt('training_z.csv', training_z)
+
+        # GPz itself is python2-only, so training runs out of process
+        subprocess.run(["python2", file_prefix + "classifier_train_GPz.py"])
+
+        # Sort out bin edges
+        n_bin = self.opt['bins']
+        print("Finding bins for training data")
+        # Now put the training data into redshift bins.
+        # Use zero so that the one object with minimum
+        # z in the whole survey will be in the lowest bin
+        training_bin = np.zeros(training_z.size)
+
+        # Find the edges that split the redshifts into n_z bins of
+        # equal number counts in each
+        p = np.linspace(0, 100, n_bin + 1)
+        z_edges = np.percentile(training_z, p)
+
+        # Now find all the objects in each of these bins
+        for i in range(n_bin):
+            z_low = z_edges[i]
+            z_high = z_edges[i + 1]
+            training_bin[(training_z > z_low) & (training_z < z_high)] = i
+
+        # for speed, cut down to 5% of original size
+        # NOTE(review): training_bin/training_data are cut but never used
+        # afterwards -- leftover from the template this was copied from.
+        cut = np.random.uniform(0, 1, training_z.size) < 0.05
+        training_bin = training_bin[cut]
+        training_data = training_data[cut]
+
+        # self.photoz_predictor = model
+        self.z_edges = z_edges
+
+    def apply(self, testing_data, file_prefix):
+        """Applies training to the data.
+
+        Parameters:
+        -----------
+        Data: numpy array, size Ngalaxes x Nbands
+            testing data, each row is a galaxy, each column is a band as per
+            band defined above
+        file_prefix: str
+            directory prefix where the python2 helper scripts live
+
+        Returns:
+        tomographic_selections: numpy array, int, size Ngalaxies
+            tomographic selection for galaxies return as bin number for
+            each galaxy (-1 means assigned to no bin).
+        """
+        # Save data
+        np.savetxt('test_data.csv', testing_data)
+
+        # Run GPz for predictions
+        subprocess.run(["python2", file_prefix + "classifier_predict_GPz.py"])
+
+        # Columns written by classifier_predict_GPz.py:
+        # mean, total variance, model variance, noise variance
+        data = np.genfromtxt('prediction_data.csv')
+        mu = data[:, 0]
+        sigma = data[:, 1]
+        modelV = data[:, 2]
+        noiseV = data[:, 3]
+
+        z_edges = self.z_edges
+        n_bin = self.opt['bins']
+
+        edge_strictness = self.opt['edge_strictness']
+        extrapolate_threshold = self.opt['extrapolate_threshold']
+
+        tomo_bin = 0 * mu
+
+        for i in range(len(mu)):
+
+            # default: object lands in no bin
+            tomo_bin[i] = -1
+
+            for j in range(n_bin):
+
+                # NOTE(review): this span was corrupted in the original diff
+                # (text between a '<' and the following '>' was stripped, as
+                # if it were an HTML tag). Reconstructed from the surviving
+                # fragments and the option comments above: the object is
+                # assigned to bin j only if it sits inside the bin with an
+                # edge_strictness * sqrt(variance) margin on BOTH edges.
+                # Confirm against the upstream repository.
+                if mu[i] - edge_strictness * sigma[i]**0.5 > z_edges[j] and \
+                   mu[i] + edge_strictness * sigma[i]**0.5 < z_edges[j + 1]:
+                    tomo_bin[i] = j
+
+            # Reject objects whose model variance indicates the GP is
+            # extrapolating beyond the training data.
+            if modelV[i] > extrapolate_threshold * sigma[i]:
+                tomo_bin[i] = -1
+
+        return tomo_bin
diff --git a/tomo_challenge/classifiers/classifier_predict_GPz.py b/tomo_challenge/classifiers/classifier_predict_GPz.py
new file mode 100644
index 00000000..7af145d4
--- /dev/null
+++ b/tomo_challenge/classifiers/classifier_predict_GPz.py
@@ -0,0 +1,60 @@
+# Python 2 helper: load the pickled GPz model and write predictions to CSV.
+import GPz
+from numpy import *
+import matplotlib.pyplot as plt
+from scipy.stats import gaussian_kde
+import numpy as np
+
+import h5py
+
+
+########### Model options ###############
+
+method = 'VC'            # select method, options = GL, VL, GD, VD, GC and VC [required]
+m = 100                  # number of basis functions to use [required]
+joint = True             # jointly learn a prior linear mean function [default=true]
+heteroscedastic = True   # learn a heteroscedastic noise process, set to false interested only in point estimates
+csl_method = 'normal'    # cost-sensitive learning option: [default='normal']
+                         # 'balanced':   to weigh rare samples more heavly during train
+                         # 'normalized': assigns an error cost for each sample = 1/(z+1)
+                         # 'normal':     no weights assigned, all samples are equally important
+binWidth = 0.1           # the width of the bin for 'balanced' cost-sensitive learning [default=range(z_spec)/100]
+decorrelate = True       # preprocess the data using PCA [default=False]
+
+########### Training options ###########
+
+maxIter = 500            # maximum number of iterations [default=200]
+maxAttempts = 50         # maximum iterations to attempt if there is no progress on the validation set [default=infinity]
+trainSplit = 0.5         # percentage of data to use for training
+validSplit = 0.5         # percentage of data to use for validation
+testSplit = 0.0          # percentage of data to use for testing
+
+########### Start of script ###########
+
+import pickle
+file_pi = open('model.obj', 'r')
+model = pickle.load(file_pi)
+
+X_test = np.genfromtxt('test_data.csv')
+n_test, d = X_test.shape
+
+
+########### NOTE ###########
+# you can train the model gain, eve using different data, by executing:
+# model.train(model,X,Y,options)
+
+# use the model to generate predictions for the test set
+data = model.predict(X_test[:, :].copy())
+
+predictions = np.zeros([n_test, 4])
+
+for i in range(n_test):
+    predictions[i, 0] = data[0][i][0]  # mu
+    predictions[i, 1] = data[1][i][0]  # variance
+    predictions[i, 2] = data[2][i][0]  # modelVariance
+    predictions[i, 3] = data[3][i][0]  # noiseVariance
+
+
+np.savetxt('prediction_data.csv', predictions)
+
+print 'predictions made'
diff --git a/tomo_challenge/classifiers/classifier_train_GPz.py b/tomo_challenge/classifiers/classifier_train_GPz.py
new file mode 100644
index 00000000..b0e2f25f
--- /dev/null
+++ b/tomo_challenge/classifiers/classifier_train_GPz.py
@@ -0,0 +1,62 @@
+# Python 2 helper: train a GPz model on the CSVs written by GPzBinning.train
+# and pickle the fitted model to model.obj.
+import GPz
+from numpy import *
+import matplotlib.pyplot as plt
+from scipy.stats import gaussian_kde
+import numpy as np
+
+import h5py
+
+
+########### Model options ###############
+
+method = 'VC'            # select method, options = GL, VL, GD, VD, GC and VC [required]
+m = 100                  # number of basis functions to use [required]
+joint = True             # jointly learn a prior linear mean function [default=true]
+heteroscedastic = True   # learn a heteroscedastic noise process, set to false interested only in point estimates
+csl_method = 'normal'    # cost-sensitive learning option: [default='normal']
+                         # 'balanced':   to weigh rare samples more heavly during train
+                         # 'normalized': assigns an error cost for each sample = 1/(z+1)
+                         # 'normal':     no weights assigned, all samples are equally important
+binWidth = 0.1           # the width of the bin for 'balanced' cost-sensitive learning [default=range(z_spec)/100]
+decorrelate = True       # preprocess the data using PCA [default=False]
+
+########### Training options ###########
+
+maxIter = 500            # maximum number of iterations [default=200]
+maxAttempts = 50         # maximum iterations to attempt if there is no progress on the validation set [default=infinity]
+trainSplit = 0.5         # percentage of data to use for training
+validSplit = 0.5         # percentage of data to use for validation
+testSplit = 0.0          # percentage of data to use for testing
+
+########### Start of script ###########
+
+
+X_train = np.genfromtxt('train_data.csv')
+n_train, d = X_train.shape
+Y_train = np.genfromtxt('training_z.csv')
+Y_train = Y_train.reshape(-1, 1)
+
+
+# sample training, validation and testing sets from the data
+training, validation, testing = GPz.sample(n_train, trainSplit, validSplit, testSplit)
+
+
+# get the weights for cost-sensitive learning
+omega = GPz.getOmega(Y_train, method=csl_method)
+
+
+# initialize the initial model
+model = GPz.GP(m, method=method, joint=joint, heteroscedastic=heteroscedastic, decorrelate=decorrelate)
+
+# train the model
+model.train(X_train.copy(), Y_train.copy(), omega=omega, training=training, validation=validation, maxIter=maxIter, maxAttempts=maxAttempts)
+
+
+import pickle
+file_pi = open('model.obj', 'w')
+pickle.dump(model, file_pi)