From 46fd9685c8c78618d0200c1319fb40895cde791f Mon Sep 17 00:00:00 2001 From: cavuoti Date: Wed, 15 Jul 2020 11:34:27 +0200 Subject: [PATCH] MLPQNA model --- tomo_challenge/classifiers/mlpqna.py | 132 +++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 tomo_challenge/classifiers/mlpqna.py diff --git a/tomo_challenge/classifiers/mlpqna.py b/tomo_challenge/classifiers/mlpqna.py new file mode 100644 index 00000000..f9cf4902 --- /dev/null +++ b/tomo_challenge/classifiers/mlpqna.py @@ -0,0 +1,132 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +from sklearn.neural_network import MLPClassifier as mlp +from sklearn.preprocessing import StandardScaler + + + + +class mlpqna(Tomographer): + """ Random Forest Classifier """ + + # valid parameter -- see below + valid_options = ['bins'] + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valiad options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. + # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # for speed, cut down to 5% of original size + cut = np.random.uniform(0, 1, training_z.size) < 0.05 + training_bin = training_bin[cut] + training_data = training_data[cut] + + # Can be replaced with any classifier + actFunc ="tanh" + decayVal =0.1 + h1Layers = 2 + h1LayerNodes = 2* training_data.shape[1] -1 + h2LayerNodes = training_data.shape[1]+1 + trainEpochs = 20000 + seed = 3214234 + tolerance = 1e-20 + verboseFlag = True + warmFlag = False + maxFun = 30000 + classifier= mlp(activation=actFunc, solver='lbfgs', alpha=decayVal, hidden_layer_sizes=(h1LayerNodes, h2LayerNodes), max_iter=trainEpochs, random_state=seed, tol=tolerance, verbose=verboseFlag, warm_start=warmFlag, max_fun=maxFun) + scaler = StandardScaler() + # Fit only to the training data + scaler.fit(training_data) + # Now apply the transformations to the data: + X_norm_train = scaler.transform(training_data) + + print("Fitting classifier") + # Lots of data, so this will take some time + classifier.fit(X_norm_train, training_bin) + self.scaler=scaler + self.classifier = classifier + self.z_edges = z_edges + + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + X_norm_data = self.scaler.transform(data) + tomo_bin = self.classifier.predict(X_norm_data) + return tomo_bin +