Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions tomo_challenge/classifiers/GPz_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
""" This is an example tomographic bin generator using the code GPz

Every classifier module needs to:
- have construction of the type
__init__ (self, bands, options) (see examples below)
- implement two functions:
train (self, training_data,training_z)
apply (self, data).
   - define valid_options class variable.

See Classifier Documentation below.
"""

# The only extra code it needs is GPz, which can be accessed at
#pip3 install --upgrade 'git+https://github.com/OxfordML/GPz#egg=GPz'
# This is unfortunately only in python2.7 at the moment...
# It also calls two python2 scripts (GPz is in python2), classifier_train_GPz.py and classifier_predict_GPz.py
# Train requires file_prefix to tell it where you put these files

## Options:
# bins - number of bins
# edge_strictness - how close to the edges of the redshift bins relative to the uncertainty on the redshift permitted (higher is stricter)
# extrapolate_threshold - how much extrapolation is permitted (lower is stricter). This is probably not hugely valuable here, but might be if the test and training data were different.

from .base import Tomographer
import numpy as np

import subprocess


class GPzBinning(Tomographer):
    """GPz tomographic bin classifier.

    Trains a GPz photo-z model via external python2 helper scripts
    (classifier_train_GPz.py / classifier_predict_GPz.py), then assigns
    each galaxy to a tomographic bin based on its predicted redshift and
    the associated uncertainties.  Data is exchanged with the helper
    scripts through CSV files in the working directory.
    """

    # Valid option keys -- meanings documented in __init__.
    valid_options = ['bins', 'edge_strictness', 'extrapolate_threshold']
    # This setting means arrays will be sent to train and apply instead
    # of dictionaries.
    wants_arrays = True

    def __init__(self, bands, options):
        """Constructor

        Parameters:
        -----------
        bands: str
            string containing valid bands, like 'riz' or 'griz'
        options: dict
            options come through here. Valid keys are listed as valid_options
            class variable.

        Note:
        -----
        Valid options are:
            'bins' - number of tomographic bins
            'edge_strictness' [default=0.0] - how close to the bin edges a
                galaxy's redshift may lie relative to its uncertainty
                (higher is stricter)
            'extrapolate_threshold' [default=1.0] - how much extrapolation
                is permitted (lower is stricter)
        """
        self.bands = bands
        self.opt = options

    def train(self, training_data, training_z, file_prefix):
        """Train the GPz model and compute the tomographic bin edges.

        Parameters:
        -----------
        training_data: numpy array, size Ngalaxies x Nbands
            training photometry, one row per galaxy
        training_z: numpy array, size Ngalaxies
            true redshifts of the training galaxies
        file_prefix: str
            path prefix telling us where classifier_train_GPz.py lives

        Raises:
        -------
        subprocess.CalledProcessError
            if the external python2 training script fails
        """
        # Hand the data to the (python2) GPz training script via CSV files;
        # it writes the trained model to 'model.obj' for apply() to use.
        np.savetxt('train_data.csv', training_data)
        np.savetxt('training_z.csv', training_z)

        # check=True: fail loudly if the helper script errors out, rather
        # than silently continuing with a stale or missing model file.
        subprocess.run(["python2", file_prefix + "classifier_train_GPz.py"],
                       check=True)

        # Find the edges that split the training redshifts into n_bin bins
        # of equal number counts in each.
        n_bin = self.opt['bins']
        print("Finding bins for training data")
        percentiles = np.linspace(0, 100, n_bin + 1)
        z_edges = np.percentile(training_z, percentiles)

        self.z_edges = z_edges

    def apply(self, testing_data, file_prefix):
        """Applies training to the data.

        Parameters:
        -----------
        testing_data: numpy array, size Ngalaxies x Nbands
            testing data, each row is a galaxy, each column is a band as per
            band defined above
        file_prefix: str
            path prefix telling us where classifier_predict_GPz.py lives

        Returns:
        --------
        tomographic_selections: numpy array, size Ngalaxies
            tomographic bin number for each galaxy; -1 marks galaxies that
            are not confidently assigned to any bin

        Raises:
        -------
        subprocess.CalledProcessError
            if the external python2 prediction script fails
        """
        # Hand the data to the (python2) GPz prediction script via CSV.
        np.savetxt('test_data.csv', testing_data)

        # check=True: do not read predictions if the script failed.
        subprocess.run(["python2", file_prefix + "classifier_predict_GPz.py"],
                       check=True)

        # Columns written by classifier_predict_GPz.py:
        # mean, variance, model variance, noise variance.
        # atleast_2d guards the single-galaxy case, where genfromtxt
        # would otherwise return a 1-D array and break column indexing.
        data = np.atleast_2d(np.genfromtxt('prediction_data.csv'))
        mu = data[:, 0]
        sigma = data[:, 1]   # predictive variance (despite the name)
        modelV = data[:, 2]  # model-variance component (extrapolation signal)
        noiseV = data[:, 3]  # noise-variance component (unused here)

        z_edges = self.z_edges
        n_bin = self.opt['bins']
        edge_strictness = self.opt['edge_strictness']
        extrapolate_threshold = self.opt['extrapolate_threshold']

        # Start everything unassigned; a galaxy gets bin j only if its
        # (mean +/- edge_strictness * std) interval lies strictly inside
        # the bin's redshift edges.
        std = sigma ** 0.5
        tomo_bin = np.full(mu.shape, -1.0)
        for j in range(n_bin):
            inside = ((mu - edge_strictness * std > z_edges[j]) &
                      (mu + edge_strictness * std < z_edges[j + 1]))
            tomo_bin[inside] = j

        # Reject galaxies where the model variance dominates: the model is
        # extrapolating there and the prediction should not be trusted.
        tomo_bin[modelV > extrapolate_threshold * sigma] = -1.0

        return tomo_bin

60 changes: 60 additions & 0 deletions tomo_challenge/classifiers/classifier_predict_GPz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# NOTE: this script must run under python2, because GPz itself is
# python2-only.  It is invoked as a subprocess by GPz_classifier.py,
# which exchanges data with it through CSV files in the working directory.
#
# All model/training options live in classifier_train_GPz.py: the trained
# model (including its configuration) is loaded whole from the pickle, so
# no option variables are needed here.
import pickle

import GPz
import numpy as np

########### Start of script ###########

# Load the model trained by classifier_train_GPz.py.
# 'rb' is required: pickle data is binary, and a context manager
# guarantees the file handle is closed.
with open('model.obj', 'rb') as file_pi:
    model = pickle.load(file_pi)

X_test = np.genfromtxt('test_data.csv')
n_test, d = X_test.shape

########### NOTE ###########
# You can train the model again, even using different data, by executing:
# model.train(model, X, Y, options)

# Use the model to generate predictions for the test set.
# predict() returns per-galaxy columns indexed as data[k][i][0]:
# mean, variance, model variance, noise variance.
data = model.predict(X_test[:, :].copy())

predictions = np.zeros([n_test, 4])
for i in range(n_test):
    predictions[i, 0] = data[0][i][0]  # mu
    predictions[i, 1] = data[1][i][0]  # variance
    predictions[i, 2] = data[2][i][0]  # model variance
    predictions[i, 3] = data[3][i][0]  # noise variance

# Hand the predictions back to GPz_classifier.apply().
np.savetxt('prediction_data.csv', predictions)

# Parenthesised form is valid under both python2 and python3.
print('predictions made')
62 changes: 62 additions & 0 deletions tomo_challenge/classifiers/classifier_train_GPz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# NOTE: this script must run under python2, because GPz itself is
# python2-only.  It is invoked as a subprocess by GPz_classifier.py,
# which exchanges data with it through CSV files in the working directory.
import pickle

import GPz
import numpy as np

########### Model options ###############

method = 'VC'            # select method, options = GL, VL, GD, VD, GC and VC [required]
m = 100                  # number of basis functions to use [required]
joint = True             # jointly learn a prior linear mean function [default=True]
heteroscedastic = True   # learn a heteroscedastic noise process; set to False
                         # if interested only in point estimates
csl_method = 'normal'    # cost-sensitive learning option [default='normal']:
                         # 'balanced':   weigh rare samples more heavily during training
                         # 'normalized': assigns an error cost for each sample = 1/(z+1)
                         # 'normal':     no weights assigned, all samples equally important
decorrelate = True       # preprocess the data using PCA [default=False]

########### Training options ###########

maxIter = 500      # maximum number of iterations [default=200]
maxAttempts = 50   # maximum iterations to attempt if there is no progress
                   # on the validation set [default=infinity]
trainSplit = 0.5   # fraction of data to use for training
validSplit = 0.5   # fraction of data to use for validation
testSplit = 0.0    # fraction of data to use for testing

########### Start of script ###########

# Input files written by GPzBinning.train().
X_train = np.genfromtxt('train_data.csv')
n_train, d = X_train.shape
Y_train = np.genfromtxt('training_z.csv').reshape(-1, 1)

# Sample training, validation and testing sets from the data.
training, validation, testing = GPz.sample(n_train, trainSplit, validSplit, testSplit)

# Get the weights for cost-sensitive learning.
omega = GPz.getOmega(Y_train, method=csl_method)

# Initialize the model.
model = GPz.GP(m, method=method, joint=joint, heteroscedastic=heteroscedastic,
               decorrelate=decorrelate)

# Train the model.
model.train(X_train.copy(), Y_train.copy(), omega=omega, training=training,
            validation=validation, maxIter=maxIter, maxAttempts=maxAttempts)

# Persist the trained model for classifier_predict_GPz.py.
# 'wb' + a context manager: pickle data is binary, and the file must be
# flushed and closed before the parent process reads it back.
with open('model.obj', 'wb') as file_pi:
    pickle.dump(model, file_pi)