diff --git a/bin/challenge.py b/bin/challenge.py index 44123ed1..34865ae8 100755 --- a/bin/challenge.py +++ b/bin/challenge.py @@ -10,6 +10,8 @@ sys.path.append(root_dir) import tomo_challenge as tc +from tomo_challenge.jax_metrics import compute_scores as jc_compute_scores + @click.command() @click.argument('config_yaml', type=str) def main(config_yaml): @@ -62,7 +64,7 @@ def main(config_yaml): config['metrics']) output_file.write (f"{classifier_name} {run} {settings} {scores} \n") - + print("scores= ",scores) def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, @@ -75,12 +77,22 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, train_data = tc.dict_to_array(train_data, bands, errors=errors, colors=colors) valid_data = tc.dict_to_array(valid_data, bands, errors=errors, colors=colors) + + #JEC 23/7/2020 restrict data + train_data=train_data[:1000000,:] + valid_data=valid_data[:50000,:] + + + #JEC 23/7/2020 restrict data + train_z = train_z[:1000000] + valid_z = valid_z[:50000] + print ("Executing: ", classifier_name, bands, settings) ## first check if options are valid for key in settings.keys(): if key not in classifier.valid_options and key not in ['errors', 'colors']: - raise ValueError(f"Key {key} is not recognized by classifier {name}") + raise ValueError(f"Key {key} is not recognized by classifier {classifier_name}") print ("Initializing classifier...") C=classifier(bands, settings) @@ -92,7 +104,9 @@ def run_one(classifier_name, bands, settings, train_data, train_z, valid_data, results = C.apply(valid_data) print ("Getting metric...") - scores = tc.compute_scores(results, valid_z, metrics=metrics) + # scores = tc.compute_scores(results, valid_z, metrics=metrics) + #Use JAX code 23/7/2020 + scores = jc_compute_scores(results, valid_z, metrics="SNR_3x2,FOM_3x2,FOM_DETF_3x2") return scores diff --git a/data/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.joblib b/data/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.joblib new file mode 100644 index 00000000..cc2cb44a Binary files /dev/null and b/data/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.joblib differ diff --git a/example/my_GB.yaml b/example/jec_GB.yaml similarity index 68% rename from example/my_GB.yaml rename to example/jec_GB.yaml index e229bc54..096cbe91 100644 --- a/example/my_GB.yaml +++ b/example/jec_GB.yaml @@ -1,14 +1,15 @@ metrics: [SNR_ww, SNR_3x2, FOM_3x2] -bands: riz +bands: griz training_file: data/training.hdf5 validation_file: data/validation.hdf5 -output_file: example/my_GB_output.txt +output_file: example/my_GB_output-6bins-50estim-griz-testDump.txt run: # This is a class name which will be looked up myGradientBosster: run_3: - # my new arg for the classifier + # to save file + savefile: /Users/campagne/Travail/Software/tomo_challenge-orig/data/jec_GB-6bins-50estim-griz-testDump.joblib n_estimators: 50 # This setting is sent to the classifier bins: 6 diff --git a/example/jec_MultiClf_10bins_50x2est_JAX_opt9_BEST_DETF.txt b/example/jec_MultiClf_10bins_50x2est_JAX_opt9_BEST_DETF.txt new file mode 100644 index 00000000..076f0710 --- /dev/null +++ b/example/jec_MultiClf_10bins_50x2est_JAX_opt9_BEST_DETF.txt @@ -0,0 +1 @@ +myCombinedClassifiers run_3 {'savefile': '/sps/lsst/users/campagne/tomo_challenge/data/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.joblib', 'n_estimators': 50, 'bins': 10, 'colors': True, 'errors': False} {'SNR_3x2': 1770.5667724609375, 'FOM_3x2': 11953.8701171875, 'FOM__DETF_3x2': 176.66757202148438} diff --git a/example/jec_multiclf.yaml b/example/jec_multiclf.yaml new file mode 100644 index 00000000..10b311fd --- /dev/null +++ b/example/jec_multiclf.yaml @@ -0,0 +1,19 @@ +metrics: [SNR_3x2] +bands: griz +training_file: data/training.hdf5 +validation_file: data/validation.hdf5 +output_file: example/jec_MultiClf_10bins_50x2est_JAX_opt9_BEST_DETF.txt + +run: + # This is a class name which will be looked up + myCombinedClassifiers: + run_3: + savefile: /sps/lsst/users/campagne/tomo_challenge/data/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.joblib + n_estimators: 50 + # This setting is sent to the classifier + bins: 10 + # These special settings decide whether the + # color and error colums are passed to the classifier + # as well as the magnitudes + colors: True + errors: False diff --git a/requirements.txt b/requirements.txt index 863f11fd..d9b42808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ scikit-learn sacc -git+https://github.com/LSSTDESC/firecrown/ +#git+https://github.com/LSSTDESC/firecrown/ sacc h5py matplotlib git+https://github.com/cmbant/camb -cosmosis-standalone -pyccl +#cosmosis-standalone +#pyccl camb click progressbar diff --git a/tomo_challenge/classifiers/jec_CombineClassifier.py b/tomo_challenge/classifiers/jec_CombineClassifier.py new file mode 100644 index 00000000..499198ef --- /dev/null +++ b/tomo_challenge/classifiers/jec_CombineClassifier.py @@ -0,0 +1,200 @@ +""" +This is an example tomographic bin generator using a random forest. + +Every classifier module needs to: + - have construction of the type + __init__ (self, bands, options) (see examples below) + - implement two functions: + train (self, training_data,training_z) + apply (self, data). + - define valid_options class varible. + +See Classifier Documentation below. +""" + +from .base import Tomographer +import numpy as np +from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier +from sklearn.svm import LinearSVC +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import make_pipeline +from sklearn.ensemble import StackingClassifier +from sklearn.linear_model import LogisticRegression +#JEC 15/7/20 use joblib to save the model +from joblib import dump, load + + +class myCombinedClassifiers(Tomographer): + """ Multi Classifiers """ + + # valid parameter -- see below + # JEC 15/7/20 savefile opt to dump the model + valid_options = ['bins', 'n_estimators', 'savefile'] + + # this settings means arrays will be sent to train and apply instead + # of dictionaries + wants_arrays = True + + def __init__ (self, bands, options): + """Constructor + + Parameters: + ----------- + bands: str + string containg valid bands, like 'riz' or 'griz' + options: dict + options come through here. Valid keys are listed as valid_options + class variable. + + Note: + ----- + Valiad options are: + 'bins' - number of tomographic bins + + """ + self.bands = bands + self.opt = options + + def train (self, training_data, training_z): + """Trains the classifier + + Parameters: + ----------- + training_data: numpy array, size Ngalaxes x Nbands + training data, each row is a galaxy, each column is a band as per + band defined above + training_z: numpy array, size Ngalaxies + true redshift for the training sample + + """ + #JEC + n_estimators=self.opt['n_estimators'] + + n_bin = self.opt['bins'] + print("Finding bins for training data") + # Now put the training data into redshift bins. + # Use zero so that the one object with minimum + # z in the whole survey will be in the lowest bin + training_bin = np.zeros(training_z.size) + + # Find the edges that split the redshifts into n_z bins of + # equal number counts in each + p = np.linspace(0, 100, n_bin + 1) + z_edges = np.percentile(training_z, p) + + + + #JEC 23/7/2020 + #Resultat de la sepoaration a # de gal constant per bin +## z_edges = np.array([0.02450252, 0.29473031, 0.44607811, 0.58123241,\ +## 0.70399809,0.8120476 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + +## #Chge1 z_edges[1] + +## z_edges = np.array([0.02450252, 0.15961642, 0.44607811, 0.58123241,\ +## 0.70399809,0.8120476 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + +## #Chge2 z_edges[2] + +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.58123241,\ +## 0.70399809,0.8120476 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + + +## #Chge2 z_edges[3] +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.70399809,0.8120476 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + + + +## #Chge2 z_edges[4] +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.58955, 0.8120476 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + + + +## #Chge2 z_edges[5] +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.58955, 0.700775 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + + +## #Chge2 z_edges[6] apres des essais +/- pas d'amlioration +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.58955, 0.700775 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + + +## #Chge2 z_edges[7] apres des essais +/- pas d'amlioration +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.58955, 0.700775 , 0.91746771, 1.03793607,\ +## 1.20601892, 1.49989052, 3.03609133]) + +## #Chge2 z_edges[8] +## z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ +## 0.58955, 0.700775 , 0.91746771, 1.03793607,\ +## 1.352945, 1.49989052, 3.03609133]) + + #Chge2 z_edges[9] + z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ + 0.58955, 0.700775 , 0.91746771, 1.03793607,\ + 1.352945, 1.8, 3.03609133]) + + + + + print("new set: ",z_edges) + + + + # Now find all the objects in each of these bins + for i in range(n_bin): + z_low = z_edges[i] + z_high = z_edges[i + 1] + training_bin[(training_z > z_low) & (training_z < z_high)] = i + + # for speed, cut down to 5% of original size + cut = np.random.uniform(0, 1, training_z.size) < 0.05 + training_bin = training_bin[cut] + training_data = training_data[cut] + + # Can be replaced with any classifier + + + estimators = [ ('gd', make_pipeline(StandardScaler(), GradientBoostingClassifier(n_estimators=n_estimators, verbose=1))), + ('rf', make_pipeline(StandardScaler(), + RandomForestClassifier(n_estimators=n_estimators,verbose=1))) ] + classifier = StackingClassifier(estimators=estimators, + final_estimator=LogisticRegression(max_iter=5000)) + + print("Fitting classifier") + # Lots of data, so this will take some time + classifier.fit(training_data, training_bin) + + self.classifier = classifier + self.z_edges = z_edges + + #JEC 15/7/20 dump clf + dump(classifier, self.opt['savefile']) + + def apply (self, data): + """Applies training to the data. + + Parameters: + ----------- + Data: numpy array, size Ngalaxes x Nbands + testing data, each row is a galaxy, each column is a band as per + band defined above + + Returns: + tomographic_selections: numpy array, int, size Ngalaxies + tomographic selection for galaxies return as bin number for + each galaxy. + """ + tomo_bin = self.classifier.predict(data) + return tomo_bin + diff --git a/tomo_challenge/classifiers/my_GB.py b/tomo_challenge/classifiers/jec_GB.py similarity index 100% rename from tomo_challenge/classifiers/my_GB.py rename to tomo_challenge/classifiers/jec_GB.py diff --git a/tomo_challenge/jax_metrics.py b/tomo_challenge/jax_metrics.py new file mode 100644 index 00000000..e51c462d --- /dev/null +++ b/tomo_challenge/jax_metrics.py @@ -0,0 +1,206 @@ +import jax.numpy as np +import jax.random as rand +from jax import lax, jit, vmap, grad +from functools import partial +import jax_cosmo as jc +import jax + +def ell_binning(): + # we put this here to make sure it's used consistently + # plausible limits I guess + ell_max = 2000 + n_ell = 100 + # choose ell bins from 10 .. 2000 log spaced + ell_edges = np.logspace(2, np.log10(ell_max), n_ell+1) + ell = 0.5*(ell_edges[1:]+ell_edges[:-1]) + delta_ell =(ell_edges[1:]-ell_edges[:-1]) + return ell, delta_ell + +def get_probes(weights, labels, kernel_bandwidth=0.05, what='3x2'): + """ + JAX function that builds the 3x2pt probes, which can + then be used within any metric + """ + # pretend there is no evolution in measurement error. + # because LSST is awesome + sigma_e = 0.26 + + # assumed total over all bins, divided proportionally + n_eff_total_arcmin2 = 20.0 + + # Get the number of galaxiex. + ngal = len(weights) + nbins = weights.shape[-1] + + # # Generate CCL tracers and get total counts, for the noise + counts = [] + for b in range(nbins): + # total number of objects in this histogram bin + counts.append(weights[:,b].sum()) + + # Get the fraction of the total possible number of objects + # in each bin, and the consequent effective number density. + # We pretend here that all objects have the same weight. + # JAX modif: here instead we still know that all galaxies add up to 1, + # but they have bin specific weights + fractions = np.array([c / len(labels) for c in counts]) + n_eff = np.array([n_eff_total_arcmin2 * f for f in fractions]) + # Create redshift bins + nzs = [] + for i in range(nbins): + nzs.append(jc.redshift.kde_nz(labels, weights[:,i], bw=kernel_bandwidth, + gals_per_arcmin2=n_eff[i], zmax=4.)) + + probes = [] + # start with number counts + if (what == 'gg' or what == '3x2'): + # Define a bias parameterization + bias = jc.bias.inverse_growth_linear_bias(1.) + probes.append(jc.probes.NumberCounts(nzs, bias)) + + if (what == 'ww' or what == '3x2'): + probes.append(jc.probes.WeakLensing(nzs, sigma_e=sigma_e)) + + return probes + +def compute_snr_score(weights, labels, what='3x2'): + """Compute a score metric based on the total spectrum S/N + This is given by sqrt(mu^T . C^{-1} . mu) - baseline + where mu is the theory prediction and C the Gaussian covariance + for this set of bins. The baseline is the score for no tomographic binning. + Parameters + ---------- + tomo_bin: array + Tomographic bin choice (0 .. bin_max) for each object in the survey + z: array + True redshift for each object + Returns + ------- + score: float + Metric for this configuration + """ + # Retrieve the probes + probes = get_probes(weights, labels, what=what) + ell, delta_ell = ell_binning() + + @jax.jit + def snr_fn(probes, ell): + # instantiates fiducial cosmology + cosmo = jc.Cosmology( + Omega_c = 0.27, + Omega_b = 0.045, + h = 0.67, + n_s = 0.96, + sigma8 = 0.8404844953840714, + Omega_k=0., + w0=-1., wa=0.) + + # Compute mean and covariance + mu, C = jc.angular_cl.gaussian_cl_covariance_and_mean(cosmo, ell, probes, + f_sky=0.25, nonlinear_fn=jc.power.halofit) + + # S/N for correlated data, I assume, from generalizing + # sqrt(sum(mu**2/sigma**2)) + P = np.linalg.inv(C) + score = (mu.T @ P @ mu)**0.5 + return score + + return snr_fn(probes, ell) + +def compute_fom(weights, labels, inds=[0,4], what='3x2'): + """ + Computes the omega_c, sigma8 Figure of Merit + Actually the score returned is - area, I think it's more stable + """ + # Retrieve the probes + probes = get_probes(weights, labels, what=what) + ell, delta_ell = ell_binning() + + @jax.jit + def fom_fn(probes, ell): + # Compute the derivatives of the data vector + def mean(params): + cosmo = jc.Cosmology( + Omega_c = params[0], + Omega_b = params[1], + h = params[2], + n_s = params[3], + sigma8 = params[4], + Omega_k=0., + w0=params[5], wa=params[6] + ) + return jc.angular_cl.angular_cl(cosmo, ell, probes, nonlinear_fn=jc.power.halofit) + + # Compute the jacobian of the data vector at fiducial cosmology + fid_params = np.array([0.27, 0.045, 0.67, 0.96, 0.840484495, -1.0, 0.0]) + jac_mean = jax.jacfwd(lambda x: mean(x).flatten()) + + mu = mean(fid_params) + dmu = jac_mean(fid_params) + + # Compute the covariance matrix + cl_noise = jc.angular_cl.noise_cl(ell, probes) + C = jc.angular_cl.gaussian_cl_covariance(ell, probes, mu, cl_noise) + + invCov = np.linalg.inv(C) + + # Compute Fisher matrix for constant covariance + F = np.einsum('pa,pq,qb->ab', dmu, invCov, dmu) + + # Compute covariance + i,j = inds + covmat_chunk = np.linalg.inv(F)[:, [i, j]][[i, j], :] + + # And get the FoM, the inverse area of the 2 sigma contour + # area. + area = 6.17 * np.pi * np.sqrt(np.linalg.det(covmat_chunk)) + + return 1. / area + + return fom_fn(probes, ell) + +def compute_scores(tomo_bin, z, metrics='all'): + """Compute a set of score metrics. + Metric 1 + ======== + Score metric based on the total spectrum S/N + This is given by sqrt(mu^T . C^{-1} . mu) - baseline + where mu is the theory prediction and C the Gaussian covariance + for this set of bins. The baseline is the score for no tomographic binning. + Metric 2 + ======== + WL FoM in (currently) Omega_c - sigma_8. + Generated using a Fisher matrix calculation + Parameters + ---------- + tomo_bin: array + Tomographic bin choice (0 .. bin_max) for each object in the survey + z: array + True redshift for each object + metrics: str or list of str + Which metrics to compute. If all it will return all metrics, + otherwise just those required (see below) + Returns + ------- + scores: dict + A dictionary of scores. The following dict keys are present + "SNR_ww", "SNR_gg", "SNR_3x2": float + SNR scores for shear-shear, galaxy clustering and full 3x2pt + "FOM_ww", "FOM_gg", "FOM_3x2": float + FOM metric derived from SNR above + """ + tomo_bin = jax.nn.one_hot(tomo_bin, tomo_bin.max() + 1) + scores = {} + if metrics == 'all': + metrics = ["SNR_ww", "SNR_gg", "SNR_3x2", + "FOM_ww", "FOM_gg", "FOM_3x2", + "FOM_DETF_ww", "FOM_DETF_gg", "FOM_DETF_3x2"] + for what in ["ww", "gg", "3x2"]: + if ("SNR_"+what in metrics) or ("FOM_"+what in metrics): + scores['SNR_'+what] = float(compute_snr_score(tomo_bin, z, what=what)) + if "FOM_"+what in metrics: + scores['FOM_'+what] = float(compute_fom(tomo_bin, z, what=what)) + if "FOM_DETF_"+what in metrics: + scores['FOM__DETF_'+what] = float(compute_fom(tomo_bin, z, inds=[5,6], + what=what)) + return scores diff --git a/tomo_challenge/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.png b/tomo_challenge/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.png new file mode 100644 index 00000000..2baba604 Binary files /dev/null and b/tomo_challenge/jec__MultiClf-GB+RFNoDepthLimit-StdScaler-10bins-50estim-griz-JAX-Opt9_BEST_DETF.png differ diff --git a/tomo_challenge/jec_post.py b/tomo_challenge/jec_post.py new file mode 100644 index 00000000..fdc81a68 --- /dev/null +++ b/tomo_challenge/jec_post.py @@ -0,0 +1,87 @@ +#JEC To use the model after fitting + +import sys + +import numpy as np +from sklearn.ensemble import GradientBoostingClassifier +from joblib import dump, load + +from metrics import compute_scores, plot_distributions +from data import dict_to_array, load_data, load_redshift +# JEC 20/7/2020 beta test of F. Lanusse code +from jax_metrics import compute_scores as jc_compute_scores + +import classifiers + +def apply_model(classifier, filename, bands): + data = load_data(filename, bands, colors=True, errors=False) + data = dict_to_array(data, bands, colors=True, errors=False) + #JEC 20/7/20 restrict to 50,000 entries + data=data[:50000,:] + tomo_bin = classifier.predict(data) + return tomo_bin + + +def main(bands, n_bin, model_file, output_file, metrics_code='jax_cosmo'): + # Assume data in standard locations relative to current directory + training_file = '../data/training.hdf5' + validation_file = '../data/validation.hdf5' + + clf = load(model_file) + print('clf parameters: ',clf.get_params()) + + #Apply model + print('Apply model to validation set') + tomo_bin = apply_model(clf, validation_file, bands) + + # Get a score + z = load_redshift(validation_file) + #JEC 20/7/20 + z = z[:50000] + if metrics_code == 'jax_cosmo': + scores = jc_compute_scores(tomo_bin, z, metrics="SNR_3x2,FOM_3x2,FOM_DETF_3x2") + else: + scores = compute_scores(tomo_bin, z, metrics="SNR_3x2,FOM_3x2") + + + # Get the galaxy distribution of redshifts into n_z bins of + # equal number counts in each + # p = np.linspace(0, 100, n_bin + 1) + # z_edges = np.percentile(z, p) + # result of above + z_edges = np.array([0.02450252, 0.29473031, 0.44607811, 0.58123241,\ + 0.70399809,0.8120476 , 0.91746771, 1.03793607,\ + 1.20601892, 1.49989052, 3.03609133]) + + + #DETF optim 25/7/2020 + z_edges = np.array([0.02450252, 0.15961642, 0.37035, 0.475175,\ + 0.58955, 0.700775 , 0.91746771, 1.03793607,\ + 1.352945, 1.8, 3.03609133]) + + + print("new set: ",z_edges) + + + plot_distributions(z, tomo_bin, output_file, z_edges) + + # return + return scores + +if __name__ == '__main__': + # Command line arguments + try: + bands = sys.argv[1] + n_bin = int(sys.argv[2]) + model_file = sys.argv[3] + output_file = sys.argv[4] + assert bands in ['riz', 'griz'] + except: + sys.stderr.write("Script takes 4 arguments: 'riz'/'griz', n_bin, model_file, output fig\n") + sys.exit(1) + + # Run main code + scores = main(bands, n_bin, model_file, output_file) + print(f"Scores for {n_bin} bin(s) : ") + for k,v in scores.items(): + print (" %s : %4.1f"%(k,v))