forked from unmtransinfo/ProteinGraphML
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGenTrainingAndTestFeatures.py
More file actions
executable file
·205 lines (177 loc) · 9.76 KB
/
GenTrainingAndTestFeatures.py
File metadata and controls
executable file
·205 lines (177 loc) · 9.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
###
import sys, os, time, argparse, logging
import pyreadr, pickle
import numpy as np
import pandas as pd
import networkx as nx
from ProteinGraphML.DataAdapter import OlegDB, selectAsDF, TCRD
from ProteinGraphML.GraphTools import ProteinDiseaseAssociationGraph
from ProteinGraphML.MLTools.MetapathFeatures import metapathFeatures, ProteinInteractionNode, KeggNode, ReactomeNode, \
GoNode, InterproNode, getMetapaths, getTrainingProteinIds
from ProteinGraphML.MLTools.Data import BinaryLabel
from ProteinGraphML.MLTools.Models import XGBoostModel
from ProteinGraphML.MLTools.Procedures import *
def savePickleObject(fileName, data):
    """Serialize *data* to the file *fileName* using the highest pickle protocol."""
    with open(fileName, 'wb') as fh:
        pickle.dump(data, fh, protocol=pickle.HIGHEST_PROTOCOL)
def saveTrainPredictSet(allData, outputDir, disease=None, trainingfile=None, predictfile=None):
    """
    Split *allData* into training and predict sets and save them as pickle files.

    Parameters
    ----------
    allData : pandas.DataFrame
        Feature matrix with a label column 'Y' (0/1 = labeled training rows,
        -1 = unlabeled rows to predict).
    outputDir : str
        Directory where the pickle files are written.
    disease : str, optional
        Disease/phenotype id; when given it is used as the output file-name prefix.
    trainingfile : str, optional
        Path of the pickled training set; its base name is the prefix when no
        disease is given.
    predictfile : str, optional
        Path of the pickled predict set.

    Logs an error when neither a disease nor a training file is supplied.
    """
    logging.info('Number of rows and features in allData: {0}'.format(allData.shape))

    def _prefix(path):
        # output file-name prefix: base name of the input file without its extension
        return os.path.basename(path).split('.')[0]

    def _splitAndSave(pklTrainFile, pklPredictFile):
        # Shared split-and-save logic (was duplicated across two branches):
        # rows with Y in {0, 1} form the labeled training set.
        trainData = allData.loc[allData['Y'].isin([0, 1])]
        logging.info('Number of rows and features in training data: {0}'.format(trainData.shape))
        logging.info("Writing train data to file: {0}".format(pklTrainFile))
        savePickleObject(pklTrainFile, trainData)
        # Rows with Y == -1 form the unlabeled predict set (the 'Y' column is kept).
        PredictData = allData.loc[allData['Y'] == -1]
        logging.info('Number of rows and features in predict data: {0}'.format(PredictData.shape))
        logging.info("Writing predict data to file: {0}".format(pklPredictFile))
        savePickleObject(pklPredictFile, PredictData)

    if disease is not None:
        _splitAndSave(outputDir + '/' + disease + '_TrainingData.pkl',
                      outputDir + '/' + disease + '_PredictData.pkl')
    elif predictfile is None and trainingfile is not None:
        # Only a training file was given: everything is training data, no split.
        pklTrainFile = outputDir + '/' + _prefix(trainingfile) + '_TrainingData.pkl'
        logging.info("Writing train data to file: {0}".format(pklTrainFile))
        savePickleObject(pklTrainFile, allData)
    elif predictfile is not None and trainingfile is not None:
        _splitAndSave(outputDir + '/' + _prefix(trainingfile) + '_TrainingData.pkl',
                      outputDir + '/' + _prefix(predictfile) + '_PredictData.pkl')
    else:
        logging.error('Missing argument(s)')
# ******************************* START OF THE CODE ******************************************************* #
if __name__ == '__main__':
    """
    This program is used to generate metapath features using the knowledge graph generated by the code "BuildKG.py".
    It uses pickled training and predict dictionaries created by "PrepTrainingAndTestSets.py" to generate training and
    predict data sets for machine learning models.
    This code can also be used for Mammalian Phenotype ID, e.g. MP_0000180 with argument --disease. It also uses static
    features to generate training and predict sets, but static features are optional.
    """
    t0 = time.time()
    DEFAULT_GRAPH = "ProteinDisease_GRAPH.pkl"
    DEFAULT_STATIC_FEATURES = "gtex,lincs,ccle,hpa"
    DBS = ['olegdb', 'tcrd']

    # Command-line arguments
    parser = argparse.ArgumentParser(description='Generate features for training and predict set',
                                     epilog='Protein Ids with True label must be provided')
    parser.add_argument('--disease', metavar='disease', help='Mammalian Phenotype ID, e.g. MP_0000180')
    parser.add_argument('--trainingfile', help='pickled training set, e.g. "diabetes.pkl"')
    parser.add_argument('--predictfile', help='pickled predict set, e.g. "diabetes_predict.pkl"')
    parser.add_argument('--outputdir', required=True,
                        help='directory where train and predict data with features will be saved, '
                             'e.g. "diabetes_no_lincs"')
    parser.add_argument('--kgfile', default=DEFAULT_GRAPH,
                        help='input pickled KG (default: "{0}")'.format(DEFAULT_GRAPH))
    parser.add_argument('--static_data', default=DEFAULT_STATIC_FEATURES,
                        help='(default: "{0}")'.format(DEFAULT_STATIC_FEATURES))
    parser.add_argument('--static_dir', default=os.getcwd() + "/ProteinGraphML/MLTools/StaticFeatures")
    parser.add_argument('--db', choices=DBS, default="tcrd", help='{0}'.format(str(DBS)))
    parser.add_argument("-v", "--verbose", action="count", default=0, help="verbosity")
    args = parser.parse_args()
    logging.basicConfig(format='%(levelname)s:%(message)s', level=(logging.DEBUG if args.verbose > 1 else logging.INFO))

    fileData = None
    # Directory where train and predict data with features will be stored.
    if not os.path.isdir(args.outputdir):
        logging.info('Create the output directory')
        os.makedirs(args.outputdir)
    logging.info('Output directory for ML data(Training/predict): {0}'.format(args.outputdir))

    # Either a disease id or a pickled training file must be provided.
    if args.trainingfile is None and args.disease is None:
        # FIX: message previously read "-- training file"; the option is --trainingfile.
        parser.error("--disease or --trainingfile must be specified.")

    # Load the pickled knowledge graph produced by BuildKG.py.
    currentGraph = ProteinDiseaseAssociationGraph.load(args.kgfile)
    logging.info("GRAPH {0} LOADED".format(args.kgfile))

    # Access the db adaptor. Make TCRD the default DB.
    dbAdapter = OlegDB() if args.db == "olegdb" else TCRD()

    if args.trainingfile is not None and args.disease is None:
        logging.info('Input training file: {0}'.format(args.trainingfile))
        try:
            with open(args.trainingfile, 'rb') as f:
                fileData = pickle.load(f)
        # FIX: was a bare "except:", which also swallows KeyboardInterrupt/SystemExit.
        except Exception:
            logging.error('Invalid pickled training set file')
            sys.exit(1)  # FIX: was builtin exit(); sys.exit with non-zero status
        # Also add predict data if provided.
        if args.predictfile is not None:
            logging.info('Input predict file: {0}'.format(args.predictfile))
            try:
                with open(args.predictfile, 'rb') as f:
                    fileData.update(pickle.load(f))  # fileData now holds both train and predict sets
            except Exception:
                logging.error('Invalid pickled predict set file')
                sys.exit(1)
    elif args.trainingfile is None and args.disease is not None:
        logging.info("running on this disease: {0}".format(args.disease))
        fullData = {}
        # Positive and negative training protein ids for this disease.
        trainP, trainF = getTrainingProteinIds(args.disease, currentGraph)
        fullData[True] = trainP
        fullData[False] = trainF
        # All protein ids; everything not in the training sets becomes the predict set.
        allProteinIds = dbAdapter.fetchAllProteinIds()
        allProteinIds = set(allProteinIds['protein_id'].tolist())
        predictProteinSet = allProteinIds.difference(trainP)
        predictProteinSet = predictProteinSet.difference(trainF)
        fullData['unknown'] = predictProteinSet
        fileData = fullData
    else:
        # Both --disease and --trainingfile given: ambiguous input.
        # FIX: previously fell through with fileData=None and did DB work before exiting later.
        logging.error('Wrong parameters passed')
        sys.exit(1)

    # Metapath feature node types.
    nodes = [ProteinInteractionNode, KeggNode, ReactomeNode, GoNode, InterproNode]
    # Optional static feature sets (empty string means none).
    staticFeatures = args.static_data.split(',') if args.static_data else []
    logging.info(staticFeatures)
    logging.info("--- METAPATH FEATURE SETS: {0}".format(len(nodes)))
    logging.info("--- STATIC FEATURE SETS: {0}".format(len(staticFeatures)))
    logging.info("--- STATIC FEATURE DIR: {0}".format(args.static_dir))

    # Fetch the description of proteins/pathways (used to label feature columns).
    idDescription = dbAdapter.fetchPathwayIdDescription()

    # Generate features.
    if fileData is not None:
        allData = metapathFeatures(args.disease, currentGraph, nodes, idDescription, staticFeatures, args.static_dir,
                                   loadedLists=fileData).fillna(0)
    else:
        logging.error('fileData should not be None')
        sys.exit(1)

    # Divide allData into training/predict sets and save them.
    saveTrainPredictSet(allData, args.outputdir, args.disease, args.trainingfile, args.predictfile)
    logging.info('{0}: elapsed time: {1}'.format(os.path.basename(sys.argv[0]),
                                                 time.strftime('%Hh:%Mm:%Ss', time.gmtime(time.time() - t0))))