forked from JasmineZhen218/BiGraph4TME
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpopulation_graph.py
More file actions
140 lines (132 loc) · 5.61 KB
/
population_graph.py
File metadata and controls
140 lines (132 loc) · 5.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
import networkx as nx
class Population_Graph:
    """Build a patient population graph from a patient-patient similarity
    matrix, detect patient subgroups on it, and assign new patients to the
    detected subgroups."""

    def __init__(
        self,
        k_clustering=20,
        k_estimate=3,
        resolution=1.0,
        size_smallest_cluster=10,
        seed=1,
    ):
        self.k_clustering = k_clustering  # decides the coarseness of patient clustering (kNN sparsification)
        self.k_estimate = (
            k_estimate  # number of nearest neighbors for estimating the community
        )
        self.resolution = resolution  # resolution parameter for community detection
        self.seed = seed  # random seed for reproducible Louvain runs
        self.size_smallest_cluster = (
            size_smallest_cluster  # clusters at or below this size are discarded
        )

    def generate(self, Similarity_matrix, Patient_ids):
        """
        Generate the population graph.

        Parameters
        ----------
        Similarity_matrix : numpy array (n_patients, n_patients)
            The similarity matrix between patients.
        Patient_ids : list of str
            The patient ids.

        Returns
        -------
        G_population : networkx graph
            Population graph; edge weights are the IoU (Jaccard index) of the
            two patients' kNN neighborhoods, and each node carries a
            "patientID" attribute.
        """
        Similarity_matrix_ = Similarity_matrix.copy()  # do not mutate caller's array
        np.fill_diagonal(Similarity_matrix_, 0)  # remove self-similarity
        n_patients = Similarity_matrix_.shape[0]
        # Keep only each patient's top-k strongest similarities (kNN sparsification).
        for i in range(n_patients):
            weakest = np.argsort(Similarity_matrix_[i, :])[: -self.k_clustering]
            Similarity_matrix_[i, weakest] = 0
        adj_1 = np.maximum(
            Similarity_matrix_, Similarity_matrix_.transpose()
        )  # make it symmetric
        # Precompute each patient's neighbor set once; the original rebuilt these
        # sets inside the double loop, turning an O(n^2) pass into O(n^3).
        neighbor_sets = [
            set(np.where(adj_1[i, :] > 0)[0]) for i in range(n_patients)
        ]
        adj_2 = np.zeros_like(adj_1)  # IoU matrix
        for i in range(n_patients):
            for j in range(n_patients):
                union = neighbor_sets[i] | neighbor_sets[j]
                if len(union) == 0:
                    IoU = 0  # two isolated nodes: define IoU as 0
                else:
                    IoU = len(neighbor_sets[i] & neighbor_sets[j]) / len(union)
                adj_2[i, j] = IoU  # fill in the IoU matrix
        np.fill_diagonal(adj_2, 0)  # remove self-similarity
        G_population = nx.from_numpy_array(adj_2)  # create the population graph
        Patient_ids_dict = {
            i: Patient_ids[i] for i in range(len(Patient_ids))
        }  # map node index -> patient id
        nx.set_node_attributes(
            G_population, Patient_ids_dict, "patientID"
        )  # set the patient ids as node attributes
        return G_population

    def community_detection(self, G_population):
        """
        Community detection on the population graph.

        Parameters
        ----------
        G_population : networkx graph
            The population graph.

        Returns
        -------
        Patient_subgroups : list of dict
            One dict per detected subgroup, sorted by size (largest first),
            with keys "subgroup_id" ("S1", "S2", ...) and "patient_ids"
            (list of patient ids in the subgroup).
        """
        Communities = nx.community.louvain_communities(
            G_population,
            weight="weight",
            resolution=self.resolution,
            # BUGFIX: was hard-coded seed=1, silently ignoring the seed
            # passed to __init__ (default is still 1, so default behavior
            # is unchanged).
            seed=self.seed,
        )  # community detection using Louvain method
        Communities = [
            list(c) for c in Communities if len(c) > self.size_smallest_cluster
        ]  # remove small clusters
        Communities = sorted(
            Communities, key=len, reverse=True
        )  # sort the clusters by size
        Patient_subgroups = []
        for i, c in enumerate(Communities):
            Patient_subgroups.append(
                {
                    # BUGFIX: estimate_community() reads "subgroup_id" from
                    # these dicts; it was missing here, so feeding this
                    # method's output into estimate_community raised KeyError.
                    "subgroup_id": "S" + str(i + 1),
                    "patient_ids": [G_population.nodes[n]["patientID"] for n in c],
                }
            )
        return Patient_subgroups

    def estimate_community(
        self,
        Patient_ids_train,
        Patient_subgroups_train,
        Similarity_matrix,
    ):
        """
        Estimate the patient subgroups for new (test) patients.

        Each new patient is assigned the label of the training subgroup that
        accumulates the largest total similarity among its k_estimate nearest
        training neighbors (similarity-weighted kNN vote).

        Parameters
        ----------
        Patient_ids_train : list of str
            The patient ids in the training set.
        Patient_subgroups_train : list of dict
            The training subgroups; each dict has "subgroup_id" and
            "patient_ids" (as produced by community_detection).
        Similarity_matrix : numpy array (n_patients_train, n_patients_hat)
            The similarity matrix between the training set (rows) and the
            test set (columns).

        Returns
        -------
        Labels_hat : numpy array of object, shape (n_patients_hat,)
            The estimated subgroup id for each test patient.
        """
        # Build the per-training-patient label vector from the subgroup dicts.
        Labels_train = np.zeros(len(Patient_ids_train), dtype=object)
        for subgroup in Patient_subgroups_train:
            subgroup_id = subgroup["subgroup_id"]
            for patient in subgroup["patient_ids"]:
                Labels_train[Patient_ids_train.index(patient)] = subgroup_id
        assert len(Patient_ids_train) == Similarity_matrix.shape[0]
        Labels_hat = np.zeros(Similarity_matrix.shape[1], dtype=object)
        for new_patient in range(Similarity_matrix.shape[1]):
            # k nearest training patients by descending similarity
            idx = np.argsort(Similarity_matrix[:, new_patient])[::-1]
            knn_idx = idx[: self.k_estimate]
            knn_labels = Labels_train[knn_idx]
            knn_similarities = Similarity_matrix[:, new_patient][knn_idx]
            # Sum similarity per candidate label; the label with the largest
            # total similarity wins (not a plain majority vote).
            unique, counts = np.unique(knn_labels, return_counts=True)
            similarities_within_knn = np.zeros(len(unique))
            for j in range(len(unique)):
                similarities_within_knn[j] = np.sum(
                    knn_similarities[knn_labels == unique[j]]
                )
            Labels_hat[new_patient] = unique[np.argmax(similarities_within_knn)]
        return Labels_hat