
Commit ad06048
compare diff between vasculature types
1 parent 0a9078f

3 files changed: +60 -34 lines changed

src/find_clusters/combine_temporal_data.py

Lines changed: 2 additions & 1 deletion

@@ -24,7 +24,8 @@ def load_data(file_pattern, suffix_filter):
         data = pd.read_csv(file)
         data['TIME'] = time_point # Add the TIME column
         all_data.append(data)
-
     # Combine all data into a single DataFrame
     combined_data = pd.concat(all_data, ignore_index=True)
+    # Sort the data by TIME
+    combined_data.sort_values(by='TIME', inplace=True)
     return combined_data
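
A note on why the added sort matters: pd.concat simply stacks the per-file frames in the order they were appended, so rows arrive grouped by file rather than by time. A minimal, self-contained sketch of the pattern (toy frames; only the TIME column name comes from the diff):

import pandas as pd

# Two per-time-point frames, appended in arbitrary file order
frame_t15 = pd.DataFrame({'RADIUS': [2.0, 2.1], 'TIME': 15.0})
frame_t0 = pd.DataFrame({'RADIUS': [1.0, 1.2], 'TIME': 0.0})

combined = pd.concat([frame_t15, frame_t0], ignore_index=True)
# Without this sort, rows would stay grouped by file order, not by time
combined.sort_values(by='TIME', inplace=True)
print(combined)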

src/find_clusters/find_best_features.py

Lines changed: 43 additions & 29 deletions

@@ -138,37 +138,34 @@ def visualize_features_response(data, time_point, features, response_name, label_column):
     plt.savefig("combined_visualization.png")
     plt.show()

-def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
-    # Filter data for the selected vasculature type
-    data_filtered = data[data[label_column] == vasculature_type].copy()
-
+def visualize_multiple_vasculatures_over_time(data, vasculature_types, features, label_column):
+    # Filter data for the selected vasculature types
+    data_filtered = data[data[label_column].isin(vasculature_types)].copy()
+
     # Remove columns with excessive inf/-inf values
     threshold = 0.2
     columns_to_drop = [col for col in data_filtered.columns if ((data_filtered[col] == np.inf) | (data_filtered[col] == -np.inf)).mean() >= threshold]
     data_filtered = data_filtered.drop(columns=columns_to_drop)
-
+
     # Replace inf/-inf with NaN and drop rows with NaN
     data_filtered = data_filtered.replace([float('inf'), float('-inf')], float('nan')).dropna(axis=0)
-    # Encode time points as labels
-    time_points = data_filtered['TIME'].unique()
-    time_points.sort() # Ensure time points are sorted
-
+
+    # Reset the index
+    data_filtered = data_filtered.reset_index(drop=True)
+
     # Standardize features
     X = data_filtered[features]
     scaler = StandardScaler()
     X = scaler.fit_transform(X)
-
+
     # Perform PCA
     pca = PCA(n_components=2)
     reduced_features = pca.fit_transform(X)
-
-    # Extract time labels for coloring
-    time_labels = data_filtered['TIME'].astype(str) # Convert to string for color mapping
-
+
     # PCA Explained Variance
     print("PCA Explained Variance Ratio:")
     print(pca.explained_variance_ratio_)
-
+
     # Feature importance (loadings for PC1)
     pc1_loadings = pca.components_[0]
     feature_importance = pd.DataFrame({

@@ -178,20 +175,38 @@ def visualize_vasculature_over_time(data, vasculature_type, features, label_column):
     }).sort_values(by="Absolute Loading", ascending=False)
     print("Feature Importance Rankings:")
     print(feature_importance)
-
-    # Define colormap for time points
+
+    # Define colormap for time points and markers for vasculature types
     cmap = plt.cm.viridis
     norm = plt.Normalize(vmin=min(data_filtered['TIME']), vmax=max(data_filtered['TIME']))
-    colors = cmap(norm(data_filtered['TIME']))
-
-    # PCA Scatter Plot
+    markers = ['o', 's', '^', 'D', 'P', 'X'] # Add more markers if needed
+
+    # Plot PCA scatter for multiple vasculature types
     plt.figure(figsize=(10, 7))
-    scatter = plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=data_filtered['TIME'], cmap=cmap, s=50)
-    colorbar = plt.colorbar(scatter, label="Time")
+    for i, vasculature in enumerate(vasculature_types):
+        subset = data_filtered[data_filtered[label_column] == vasculature]
+        reduced_subset = reduced_features[subset.index] # Indices now align after reset_index
+        plt.scatter(
+            reduced_subset[:, 0], reduced_subset[:, 1],
+            c=subset['TIME'], cmap=cmap, norm=norm, s=50,
+            marker=markers[i % len(markers)], label=vasculature
+        )
+
+    # Add colorbar and legend
+    colorbar = plt.colorbar(label="Time")
+    plt.legend(title="Vasculature Type")
     plt.xlabel("PCA Component 1")
     plt.ylabel("PCA Component 2")
-    plt.title(f"PCA Visualization of {vasculature_type} Over Time")
-    plt.savefig(f"pca_{vasculature_type}_over_time.png")
+    plt.title("PCA Visualization of Multiple Vasculature Types Over Time")
+    plt.savefig("pca_multiple_vasculatures_over_time.png")
+    plt.show()
+
+    # Feature Correlation Heatmap for all selected vasculatures
+    plt.figure(figsize=(12, 10))
+    sns.heatmap(data_filtered[features].corr(), annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
+    plt.title("Feature Correlation Heatmap")
+    plt.savefig("feature_correlation_multiple_vasculatures.png")
+    plt.show()



@@ -212,12 +227,11 @@ def main():
     data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
     suffix_filter = "_15-04032023.csv" # Specify the suffix to filter files
     data = load_data(data_path_pattern, suffix_filter)
+    data = data[(data['LAYOUT'] == 'Savav') | (data['LAYOUT'] == 'Lav')]
     time_point = 15.0
     response_name = "ACTIVITY"
-    #pca_visualization(data, time_point, features, label_column=label_column)
-    #visualize_response(data, time_point, response_name, label_column)
-    #visualize_features_response(data, time_point, features, response_name, label_column)
-    vasculature_type = "C_Savav"
-    visualize_vasculature_over_time(data, vasculature_type, features, label_column)
+    visualize_features_response(data, time_point, features, response_name, label_column)
+    vasculature_types = ["C_Savav", "C_Lav"]
+    #visualize_multiple_vasculatures_over_time(data, vasculature_types, features, label_column)
 if __name__ == "__main__":
     main()
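
A note on the index-alignment trick used above: reduced_features[subset.index] is only safe because of the earlier reset_index(drop=True). After a boolean filter, a pandas index keeps its original labels, which would point at the wrong rows of the NumPy array that pca.fit_transform returns. A minimal sketch of the pattern, with toy data and hypothetical column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({'TYPE': ['a', 'b', 'a', 'b'], 'VAL': [1.0, 2.0, 3.0, 4.0]})
df = df[df['VAL'] > 1.0]        # Index is now [1, 2, 3], not [0, 1, 2]
df = df.reset_index(drop=True)  # Re-label rows 0..n-1 to match array positions

reduced = np.column_stack([df['VAL'], -df['VAL']])  # Stand-in for pca.fit_transform(X)

for name, subset in df.groupby('TYPE'):
    # subset.index now holds positional row numbers, so it indexes the array safely
    print(name, reduced[subset.index])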

src/find_clusters/find_clusters.py

Lines changed: 15 additions & 4 deletions

@@ -40,6 +40,9 @@ def cluster_analysis_with_ground_truth(data, time_point, features, label_column):
     cm = confusion_matrix(ground_truth_labels, cluster_labels)
     plt.figure(figsize=(8, 5))
     sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=range(optimal_clusters), yticklabels=label_encoder.classes_)
+    for i in range(cm.shape[0]):
+        for j in range(cm.shape[1]):
+            plt.text(j + 0.5, i + 0.5, cm[i, j], ha='center', va='center', color='black')
     plt.xlabel("Predicted Clusters")
     plt.ylabel("Ground Truth")
     plt.title("Confusion Matrix")

@@ -61,24 +64,32 @@ def cluster_analysis_with_ground_truth(data, time_point, features, label_column):

 def main():
     features = [
-        "KEY", "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
+        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW",
         "NODES", "EDGES", "GRADIUS", "GDIAMETER", "AVG_ECCENTRICITY",
         "AVG_SHORTEST_PATH", "AVG_IN_DEGREES", "AVG_OUT_DEGREES",
         "AVG_DEGREE", "AVG_CLUSTERING", "AVG_CLOSENESS",
         "AVG_BETWEENNESS", "AVG_CORENESS"
     ]
     label_column = "KEY"
     features = [
-        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW"]
+        "RADIUS", "LENGTH", "WALL", "SHEAR", "CIRCUM", "FLOW", "NODES", "EDGES"]
     # Define the pattern to locate the files and filter suffix
     data_path_pattern = os.path.join(os.path.dirname(__file__), "../../data/ARCADE/C-feature_*.csv")
     suffix_filter = "_15-04032023.csv" # Specify the suffix to filter files
     data = load_data(data_path_pattern, suffix_filter)
-    #print(data.head()) # Show the first few rows of the combined DataFrame
+    # Print number of rows
+    data = data[(data['LAYOUT'] == 'Savav') | (data['LAYOUT'] == 'Lav')]
+    # add graph density = nodes/edges
+    data['DENSITY'] = data['NODES'] / data['EDGES']
+    # print mean density of Lav and Savav
+    print(data.groupby('LAYOUT')['DENSITY'].agg(['mean', 'std']))
+    print(data.groupby('LAYOUT')['NODES'].agg(['mean', 'std']))
+    print(data.groupby('LAYOUT')['EDGES'].agg(['mean', 'std']))
+    print(data.head()) # Show the first few rows of the combined DataFrame
     #print(data['TIME'].unique()) # Display the unique time points
     # Analyze clusters for TIME=0.0 with ground truth comparison
     clustered_data = cluster_analysis_with_ground_truth(data, time_point=0.0, features=features, label_column=label_column)
-    print(clustered_data.head()) # View the clustered data
+    #print(clustered_data.head()) # View the clustered data


 if __name__ == '__main__':
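
One small observation on the confusion-matrix hunk: sns.heatmap(..., annot=True) already writes each cell's count, so the new plt.text loop draws a second copy of every number on top of the first. A minimal sketch of the manual-annotation variant, assuming the intent is for plt.text to be the only label source (the toy matrix and class names are illustrative):

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

cm = np.array([[12, 3], [2, 9]])   # Toy confusion matrix
classes = ["C_Lav", "C_Savav"]     # Hypothetical ground-truth labels

plt.figure(figsize=(8, 5))
# annot=False so the plt.text loop below is the single source of cell labels
sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=range(cm.shape[1]), yticklabels=classes)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        # Heatmap cell centers sit at half-integer data coordinates
        plt.text(j + 0.5, i + 0.5, cm[i, j], ha='center', va='center', color='black')
plt.xlabel("Predicted Clusters")
plt.ylabel("Ground Truth")
plt.title("Confusion Matrix")
plt.show()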
