-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoperator_funcs.py
More file actions
100 lines (78 loc) · 3.26 KB
/
operator_funcs.py
File metadata and controls
100 lines (78 loc) · 3.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import polars as pl
import pandas as pd
import numpy as np
import pacmap
def reshape_data(main_data, col_data, row_data):
    """
    Build the wide numeric matrix that is handed to PaCMAP.

    The long-format measurements are annotated with the column and row
    tables, then pivoted so that every event becomes one matrix row.

    Args:
        main_data: Polars DataFrame with the long-format values (.y, .ci, .ri columns)
        col_data: Polars DataFrame keyed by .ci, left-joined onto main_data
        row_data: Polars DataFrame keyed by .ri, left-joined onto main_data

    Returns:
        numpy.ndarray: Matrix with one row per distinct logicle..event_id and
        one column per logicle..channel_name / logicle..channel_description
        combination, holding the .y values

    Note:
        The joined frame must contain logicle..event_id,
        logicle..channel_name and logicle..channel_description; which of the
        inputs supplies each of them is not visible here - TODO confirm
        against the caller.
    """
    event_key = 'logicle..event_id'
    channel_keys = ['logicle..channel_name', 'logicle..channel_description']

    # Attach column annotations first, then row annotations (both left joins,
    # so every measurement in main_data is kept).
    with_col_info = main_data.join(col_data, on='.ci', how='left')
    annotated = with_col_info.join(row_data, on='.ri', how='left')

    # Long -> wide: events down the rows, channels across the columns.
    wide = annotated.pivot(index=event_key, columns=channel_keys, values='.y')

    # The event identifier is not a feature, so it is dropped before the
    # conversion. Missing values are left untouched (no NaN filling here).
    return wide.drop(event_key).to_numpy()
def apply_pacmap(data_matrix, n_components=2, n_neighbors=10, MN_ratio=0.5, FP_ratio=2.0):
    """
    Run PaCMAP dimensionality reduction on a data matrix.

    Args:
        data_matrix: numpy.ndarray passed unchanged to PaCMAP.fit_transform
            (presumably one row per event and one column per feature)
        n_components: Number of dimensions in the embedding
        n_neighbors: Number of neighbors for the kNN graph
        MN_ratio: Ratio of mid-near pairs to be sampled
        FP_ratio: Ratio of further pairs to be sampled

    Returns:
        numpy.ndarray: Reduced dimensionality embedding
    """
    # Collect the hyper-parameters once and hand them to the reducer.
    settings = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "MN_ratio": MN_ratio,
        "FP_ratio": FP_ratio,
    }
    reducer = pacmap.PaCMAP(**settings)

    # init='pca' is passed explicitly so the embedding starts from a PCA layout.
    return reducer.fit_transform(data_matrix, init='pca')
def format_results(embedding, col_data):
    """
    Format the PaCMAP results for Tercen.

    Args:
        embedding: 2-D numpy.ndarray with the reduced dimensionality data,
            one row per embedded item
        col_data: DataFrame (Polars or pandas) with a .ci column holding one
            value per row of ``embedding``

    Returns:
        pandas.DataFrame: Columns PaCMAP_1 .. PaCMAP_k (one per embedding
        dimension), followed by .ci (taken from col_data) and .ri (constant
        0), the last two as int32

    Raises:
        ValueError: If col_data does not have exactly one .ci value per
            embedding row.

    Note:
        Only the row count is checked. The rows of ``embedding`` are assumed
        to be in the same order as the rows of ``col_data`` - TODO confirm
        against the code that builds the embedding input.
    """
    n_rows, n_dims = embedding.shape[0], embedding.shape[1]

    # Read the column index straight from the frame. Indexing by name and
    # calling to_numpy() works for both Polars and pandas frames and avoids a
    # full Polars -> pandas conversion just to read one column.
    ci_values = col_data[".ci"].to_numpy()
    if len(ci_values) != n_rows:
        raise ValueError(
            f"col_data has {len(ci_values)} .ci values but the embedding has "
            f"{n_rows} rows; they must match one-to-one"
        )

    # One column per embedding dimension, named PaCMAP_1, PaCMAP_2, ...
    result_df = pd.DataFrame(embedding)
    result_df = result_df.rename(
        columns={i: f"PaCMAP_{i + 1}" for i in range(n_dims)}
    )

    result_df[".ci"] = ci_values
    # Every result belongs to the single row index 0.
    result_df[".ri"] = 0

    # Cast both index columns to int32.
    return result_df.astype({".ci": np.int32, ".ri": np.int32})