32
32
import warnings

import numpy as np
import pandas as pd
import scipy.sparse as sparse
35
+
35
36
# Optional dependency: pandarallel enables Series.parallel_apply for large
# workloads. Fall back to plain .apply when it is not installed.
try:
    from pandarallel import pandarallel

    pandarallel.initialize(progress_bar=True)
except ImportError:
    warnings.warn(
        "pandarallel not installed, parallel processing will not be available."
    )
    PARALLEL_APPLY = False
else:
    # Import and initialization both succeeded.
    PARALLEL_APPLY = True
43
46
44
47
import src .utils .log_time as log_time
45
48
46
49
MIN_ARR_SIZE_FOR_CACHE = 10_000
47
50
48
51
POSTIEOR_DATA_PATH = "./data/posterior_y_{year}.parquet"
49
52
53
+
50
54
def default_combine_posterior_prior_y_func (arrs : List [np .ndarray ]) -> np .ndarray :
51
55
"""Default function for combining the posterior for years t-1..t-n and the prior for year t.
52
56
@@ -195,21 +199,21 @@ def build_adjacency_matrix(
195
199
196
200
def calculate_prior_y_from_eids(
    auids: np.ndarray,
    auid_eids: pd.Series,  # auid:int -> eids:List[int]
    eid_score: pd.Series,  # eid:int -> score:float
    agg_score_func: Callable[[np.ndarray], float] = np.mean,
) -> np.ndarray:
    """Aggregate per-eid scores into one prior score per auid.

    Args:
        auids: The auids to compute prior scores for.
        auid_eids: Maps each auid to the list of eids attributed to it.
        eid_score: Maps each eid to its score.
        agg_score_func: Reduces an array of eid scores to a single value.
            Defaults to np.mean.

    Returns:
        Scores indexed by auid, cast to ``eid_score``'s dtype.
        (NOTE(review): annotated ``np.ndarray`` but a ``pd.Series`` is
        returned — callers appear to rely on Series semantics; confirm.)
    """
    selected_eids = auid_eids[auids]

    # BUGFIX: this must be a conjunction. With `or`, a large input would call
    # Series.parallel_apply even when pandarallel failed to import
    # (AttributeError), and a small input would pay parallel worker overhead
    # whenever pandarallel is present. Parallelize only when it is both
    # available and worth the startup cost.
    if PARALLEL_APPLY and len(selected_eids) > MIN_ARR_SIZE_FOR_CACHE:
        y = selected_eids.parallel_apply(
            lambda eids: agg_score_func(eid_score[eids])
        ).astype(eid_score.dtype)
    else:
        y = selected_eids.apply(lambda eids: agg_score_func(eid_score[eids])).astype(
            eid_score.dtype
        )

    return y
215
219
@@ -247,68 +251,43 @@ def get_previous_posterior(
247
251
return post_s .reindex (auids ).values
248
252
249
253
250
- # def calculate_prior_y(
251
- # auids: np.ndarray,
252
- # auid_eids: pd.Series,
253
- # eid_score: pd.Series,
254
- # year: int,
255
- # prior_y_aggregate_eid_score_func: Callable[[np.ndarray], float] = np.mean,
256
- # combine_posterior_prior_y_func: Callable[
257
- # [List[np.ndarray]], np.ndarray
258
- # ] = default_combine_posterior_prior_y_func,
259
- # posterior_y_missing_value: float = 0.5,
260
- # ) -> np.ndarray:
261
-
262
- # # get all of eids for each auid
263
- # selected_eids = auid_eids[auids]
264
-
265
- # prior_y = selected_eids.apply(
266
- # lambda eids: prior_y_aggregate_eid_score_func(eid_score[eids])
267
- # )
268
-
269
- # # TODO: support an arbitrary number of years
270
- # posterior_y_path = f"./data/posterior_y_{year}.parquet"
271
- # if os.path.exists(posterior_y_path):
272
- # posterior_y_dframe = pd.read_parquet(posterior_y_path)
273
-
274
- # known_auids = posterior_y_dframe.index.values
275
- # new_auids = set(auids) - set(known_auids)
276
- # posterior_y_t_minus_1 = pd.Series(
277
- # data=posterior_y_dframe["score"].values,
278
- # index=known_auids,
279
- # )
280
- # del posterior_y_dframe
281
-
282
- # # if there are new ids tat we haven't seen before, we need to add them
283
- # # with the default value.
284
- # if new_auids:
285
- # for auid in new_auids:
286
- # posterior_y_t_minus_1[auid] = posterior_y_missing_value
287
- # posterior_y_t_minus_1.sort_index(inplace=True)
288
-
289
- # # There is a chance that there are less auids in the prior_y than in the
290
- # # posterior_y. If that is the case, we need to limit the calculation
291
- # # to the auids that are in both.
292
- # if len(posterior_y_t_minus_1) > 0:
293
- # posterior_matched = posterior_y_t_minus_1.index.intersection(prior_y.index)
294
- # posterior_y_t_minus_1 = posterior_y_t_minus_1[posterior_matched]
295
- # prior_y = combine_posterior_prior_y_func(
296
- # np.stack([prior_y, posterior_y_t_minus_1], axis=1),
297
- # )
298
-
299
- # return prior_y
300
-
301
-
302
254
def get_data (
303
255
year : int ,
304
256
logger : logging .Logger = None ,
305
257
prior_y_aggregate_eid_score_func : Callable [[np .ndarray ], float ] = np .mean ,
306
258
n_years_lookback : int = 1 ,
307
- combine_posterior_prior_y_func : Callable [[List [np .ndarray ]], np .ndarray ] = default_combine_posterior_prior_y_func ,
259
+ combine_posterior_prior_y_func : Callable [
260
+ [List [np .ndarray ]], np .ndarray
261
+ ] = default_combine_posterior_prior_y_func ,
308
262
adj_mat_dtype : np .dtype = bool ,
309
263
numeric_types : np .dtype = np .float32 ,
310
264
operate_on_subgraphs_separately : bool = False ,
311
265
) -> Iterable [Tuple [sparse .csr_matrix , np .ndarray , np .ndarray ]]:
266
+ """The get_data function for the SciServer backend.
267
+
268
+ Args:
269
+ year (int): The year to get the data for.
270
+ logger (logging.Logger): The logger to use. Defaults to None.
271
+ prior_y_aggregate_eid_score_func (Callable[[np.ndarray], float]): A function
272
+ that takes an array of scores and returns a single score. Defaults to
273
+ np.mean.
274
+ n_years_lookback (int): The number of years to look back when getting the
275
+ previous posterior. Defaults to 1.
276
+ combine_posterior_prior_y_func (Callable[[List[np.ndarray]], np.ndarray]): A
277
+ function that takes a list of arrays and returns a single array. Defaults
278
+ to default_combine_posterior_prior_y_func.
279
+ adj_mat_dtype (np.dtype): The data type of the adjacency matrix. Defaults to
280
+ bool.
281
+ numeric_types (np.dtype): The data type of the numeric values. Defaults to
282
+ np.float32.
283
+ operate_on_subgraphs_separately (bool): Whether to operate on subgraphs
284
+ separately. Defaults to False.
285
+
286
+ Yields:
287
+ Iterable[Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]]: An iterable with
288
+ the adjacency matrix, the auids and the prior_y for the given year.
289
+
290
+ """
312
291
313
292
os .makedirs ("./data/cache" , exist_ok = True )
314
293
@@ -346,22 +325,34 @@ def get_data(
346
325
for i , auids in enumerate (auids_iter ):
347
326
348
327
if len (auids ) > MIN_ARR_SIZE_FOR_CACHE :
349
- #TODO: This section might be too hard to read
328
+ # TODO: This section might be too hard to read
350
329
logger .info (f"n auids: { len (auids )} , looking for cached adjacency matrix" )
351
- if os .path .exists (f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz" ):
330
+ if os .path .exists (
331
+ f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz"
332
+ ):
352
333
logger .info ("Found cached adjacency matrix, loading..." )
353
334
with log_time .LogTime (f"Loading adjacency matrix { i } " , logger ):
354
- A = sparse .load_npz (f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz" )
355
- auids = np .load (f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } auids_{ year } _{ i } .npy" )
335
+ A = sparse .load_npz (
336
+ f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz"
337
+ )
338
+ auids = np .load (
339
+ f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } auids_{ year } _{ i } .npy"
340
+ )
356
341
else :
357
342
logger .info ("No cached adjacency matrix found, building..." )
358
343
with log_time .LogTime (f"Building adjacency matrix { i } " , logger ):
359
344
auids , A = adj_mat_func (auids )
360
345
with log_time .LogTime (f"Caching adjacency matrix { i } " , logger ):
361
346
logger .info (f"Saving adjacency matrix to cache" )
362
- sparse .save_npz (f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz" , A )
347
+ sparse .save_npz (
348
+ f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } adjacency_matrix_{ year } _{ i } .npz" ,
349
+ A ,
350
+ )
363
351
logger .info (f"Saving auids to cache" )
364
- np .save (f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } auids_{ year } _{ i } .npy" , auids )
352
+ np .save (
353
+ f"./data/cache/{ 'iter_' * operate_on_subgraphs_separately } auids_{ year } _{ i } .npy" ,
354
+ auids ,
355
+ )
365
356
else :
366
357
with log_time .LogTime (f"Building adjacency matrix { i } " , logger ):
367
358
auids , A = adj_mat_func (auids )
@@ -374,18 +365,25 @@ def get_data(
374
365
prior_y_aggregate_eid_score_func ,
375
366
)
376
367
377
- with log_time .LogTime (f"Retrieving posteriors for previous { n_years_lookback } years" , logger ):
378
- previous_posteriors = list (filter (
379
- lambda x : x is not None ,
380
- map (
381
- lambda year : get_previous_posterior (auids , year ),
382
- range (year - 1 , year - n_years_lookback - 1 , - 1 )
383
- ),
384
- ))
385
-
386
- with log_time .LogTime (f"Combining posteriors for previous { n_years_lookback } years" , logger ):
387
- prior_y = combine_posterior_prior_y_func ([prior_y_eids ] + previous_posteriors )
368
+ with log_time .LogTime (
369
+ f"Retrieving posteriors for previous { n_years_lookback } years" , logger
370
+ ):
371
+ previous_posteriors = list (
372
+ filter (
373
+ lambda x : x is not None ,
374
+ map (
375
+ lambda year : get_previous_posterior (auids , year ),
376
+ range (year - 1 , year - n_years_lookback - 1 , - 1 ),
377
+ ),
378
+ )
379
+ )
388
380
381
+ with log_time .LogTime (
382
+ f"Combining posteriors for previous { n_years_lookback } years" , logger
383
+ ):
384
+ prior_y = combine_posterior_prior_y_func (
385
+ [prior_y_eids ] + previous_posteriors
386
+ )
389
387
390
388
yield A , auids , prior_y .astype (numeric_types )
391
389
0 commit comments