wip sciserver backend, seems to work, needs more tests

ryanhausen · ryanhausen · commit 3b1afc2ee0a0 · 2024-03-19T21:33:33.000-04:00
diff --git a/README.md b/README.md
@@ -39,6 +39,47 @@ For other options, see the help message.
 python -m src.run_algo --help
 ```
 
+### Implementing a runtime
+
+The algorithm generally works in three phases:
+
+1. Get the prior data for that year and any number of prior years.
+2. Run the label propagation algorithm
+3. Update the posterior data for that year
+
+A backend then needs to implement steps 1 and 3.
+
+You need to implement the following functions
+
+```python
+MaybeSparseMatrix = Union[np.ndarray, sp.spmatrix]
+
+def get_data(
+    year: int,
+    logger: logging.Logger
+) -> Tuple[MaybeSparseMatrix, np.ndarray, np.ndarray]:
+```
+
+This function accepts a year and a logger and returns a tuple of the following:
+- The adjacency matrix
+- The auids
+- The prior for the auids
+
+The second function you need to implement is
+
+```python
+def update_posterior(
+    auids: np.ndarray,
+    posterior_y_value: np.ndarray,
+    year: int,
+    logger: logging.Logger,
+) -> None:
+```
+
+This function accepts the auids, the posterior_y_value, the year, and a
+logger, and updates the posterior values for that year. It's important to
+note that if you parse the graph in pieces of disconnected sets, this will
+update the same file multiple times.
 
 ### TODO
 
diff --git a/src/backend/elsevier.py b/src/backend/elsevier.py
@@ -20,13 +20,14 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import logging
 from typing import Tuple
 
 import numpy as np
 import scipy.sparse as sparse
 
 
-def get_data(year: int) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
+def get_data(year: int, logger: logging.Logger) -> Tuple[sparse.csr_matrix, np.ndarray, np.ndarray]:
     raise NotImplementedError("Not implemented in the Elsevier backend.")
 
 
diff --git a/src/backend/sciserver.py b/src/backend/sciserver.py
@@ -26,11 +26,20 @@
 import itertools
 import logging
 import os
+import warnings
 from typing import Callable, Iterable, Iterator, List, Tuple, Union
 
 import numpy as np
 import pandas as pd
 import scipy.sparse as sparse
+try:
+    from pandarallel import pandarallel
+    pandarallel.initialize(progress_bar=True)
+    parallel_apply = True
+except ImportError:
+    warnings.warn("pandarallel not installed, parallel processing will not be available.")
+    parallel_apply = False
+
 
 import src.utils.log_time as log_time
 
@@ -59,12 +68,8 @@ def default_combine_posterior_prior_y_func(arrs: List[np.ndarray]) -> np.ndarray
     if not all(arr.shape[0] == length for arr in arrs):
         raise ValueError("All arrays must be same length.")
 
-    print("arr shapes:", [arr.shape for arr in arrs])
-
     outs = np.nanmean(np.stack(arrs, axis=1), axis=1)
 
-    print("out shape:", outs.shape)
-
     return np.nanmean(np.stack(arrs, axis=1), axis=1)
 
 
@@ -197,9 +202,14 @@ def calculate_prior_y_from_eids(
 
     selected_eids = auid_eids[auids]
 
-    y = selected_eids.apply(
-        lambda eids: agg_score_func(eid_score[eids])
-    ).astype(eid_score.dtype)
+    if len(selected_eids) > MIN_ARR_SIZE_FOR_CACHE and parallel_apply:
+        y = selected_eids.parallel_apply(
+            lambda eids: agg_score_func(eid_score[eids])
+        ).astype(eid_score.dtype)
+    else:
+        y = selected_eids.apply(
+            lambda eids: agg_score_func(eid_score[eids])
+        ).astype(eid_score.dtype)
 
     return y
 
@@ -384,6 +394,7 @@ def update_posterior(
     auids: np.ndarray,
     posterior_y_values: np.ndarray,
     year: int,
+    logger: logging.Logger,
 ) -> None:
 
     posterior_path = POSTIEOR_DATA_PATH.format(year=year)
diff --git a/src/run_algo.py b/src/run_algo.py
@@ -73,11 +73,10 @@ def run_algo_year(
     for i, (algo, (A, auids, prior_y)) in enumerate(
         zip(algo_instances, get_data_func(year, logger)), start=1
     ):
-        print(i, A.shape, auids.shape, prior_y.shape)
         with log_time.LogTime(f"Fitting data for {year}, ajd matrix {i}", logger):
             posterior_y = algo.fit_predict_graph(A, prior_y)
         with log_time.LogTime(f"Updating posterior for {year}", logger):
-            posterior_update_func(auids, posterior_y, year)
+            posterior_update_func(auids, posterior_y, year, logger)
 
 
 def main(args: Dict[str, Any]):
@@ -98,7 +97,7 @@ def main(args: Dict[str, Any]):
         get_data_func = functools.partial(
             sciserver.get_data,
             prior_y_aggregate_eid_score_func=np.mean,
-            combine_posterior_prior_y_func=functools.partial(np.mean, axis=1),
+            combine_posterior_prior_y_func=sciserver.default_combine_posterior_prior_y_func,
             operate_on_subgraphs_separately=args.get("parse_subgraphs_separately"),
         )
         posterior_update_func = sciserver.update_posterior
diff --git a/tests/test_sciserver.py b/tests/test_sciserver.py
@@ -22,6 +22,7 @@
 
 """Testing for the SciServer backend."""
 
+import logging
 import os
 
 import numpy as np
@@ -252,7 +253,7 @@ def test_update_posterior_nothing_exists():
     os.makedirs("./tmp", exist_ok=True)
     ss.POSTIEOR_DATA_PATH = "./tmp/posterior_{year}.parquet"
 
-    ss.update_posterior(auids, posterior_y, year)
+    ss.update_posterior(auids, posterior_y, year, logging.getLogger("test"))
 
     df = pd.read_parquet(ss.POSTIEOR_DATA_PATH.format(year=2020))
 
@@ -282,7 +283,7 @@ def test_update_posterior_something_exists():
     auids = [4, 5, 6]
     posterior_y = np.array([0.6, 0.5, 0.4])
 
-    ss.update_posterior(auids, posterior_y, year)
+    ss.update_posterior(auids, posterior_y, year, logging.getLogger("test"))
 
     df = pd.read_parquet(ss.POSTIEOR_DATA_PATH.format(year=2020))