Commit efc6bae
Remove redundant infra naming
1 parent 8d850df

35 files changed, +263 -263 lines changed
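
This commit mechanically renames the helper modules `infra.dask_infra` to `infra.dask` and `infra.pd_infra` to `infra.pd` at every call site; only a subset of the 35 changed files is shown below. A repo-wide rename of this kind could be applied with a small script along the following lines. The script is purely illustrative (its name and the assumption that only these two module paths change are not part of the commit).

# rename_infra_modules.py -- illustrative sketch only; the commit contains just
# the resulting diff, and this helper is not part of the repository.
import pathlib

# Old module path -> new module path (the two renames this commit performs).
RENAMES = {
    "infra.dask_infra": "infra.dask",
    "infra.pd_infra": "infra.pd",
}

def rename_imports(repo_root="."):
    """Rewrite every .py file under repo_root, replacing old module paths with new ones."""
    for path in pathlib.Path(repo_root).rglob("*.py"):
        text = path.read_text()
        updated = text
        for old, new in RENAMES.items():
            updated = updated.replace(old, new)
        if updated != text:
            path.write_text(updated)
            print(f"rewrote {path}")

if __name__ == "__main__":
    rename_imports()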

annotate_category_org_local.py (+9 -9)

@@ -6,7 +6,7 @@
 import ipaddress
 import os
 
-import infra.dask_infra
+import infra.dask
 import mappers.domains
 import infra.platform
 
@@ -19,7 +19,7 @@ def _categorize_user(in_path, out_path):
 
     Requires the input parquet file specify an `fqdn` column, protocol, and ports
     """
-    frame = infra.dask_infra.read_parquet(in_path)
+    frame = infra.dask.read_parquet(in_path)
 
     # First pass assign by FQDN
     processor = mappers.domains.FqdnProcessor()
@@ -65,7 +65,7 @@ def _categorize_user(in_path, out_path):
         axis="columns",
         meta=("local", bool))
 
-    return infra.dask_infra.clean_write_parquet(frame, out_path, compute=False)
+    return infra.dask.clean_write_parquet(frame, out_path, compute=False)
 
 
 def _assign_org_from_ip(ip, current):
@@ -144,7 +144,7 @@ def _augment_user_flows_with_stun_state(in_path, out_path):
 
     Will not be correct unless the flow is indexed by time
     """
-    flow_frame = infra.dask_infra.read_parquet(in_path)
+    flow_frame = infra.dask.read_parquet(in_path)
     # Bookeeping for building a new frame
     max_rows_per_division = 100000
     out_chunk = list()
@@ -231,7 +231,7 @@ def _augment_user_flows_with_stun_state(in_path, out_path):
         force=True)
     out_frame = out_frame.categorize(columns=["fqdn_source", "org", "category"])
 
-    infra.dask_infra.clean_write_parquet(out_frame, out_path)
+    infra.dask.clean_write_parquet(out_frame, out_path)
     print("Finished writing user", in_path)
 
 
@@ -285,7 +285,7 @@ def merge_parquet_frames(in_parent_directory, out_frame_path):
     div_on_disk = sorted(os.listdir(in_parent_directory))
     for div in div_on_disk:
         div_path = os.path.join(in_parent_directory, div)
-        frame = infra.dask_infra.read_parquet(div_path)
+        frame = infra.dask.read_parquet(div_path)
 
         if merged_frame is None:
             merged_frame = frame
@@ -299,11 +299,11 @@ def merge_parquet_frames(in_parent_directory, out_frame_path):
         force=True
     )
 
-    infra.dask_infra.clean_write_parquet(merged_frame, out_frame_path)
+    infra.dask.clean_write_parquet(merged_frame, out_frame_path)
 
 
 def _print_heavy_hitter_unmapped_domains(infile):
-    df = infra.dask_infra.read_parquet(infile)
+    df = infra.dask.read_parquet(infile)
 
     unmapped = df.loc[((df["org"] == "Unknown (Not Mapped)") | (df["category"] == "Unknown (Not Mapped)"))]
     df = unmapped.groupby("fqdn").sum()
@@ -328,7 +328,7 @@ def _print_heavy_hitter_unmapped_domains(infile):
 
     if platform.large_compute_support:
         print("To see execution status, check out the dask status page at localhost:8787 while the computation is running.")
-        client = infra.dask_infra.setup_platform_tuned_dask_client(20, platform)
+        client = infra.dask.setup_platform_tuned_dask_client(20, platform)
 
         #augment_all_user_flows(in_parent_directory, annotated_parent_directory, client)
         stun_augment_all_user_flows(annotated_parent_directory, stun_annotated_parent_directory, client)
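
The body of the renamed `infra/dask.py` module is not part of this diff; only its call sites are. From those call sites it appears to expose at least `read_parquet`, `clean_write_parquet`, `setup_dask_client`, and `setup_platform_tuned_dask_client(per_worker_memory_GB, platform)`. A minimal sketch of thin wrappers with that shape, assuming they simply delegate to dask.dataframe and dask.distributed, might look like the following; the bodies are illustrative guesses, not the repository's actual code.

# infra/dask.py -- illustrative sketch of the wrapper interface implied by the
# call sites above; the real implementation is not shown in this commit.
import shutil

import dask.dataframe
from dask.distributed import Client, LocalCluster


def read_parquet(path):
    # Thin passthrough to dask.dataframe (assumption).
    return dask.dataframe.read_parquet(path)


def clean_write_parquet(dataframe, path, compute=True):
    # "Clean" presumably means removing any stale output directory first (assumption).
    shutil.rmtree(path, ignore_errors=True)
    return dataframe.to_parquet(path, compute=compute)


def setup_dask_client():
    # Default local client (assumption).
    return Client()


def setup_platform_tuned_dask_client(per_worker_memory_GB, platform):
    # Size a local cluster from the platform description (assumption).
    cluster = LocalCluster(
        n_workers=getattr(platform, "max_workers", 4),
        memory_limit=f"{per_worker_memory_GB}GB",
    )
    return Client(cluster)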

bytes_per_category.py (+9 -9)

@@ -5,8 +5,8 @@
 import pandas as pd
 
 import infra.constants
-import infra.dask_infra
-import infra.pd_infra
+import infra.dask
+import infra.pd
 import infra.platform
 
 
@@ -18,7 +18,7 @@
 
 
 def reduce_to_pandas(outfile, dask_client):
-    flows = infra.dask_infra.read_parquet(
+    flows = infra.dask.read_parquet(
         "data/clean/flows/typical_fqdn_org_category_local_TM_DIV_none_INDEX_start")[["category", "org", "bytes_up", "bytes_down", "protocol", "dest_port"]]
 
     # Compress to days
@@ -30,11 +30,11 @@ def reduce_to_pandas(outfile, dask_client):
     flows = flows.groupby(["start_bin", "category", "org"]).sum()
     flows = flows.compute()
 
-    infra.pd_infra.clean_write_parquet(flows, outfile)
+    infra.pd.clean_write_parquet(flows, outfile)
 
 
 def make_category_plot(infile):
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
     grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows["bytes_down"]
 
@@ -121,7 +121,7 @@ def make_category_plot(infile):
 
 
 def make_category_aggregate_bar_chart(infile):
-    grouped_flows = infra.pd_infra.read_parquet(infile).reset_index()
+    grouped_flows = infra.pd.read_parquet(infile).reset_index()
 
     # Consolidate by week instead of by day
     grouped_flows = grouped_flows[
@@ -167,7 +167,7 @@ def make_category_aggregate_bar_chart(infile):
 
 
 def compute_stats(infile, dimension):
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
     grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows["bytes_down"]
 
@@ -193,7 +193,7 @@ def compute_stats(infile, dimension):
 def make_org_plot(infile):
     """ Generate plots to explore the traffic distribution across organizations
     """
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
     grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows["bytes_down"]
 
@@ -345,7 +345,7 @@ def make_org_plot(infile):
 
     graph_temporary_file = "scratch/graphs/bytes_per_category"
     if platform.large_compute_support:
-        client = infra.dask_infra.setup_platform_tuned_dask_client(10, platform)
+        client = infra.dask.setup_platform_tuned_dask_client(10, platform)
         reduce_to_pandas(outfile=graph_temporary_file, dask_client=client)
         client.close()
 
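
This file also renames the pandas-side helper, `infra.pd_infra` to `infra.pd`. As with `infra.dask`, the module itself is outside this diff; the call sites imply `read_parquet` and `clean_write_parquet` wrappers over pandas. A sketch under that assumption follows; the bodies are illustrative, not the repository's actual code.

# infra/pd.py -- illustrative sketch of the pandas-side counterpart implied by
# the call sites; the actual implementation is not part of this commit.
import os

import pandas as pd


def read_parquet(path):
    # Thin passthrough to pandas (assumption).
    return pd.read_parquet(path)


def clean_write_parquet(dataframe, path):
    # Remove any stale output file before writing the new one (assumption).
    if os.path.exists(path):
        os.remove(path)
    dataframe.to_parquet(path)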

bytes_per_category_per_quantile.py (+7 -7)

@@ -4,32 +4,32 @@
 import altair as alt
 import pandas as pd
 
-import infra.dask_infra
-import infra.pd_infra
+import infra.dask
+import infra.pd
 import infra.platform
 
 
 def reduce_to_pandas(outfile, dask_client):
-    flows = infra.dask_infra.read_parquet(
+    flows = infra.dask.read_parquet(
         "data/clean/flows/typical_fqdn_org_category_local_TM_DIV_none_INDEX_start")[["user", "category", "org", "bytes_up", "bytes_down"]]
 
     # Do the grouping
     flows = flows.groupby(["user", "category", "org"]).sum()
     flows = flows.compute()
 
-    infra.pd_infra.clean_write_parquet(flows, outfile)
+    infra.pd.clean_write_parquet(flows, outfile)
 
 
 def make_category_quantiles_plots(infile):
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
     grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows["bytes_down"]
     user_category_total = grouped_flows[["user", "category", "bytes_total"]].groupby(
         ["user", "category"]
     ).sum().reset_index()
 
     # Filter users by time in network to eliminate early incomplete samples
-    user_active_ranges = infra.pd_infra.read_parquet(
+    user_active_ranges = infra.pd.read_parquet(
         "data/clean/user_active_deltas.parquet")[["user", "days_since_first_active", "days_active", "days_online"]]
     # Drop users that joined less than a week ago.
     users_to_analyze = user_active_ranges.loc[
@@ -207,7 +207,7 @@ def make_category_quantiles_plots(infile):
 
     if platform.large_compute_support:
         print("Running compute tasks")
-        client = infra.dask_infra.setup_platform_tuned_dask_client(10, platform)
+        client = infra.dask.setup_platform_tuned_dask_client(10, platform)
         reduce_to_pandas(outfile=graph_temporary_file, dask_client=client)
         client.close()
 

bytes_per_category_per_user.py (+7 -7)

@@ -4,32 +4,32 @@
 import altair as alt
 import pandas as pd
 
-import infra.dask_infra
-import infra.pd_infra
+import infra.dask
+import infra.pd
 import infra.platform
 
 
 def reduce_to_pandas(outfile, dask_client):
-    flows = infra.dask_infra.read_parquet(
+    flows = infra.dask.read_parquet(
         "data/clean/flows/typical_fqdn_org_category_local_TM_DIV_none_INDEX_start")[["user", "category", "org", "bytes_up", "bytes_down"]]
 
     # Do the grouping
     flows = flows.groupby(["user", "category", "org"]).sum()
     flows = flows.compute()
 
-    infra.pd_infra.clean_write_parquet(flows, outfile)
+    infra.pd.clean_write_parquet(flows, outfile)
 
 
 def make_category_per_user_plots(infile):
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
     grouped_flows["bytes_total"] = grouped_flows["bytes_up"] + grouped_flows["bytes_down"]
     user_category_total = grouped_flows[["user", "category", "bytes_total"]].groupby(
         ["user", "category"]
     ).sum().reset_index()
 
     # Filter users by time in network to eliminate early incomplete samples
-    user_active_ranges = infra.pd_infra.read_parquet(
+    user_active_ranges = infra.pd.read_parquet(
         "data/clean/user_active_deltas.parquet")[["user", "days_since_first_active", "days_active", "days_online"]]
     # Drop users that joined less than a week ago.
     users_to_analyze = user_active_ranges.loc[
@@ -151,7 +151,7 @@ def make_category_per_user_plots(infile):
 
     if platform.large_compute_support:
         print("Running compute tasks")
-        client = infra.dask_infra.setup_platform_tuned_dask_client(10, platform)
+        client = infra.dask.setup_platform_tuned_dask_client(10, platform)
         reduce_to_pandas(outfile=graph_temporary_file, dask_client=client)
         client.close()
 

bytes_per_day_of_week.py (+10 -10)

@@ -2,18 +2,18 @@
 import numpy as np
 import pandas as pd
 
-import infra.dask_infra
-import infra.pd_infra
+import infra.dask
+import infra.pd
 import infra.platform
 
 
 def create_all_flows(dask_client):
-    typical_flows = infra.dask_infra.read_parquet(
+    typical_flows = infra.dask.read_parquet(
         "data/clean/flows/typical_fqdn_category_local_TM_DIV_none_INDEX_start")[["end", "protocol", "bytes_up", "bytes_down"]]
 
-    p2p_flows = infra.dask_infra.read_parquet("data/clean/flows/p2p_TM_DIV_none_INDEX_start")[["end", "protocol", "bytes_a_to_b", "bytes_b_to_a"]]
+    p2p_flows = infra.dask.read_parquet("data/clean/flows/p2p_TM_DIV_none_INDEX_start")[["end", "protocol", "bytes_a_to_b", "bytes_b_to_a"]]
 
-    nouser_flows = infra.dask_infra.read_parquet("data/clean/flows/nouser_TM_DIV_none_INDEX_start")[["end", "protocol", "bytes_a_to_b", "bytes_b_to_a"]]
+    nouser_flows = infra.dask.read_parquet("data/clean/flows/nouser_TM_DIV_none_INDEX_start")[["end", "protocol", "bytes_a_to_b", "bytes_b_to_a"]]
 
     typical_flows["bytes_total"] = typical_flows["bytes_up"] + typical_flows["bytes_down"]
     p2p_flows["bytes_total"] = p2p_flows["bytes_a_to_b"] + p2p_flows["bytes_b_to_a"]
@@ -25,11 +25,11 @@ def create_all_flows(dask_client):
 
     all_flows = typical_flows.append(p2p_flows).append(nouser_flows)
     all_flows = all_flows.set_index("start").repartition(partition_size="128M", force=True)
-    infra.dask_infra.clean_write_parquet(all_flows, "data/clean/flows/all_TM_DIV_none_INDEX_start")
+    infra.dask.clean_write_parquet(all_flows, "data/clean/flows/all_TM_DIV_none_INDEX_start")
 
 
 def reduce_to_pandas(outfile, dask_client):
-    flows = infra.dask_infra.read_parquet(
+    flows = infra.dask.read_parquet(
         "data/clean/flows/all_TM_DIV_none_INDEX_start")[["bytes_total"]]
 
     # Compress to days
@@ -43,11 +43,11 @@ def reduce_to_pandas(outfile, dask_client):
 
     flows = flows.compute()
 
-    infra.pd_infra.clean_write_parquet(flows, outfile)
+    infra.pd.clean_write_parquet(flows, outfile)
 
 
 def make_plot(infile):
-    grouped_flows = infra.pd_infra.read_parquet(infile)
+    grouped_flows = infra.pd.read_parquet(infile)
     grouped_flows = grouped_flows.reset_index()
 
     days = ['Monday', 'Tuesday', 'Wednesday',
@@ -151,7 +151,7 @@ def make_plot(infile):
 
     if platform.large_compute_support:
         print("Performing compute operations")
-        client = infra.dask_infra.setup_dask_client()
+        client = infra.dask.setup_dask_client()
         # create_all_flows(client)
         reduce_to_pandas(outfile=graph_temporary_file, dask_client=client)
         client.close()

bytes_per_online_day_per_user.py (+7 -7)

@@ -5,13 +5,13 @@
 import numpy as np
 import pandas as pd
 
-import infra.dask_infra
-import infra.pd_infra
+import infra.dask
+import infra.pd
 import infra.platform
 
 
 def reduce_to_pandas(outpath, dask_client):
-    flows = infra.dask_infra.read_parquet(
+    flows = infra.dask.read_parquet(
         "data/clean/flows/typical_fqdn_org_category_local_TM_DIV_none_INDEX_start")[["user", "bytes_up", "bytes_down"]]
 
     flows["bytes_total"] = flows["bytes_up"] + flows["bytes_down"]
@@ -27,7 +27,7 @@ def reduce_to_pandas(outpath, dask_client):
     flows = flows.reset_index()[["start_bin", "user", "bytes_total"]]
     flows = flows.compute()
 
-    infra.pd_infra.clean_write_parquet(flows, outpath)
+    infra.pd.clean_write_parquet(flows, outpath)
 
 
 def compute_cdf(frame, value_column, base_column):
@@ -40,13 +40,13 @@ def compute_cdf(frame, value_column, base_column):
 
 
 def make_plot(inpath):
-    flows = infra.pd_infra.read_parquet(inpath)
+    flows = infra.pd.read_parquet(inpath)
     flows = flows.reset_index()
     flows["MB"] = flows["bytes_total"] / (1000**2)
     user_total = flows[["user", "MB"]]
     user_total = user_total.groupby(["user"]).sum().reset_index()
 
-    activity = infra.pd_infra.read_parquet("data/clean/user_active_deltas.parquet")
+    activity = infra.pd.read_parquet("data/clean/user_active_deltas.parquet")
 
     df = user_total.merge(activity[["user", "days_online", "optimistic_days_online", "days_active"]], on="user")
     df["MB_per_online_day"] = df["MB"] / df["days_online"]
@@ -104,7 +104,7 @@ def make_plot(inpath):
 
     if platform.large_compute_support:
         print("Running compute subcommands")
-        client = infra.dask_infra.setup_platform_tuned_dask_client(per_worker_memory_GB=10, platform=platform)
+        client = infra.dask.setup_platform_tuned_dask_client(per_worker_memory_GB=10, platform=platform)
         reduce_to_pandas(outpath=graph_temporary_file, dask_client=client)
         client.close()
 
