@@ -6,7 +6,7 @@
 import ipaddress
 import os

-import infra.dask_infra
+import infra.dask
 import mappers.domains
 import infra.platform

@@ -19,7 +19,7 @@ def _categorize_user(in_path, out_path):

     Requires that the input parquet file specify an `fqdn` column, protocol, and ports
     """
-    frame = infra.dask_infra.read_parquet(in_path)
+    frame = infra.dask.read_parquet(in_path)

     # First pass assign by FQDN
     processor = mappers.domains.FqdnProcessor()
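
For context, `infra.dask.read_parquet` (formerly `infra.dask_infra.read_parquet`) is presumably a thin project wrapper around dask's parquet reader. A minimal sketch, assuming exactly that:

    import dask.dataframe as dd

    def read_parquet(path):
        # Hypothetical wrapper: lazily load a (possibly partitioned)
        # parquet dataset as a dask DataFrame.
        return dd.read_parquet(path)
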
@@ -65,7 +65,7 @@ def _categorize_user(in_path, out_path):
         axis="columns",
         meta=("local", bool))

-    return infra.dask_infra.clean_write_parquet(frame, out_path, compute=False)
+    return infra.dask.clean_write_parquet(frame, out_path, compute=False)


 def _assign_org_from_ip(ip, current):
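
The `meta=("local", bool)` argument above names the column and dtype of the series the row-wise apply will produce, since dask builds its task graph lazily and cannot infer the output type without computing. A minimal sketch of the pattern, with a toy predicate standing in for the real one:

    import dask.dataframe as dd
    import pandas as pd

    df = dd.from_pandas(
        pd.DataFrame({"dest_ip": ["192.168.1.5", "8.8.8.8"]}),
        npartitions=1)

    # meta tells dask the result is a bool column named "local".
    df["local"] = df.apply(
        lambda row: row["dest_ip"].startswith("192.168."),  # toy predicate
        axis="columns",
        meta=("local", bool))

    print(df.compute())
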
@@ -144,7 +144,7 @@ def _augment_user_flows_with_stun_state(in_path, out_path):

     Will not be correct unless the flow is indexed by time
     """
-    flow_frame = infra.dask_infra.read_parquet(in_path)
+    flow_frame = infra.dask.read_parquet(in_path)
     # Bookkeeping for building a new frame
     max_rows_per_division = 100000
     out_chunk = list()
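
The bookkeeping above suggests rows are buffered into chunks of at most `max_rows_per_division`, with each full chunk becoming one partition of the output frame. A sketch of that pattern (the helper name is hypothetical):

    import dask.dataframe as dd
    import pandas as pd

    max_rows_per_division = 100000
    out_chunk = []       # rows buffered for the current division
    out_divisions = []   # completed single-partition dask frames

    def flush_chunk():
        # Turn the buffered rows into one dask partition and reset.
        global out_chunk
        if out_chunk:
            out_divisions.append(
                dd.from_pandas(pd.DataFrame(out_chunk), npartitions=1))
            out_chunk = []

    # ...append row dicts to out_chunk, calling flush_chunk() whenever
    # len(out_chunk) >= max_rows_per_division, and once more at the end;
    # dd.concat(out_divisions) then yields the combined frame.
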
@@ -231,7 +231,7 @@ def _augment_user_flows_with_stun_state(in_path, out_path):
         force=True)
     out_frame = out_frame.categorize(columns=["fqdn_source", "org", "category"])

-    infra.dask_infra.clean_write_parquet(out_frame, out_path)
+    infra.dask.clean_write_parquet(out_frame, out_path)
     print("Finished writing user", in_path)


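`categorize` converts the named string columns to categorical dtype, which shrinks the frame in memory and round-trips cleanly through parquet. `clean_write_parquet` is presumably a wrapper that clears stale output before calling dask's writer; a minimal sketch under that assumption:

    import shutil

    def clean_write_parquet(frame, path, compute=True):
        # Hypothetical wrapper: remove any stale output, then write.
        # With compute=False a delayed task is returned instead of
        # writing immediately.
        shutil.rmtree(path, ignore_errors=True)
        return frame.to_parquet(path, compute=compute)
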
@@ -285,7 +285,7 @@ def merge_parquet_frames(in_parent_directory, out_frame_path):
     div_on_disk = sorted(os.listdir(in_parent_directory))
     for div in div_on_disk:
         div_path = os.path.join(in_parent_directory, div)
-        frame = infra.dask_infra.read_parquet(div_path)
+        frame = infra.dask.read_parquet(div_path)

         if merged_frame is None:
             merged_frame = frame
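
The else branch elided here presumably appends each subsequent division to `merged_frame`; in dask that is typically `dd.concat`, with the `repartition(..., force=True)` visible in the next hunk keeping partition sizes bounded afterward. A sketch of the whole loop as a function (names assumed from context):

    import dask.dataframe as dd

    def merge_divisions(frames):
        # Fold an iterable of dask frames into one, as the loop above does.
        merged_frame = None
        for frame in frames:
            if merged_frame is None:
                merged_frame = frame
            else:
                merged_frame = dd.concat([merged_frame, frame])
        return merged_frame
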
@@ -299,11 +299,11 @@ def merge_parquet_frames(in_parent_directory, out_frame_path):
         force=True
     )

-    infra.dask_infra.clean_write_parquet(merged_frame, out_frame_path)
+    infra.dask.clean_write_parquet(merged_frame, out_frame_path)


 def _print_heavy_hitter_unmapped_domains(infile):
-    df = infra.dask_infra.read_parquet(infile)
+    df = infra.dask.read_parquet(infile)

     unmapped = df.loc[((df["org"] == "Unknown (Not Mapped)") | (df["category"] == "Unknown (Not Mapped)"))]
     df = unmapped.groupby("fqdn").sum()
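
With per-FQDN totals in hand, the heavy hitters are the rows with the largest traffic sums, which dask can extract with `nlargest`. A sketch, assuming a hypothetical `bytes_total` column in the summed frame:

    def print_heavy_hitters(df, n=10, value_column="bytes_total"):
        # df: dask frame of per-fqdn sums, as built above; value_column
        # is an assumed name for the summed traffic column.
        print(df.nlargest(n, value_column).compute())
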
@@ -328,7 +328,7 @@ def _print_heavy_hitter_unmapped_domains(infile):

     if platform.large_compute_support:
         print("To see execution status, check out the dask status page at localhost:8787 while the computation is running.")
-        client = infra.dask_infra.setup_platform_tuned_dask_client(20, platform)
+        client = infra.dask.setup_platform_tuned_dask_client(20, platform)

     #augment_all_user_flows(in_parent_directory, annotated_parent_directory, client)
     stun_augment_all_user_flows(annotated_parent_directory, stun_annotated_parent_directory, client)
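
`setup_platform_tuned_dask_client` is presumably where the distributed scheduler, and the localhost:8787 dashboard the print mentions, come from. A minimal sketch, assuming it sizes a local cluster from the worker count it is given:

    from dask.distributed import Client, LocalCluster

    def setup_platform_tuned_dask_client(n_workers, platform):
        # Hypothetical wrapper: start a local cluster sized for the host;
        # its dashboard is the localhost:8787 page noted above.
        cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
        return Client(cluster)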