3
3
import itertools
4
4
import multiprocessing
5
5
import os
6
- import sys
7
6
import shutil
8
7
import subprocess
9
8
from threading import Timer
10
- import sys
11
9
from argparse import ArgumentParser
12
- from subprocess import Popen , PIPE , STDOUT , call
13
-
14
10
15
11
def get_immediate_subdirectories(a_dir):
    """Return the paths of the directories located directly inside ``a_dir``.

    Every returned path is ``a_dir`` joined with the entry name. Entries that
    are not directories are left out, and the order is whatever
    ``os.listdir`` yields.
    """
    subdir_paths = []
    for entry_name in os.listdir(a_dir):
        candidate = os.path.join(a_dir, entry_name)
        if os.path.isdir(candidate):
            subdir_paths.append(candidate)
    return subdir_paths
18
14
19
-
20
15
def ParallelExtractDir(args, tmpdir, dir_):
    """Run feature extraction for one top-level directory.

    Forwards to ``ExtractFeaturesForDir`` with an empty output-name prefix,
    so the three-argument shape can be used as a pool work item.
    """
    no_prefix = ""
    ExtractFeaturesForDir(args, tmpdir, dir_, no_prefix)
23
17
24
18
def ExtractFeaturesForDir (args , tmpdir , dir_ , prefix ):
25
19
command = ['java' , '-cp' , args .jar , 'JavaExtractor.App' ,
26
20
'--max_path_length' , str (args .max_path_length ), '--max_path_width' , str (args .max_path_width ),
27
21
'--dir' , dir_ , '--num_threads' , str (args .num_threads )]
28
- # print command
29
- # os.system(command)
30
22
kill = lambda process : process .kill ()
31
23
outputFileName = tmpdir + prefix + dir_ .split ('/' )[- 1 ]
32
24
failed = False
@@ -39,36 +31,35 @@ def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
39
31
finally :
40
32
timer .cancel ()
41
33
42
- if sleeper .poll () == 0 :
43
- if len (stderr ) > 0 :
44
- print (sys .stderr , stderr , file = sys .stdout )
45
- else :
46
- print (sys .stderr , 'dir: ' + str (dir_ ) + ' was not completed in time' , file = sys .stdout , flush = True )
34
+ if sleeper .poll () != 0 :
47
35
failed = True
48
36
subdirs = get_immediate_subdirectories (dir_ )
49
37
for subdir in subdirs :
50
38
ExtractFeaturesForDir (args , subdir , prefix + dir_ .split ('/' )[- 1 ] + '_' )
51
- if failed :
52
- if os .path .exists (outputFileName ):
53
- os .remove (outputFileName )
54
-
39
+
40
+ if failed and os .path .exists (outputFileName ):
41
+ os .remove (outputFileName )
55
42
56
43
def ExtractFeaturesForDirsList(args, dirs, timeout_seconds=60, num_workers=4):
    """Extract features for ``dirs`` in batches and stream the results to stdout.

    The directories are processed ``args.batch_size`` at a time. Each batch
    runs ``ParallelExtractDir`` in a fresh worker pool; once the batch
    finishes, every file found in the per-process temporary directory is
    written to stdout with ``cat`` and then deleted.

    Args:
        args: Parsed command-line namespace. ``batch_size`` is read here; the
            whole namespace is forwarded to ``ParallelExtractDir``.
        dirs: Sequence of directory paths to extract.
        timeout_seconds: Maximum time to wait for one batch before it is
            abandoned.
        num_workers: Number of worker processes per batch.

    The temporary directory ``./tmp/feature_extractor<pid>/`` is always
    removed before returning, including when an exception propagates.
    """
    tmp_dir = f"./tmp/feature_extractor{os.getpid()}/"
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir, ignore_errors=True)
    os.makedirs(tmp_dir)

    # range() raises on a step of 0 and yields nothing for a negative step,
    # so fall back to one directory per batch for non-positive sizes.
    batch_size = max(1, args.batch_size)

    try:
        for start in range(0, len(dirs), batch_size):
            batch_dirs = dirs[start:start + batch_size]
            work_items = zip(
                itertools.repeat(args), itertools.repeat(tmp_dir), batch_dirs
            )
            try:
                # Leaving the ``with`` block terminates the pool, so a batch
                # that times out does not keep its workers around.
                with multiprocessing.Pool(num_workers) as pool:
                    pending = pool.starmap_async(ParallelExtractDir, work_items)
                    pending.get(timeout=timeout_seconds)
            except multiprocessing.TimeoutError:
                # Skip emitting for this batch. Files it left behind are
                # picked up by the next successful batch's listing below, or
                # discarded with the temporary directory at the end.
                continue

            for file_name in os.listdir(tmp_dir):
                output_path = os.path.join(tmp_dir, file_name)
                # Argument list instead of a shell string: file names with
                # spaces or shell metacharacters are passed through verbatim.
                subprocess.call(["cat", output_path])
                os.remove(output_path)
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
72
63
73
64
if __name__ == '__main__' :
74
65
parser = ArgumentParser ()
@@ -78,6 +69,9 @@ def ExtractFeaturesForDirsList(args, dirs):
78
69
parser .add_argument ("-j" , "--jar" , dest = "jar" , required = True )
79
70
parser .add_argument ("-dir" , "--dir" , dest = "dir" , required = False )
80
71
parser .add_argument ("-file" , "--file" , dest = "file" , required = False )
72
+ # add a new parameter batch_size
73
+ parser .add_argument ("-batch_size" , "--batch_size" , dest = "batch_size" , required = False , default = 3 , type = int )
74
+
81
75
args = parser .parse_args ()
82
76
83
77
if args .file is not None :
@@ -86,9 +80,5 @@ def ExtractFeaturesForDirsList(args, dirs):
86
80
os .system (command )
87
81
elif args .dir is not None :
88
82
subdirs = get_immediate_subdirectories (args .dir )
89
- to_extract = subdirs
90
- if len (subdirs ) == 0 :
91
- to_extract = [args .dir .rstrip ('/' )]
83
+ to_extract = subdirs if subdirs else [args .dir .rstrip ('/' )]
92
84
ExtractFeaturesForDirsList (args , to_extract )
93
-
94
-
0 commit comments