Skip to content

Commit 103362b

Browse files
committed
Added a `--batch_size` option (default 3) to JavaExtractor/extract.py so that projects are extracted in batches instead of all being submitted to the worker pool at once. Each batch is given a 60-second timeout; a batch that exceeds it is skipped.
1 parent d68f07f commit 103362b

File tree

1 file changed

+21
-31
lines changed

1 file changed

+21
-31
lines changed

JavaExtractor/extract.py

+21-31
Original file line numberDiff line numberDiff line change
@@ -3,30 +3,22 @@
33
import itertools
44
import multiprocessing
55
import os
6-
import sys
76
import shutil
87
import subprocess
98
from threading import Timer
10-
import sys
119
from argparse import ArgumentParser
12-
from subprocess import Popen, PIPE, STDOUT, call
13-
1410

1511
def get_immediate_subdirectories(a_dir):
    """Return the paths of the directories located directly inside ``a_dir``.

    Each returned path is ``a_dir`` joined with the entry name. Entries that
    are not directories are left out. The result follows ``os.listdir``
    order, which is arbitrary.
    """
    # Join each entry once and reuse the path for both the check and the result.
    candidates = (os.path.join(a_dir, name) for name in os.listdir(a_dir))
    return [path for path in candidates if os.path.isdir(path)]
1814

19-
2015
def ParallelExtractDir(args, tmpdir, dir_):
    """Worker entry point: extract features for ``dir_`` with an empty prefix.

    A module-level wrapper so that the ``multiprocessing`` pool can be handed
    ``(args, tmpdir, dir_)`` tuples; it forwards them to
    ``ExtractFeaturesForDir`` unchanged.
    """
    ExtractFeaturesForDir(args, tmpdir, dir_, "")
2317

2418
def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
2519
command = ['java', '-cp', args.jar, 'JavaExtractor.App',
2620
'--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width),
2721
'--dir', dir_, '--num_threads', str(args.num_threads)]
28-
# print command
29-
# os.system(command)
3022
kill = lambda process: process.kill()
3123
outputFileName = tmpdir + prefix + dir_.split('/')[-1]
3224
failed = False
@@ -39,36 +31,35 @@ def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
3931
finally:
4032
timer.cancel()
4133

42-
if sleeper.poll() == 0:
43-
if len(stderr) > 0:
44-
print(sys.stderr, stderr, file=sys.stdout)
45-
else:
46-
print(sys.stderr, 'dir: ' + str(dir_) + ' was not completed in time', file=sys.stdout, flush=True)
34+
if sleeper.poll() != 0:
4735
failed = True
4836
subdirs = get_immediate_subdirectories(dir_)
4937
for subdir in subdirs:
5038
ExtractFeaturesForDir(args, subdir, prefix + dir_.split('/')[-1] + '_')
51-
if failed:
52-
if os.path.exists(outputFileName):
53-
os.remove(outputFileName)
54-
39+
40+
if failed and os.path.exists(outputFileName):
41+
os.remove(outputFileName)
5542

5643
def ExtractFeaturesForDirsList(args, dirs, *, timeout_seconds=60, pool_size=4):
    """Extract features for ``dirs`` in batches and stream the results to stdout.

    The directories are processed ``args.batch_size`` at a time. Each batch is
    run on a fresh worker pool and must finish within ``timeout_seconds``.
    After a successful batch, every file the workers left in the temporary
    directory is written to stdout with ``cat`` and then deleted.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.batch_size`` and passes ``args`` on to the workers.
        dirs: Sequence of directory paths to extract.
        timeout_seconds: Maximum wall-clock time allowed per batch.
        pool_size: Number of worker processes per batch.

    Raises:
        ValueError: If ``args.batch_size`` is smaller than 1.
    """
    # Local import: the module no longer imports ``sys`` at file level.
    import sys

    if args.batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {args.batch_size}")

    tmp_dir = f"./tmp/feature_extractor{os.getpid()}/"
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir, ignore_errors=True)
    os.makedirs(tmp_dir)

    try:
        for start in range(0, len(dirs), args.batch_size):
            batch_dirs = dirs[start:start + args.batch_size]
            try:
                # Leaving the ``with`` block terminates the pool, so workers
                # of a timed-out batch do not keep running in the background.
                with multiprocessing.Pool(pool_size) as pool:
                    pending = pool.starmap_async(
                        ParallelExtractDir,
                        zip(itertools.repeat(args),
                            itertools.repeat(tmp_dir),
                            batch_dirs))
                    pending.get(timeout=timeout_seconds)
            except multiprocessing.TimeoutError:
                print(f"batch {batch_dirs} was not completed in "
                      f"{timeout_seconds}s; skipping it",
                      file=sys.stderr, flush=True)
                # Files left by the interrupted workers may be truncated.
                # Drop them so they are not emitted with a later batch.
                shutil.rmtree(tmp_dir, ignore_errors=True)
                os.makedirs(tmp_dir, exist_ok=True)
                continue

            for name in os.listdir(tmp_dir):
                path = os.path.join(tmp_dir, name)
                # Argument list instead of a shell string: safe for file
                # names that contain spaces or shell metacharacters.
                subprocess.call(['cat', path])
                os.remove(path)
    finally:
        # Always remove the per-process scratch directory.
        shutil.rmtree(tmp_dir, ignore_errors=True)
7263

7364
if __name__ == '__main__':
7465
parser = ArgumentParser()
@@ -78,6 +69,9 @@ def ExtractFeaturesForDirsList(args, dirs):
7869
parser.add_argument("-j", "--jar", dest="jar", required=True)
7970
parser.add_argument("-dir", "--dir", dest="dir", required=False)
8071
parser.add_argument("-file", "--file", dest="file", required=False)
72+
# add a new parameter batch_size
73+
parser.add_argument("-batch_size", "--batch_size", dest="batch_size", required=False, default=3, type=int)
74+
8175
args = parser.parse_args()
8276

8377
if args.file is not None:
@@ -86,9 +80,5 @@ def ExtractFeaturesForDirsList(args, dirs):
8680
os.system(command)
8781
elif args.dir is not None:
8882
subdirs = get_immediate_subdirectories(args.dir)
89-
to_extract = subdirs
90-
if len(subdirs) == 0:
91-
to_extract = [args.dir.rstrip('/')]
83+
to_extract = subdirs if subdirs else [args.dir.rstrip('/')]
9284
ExtractFeaturesForDirsList(args, to_extract)
93-
94-

0 commit comments

Comments
 (0)