-
Notifications
You must be signed in to change notification settings - Fork 287
/
Copy pathextract.py
84 lines (72 loc) · 3.5 KB
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/python
import itertools
import multiprocessing
import os
import shutil
import subprocess
from threading import Timer
from argparse import ArgumentParser
def get_immediate_subdirectories(a_dir):
    """Return the paths of the directories located directly inside ``a_dir``.

    Each returned path is ``a_dir`` joined with the entry name, in the order
    ``os.listdir`` yields the entries. Entries that are not directories are
    skipped; nested levels are not visited.
    """
    subdirectories = []
    for entry_name in os.listdir(a_dir):
        entry_path = os.path.join(a_dir, entry_name)
        if os.path.isdir(entry_path):
            subdirectories.append(entry_path)
    return subdirectories
def ParallelExtractDir(args, tmpdir, dir_):
    """Extract features for ``dir_`` using an empty output-file prefix.

    Three-argument wrapper around ``ExtractFeaturesForDir`` so it can be
    mapped over ``(args, tmpdir, dir_)`` tuples by a process pool.
    """
    no_prefix = ""
    ExtractFeaturesForDir(args, tmpdir, dir_, no_prefix)
def ExtractFeaturesForDir(args, tmpdir, dir_, prefix):
    """Run the Java extractor on ``dir_`` and capture its stdout in a file.

    The output file is ``tmpdir + prefix + <last path component of dir_>``
    and is opened in append mode. If the extractor finishes with a non-zero
    exit status (which includes being killed by the watchdog timer), each
    immediate subdirectory of ``dir_`` is processed recursively with an
    extended prefix, and the failed directory's own output file is deleted.

    Args:
        args: Parsed options; ``jar``, ``max_path_length``,
            ``max_path_width`` and ``num_threads`` are read.
        tmpdir: Location of the output files. It is concatenated as a plain
            string, so it must already end with a path separator.
        dir_: Directory handed to the extractor via ``--dir``.
        prefix: String prepended to the output file name.
    """
    command = ['java', '-cp', args.jar, 'JavaExtractor.App',
               '--max_path_length', str(args.max_path_length),
               '--max_path_width', str(args.max_path_width),
               '--dir', dir_, '--num_threads', str(args.num_threads)]

    # basename(normpath(...)) tolerates a trailing separator and the
    # platform's own separator, unlike dir_.split('/')[-1].
    dir_name = os.path.basename(os.path.normpath(dir_))
    output_path = tmpdir + prefix + dir_name

    with open(output_path, 'a') as output_file:
        process = subprocess.Popen(command, stdout=output_file, stderr=subprocess.PIPE)
        # Watchdog: kill the extractor if it is still running after the
        # interval (threading.Timer takes seconds).
        watchdog = Timer(600000, process.kill)
        try:
            watchdog.start()
            # communicate() waits for exit and drains the stderr pipe so the
            # child cannot block on a full pipe; the captured text is unused.
            process.communicate()
        finally:
            watchdog.cancel()

    if process.poll() == 0:
        return

    # The directory as a whole failed: retry its subdirectories one by one.
    # tmpdir must be forwarded here; the recursive call needs all four arguments.
    for subdir in get_immediate_subdirectories(dir_):
        ExtractFeaturesForDir(args, tmpdir, subdir, prefix + dir_name + '_')

    # Drop the partial output of the failed run.
    if os.path.exists(output_path):
        os.remove(output_path)
def ExtractFeaturesForDirsList(args, dirs):
    """Extract features for ``dirs`` in batches and print the results.

    A per-process scratch directory ``./tmp/feature_extractor<pid>/`` is
    (re)created. ``dirs`` is processed in slices of ``args.batch_size``; each
    slice is mapped over ``ParallelExtractDir`` in a fresh 4-worker pool.
    After a batch completes, every file in the scratch directory is written
    to stdout with ``cat`` and then deleted. A batch that does not finish
    within the timeout is skipped.

    Args:
        args: Parsed options; ``batch_size`` is read here and the object is
            forwarded unchanged to the workers.
        dirs: Sequence of directory paths to process.

    NOTE(review): when a batch times out, files it already wrote stay in the
    scratch directory and are emitted after the next batch that completes --
    confirm this is intended.
    """
    tmp_dir = f"./tmp/feature_extractor{os.getpid()}/"
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir, ignore_errors=True)
    # exist_ok: rmtree above ignores errors, so the directory may survive.
    os.makedirs(tmp_dir, exist_ok=True)

    timeout_seconds = 60  # upper bound for one batch, passed to AsyncResult.get
    for start in range(0, len(dirs), args.batch_size):
        batch_dirs = dirs[start:start + args.batch_size]
        try:
            with multiprocessing.Pool(4) as pool:
                result = pool.starmap_async(
                    ParallelExtractDir,
                    zip(itertools.repeat(args), itertools.repeat(tmp_dir), batch_dirs))
                result.get(timeout=timeout_seconds)
        except multiprocessing.TimeoutError:
            # Batch took too long: skip emitting output for it.
            continue

        for file_name in os.listdir(tmp_dir):
            output_path = os.path.join(tmp_dir, file_name)
            # Argument list instead of a shell string, so file names with
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.call(['cat', output_path])
            os.remove(output_path)
if __name__ == '__main__':
    # Command-line entry point. With --file, the Java extractor is run once on
    # that file; otherwise, with --dir, features are extracted for each
    # immediate subdirectory of the directory (or for the directory itself
    # when it has no subdirectories). With neither option nothing is done.
    parser = ArgumentParser()
    parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
    parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
    parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
    parser.add_argument("-j", "--jar", dest="jar", required=True)
    parser.add_argument("-dir", "--dir", dest="dir", required=False)
    parser.add_argument("-file", "--file", dest="file", required=False)
    # Number of directories handed to the worker pool per batch.
    parser.add_argument("-batch_size", "--batch_size", dest="batch_size", required=False, default=3, type=int)
    args = parser.parse_args()

    if args.file is not None:
        # Argument list (no shell), so jar/file paths containing spaces or
        # shell metacharacters reach java unchanged.
        command = ['java', '-cp', args.jar, 'JavaExtractor.App',
                   '--max_path_length', str(args.max_path_length),
                   '--max_path_width', str(args.max_path_width),
                   '--file', args.file]
        subprocess.call(command)
    elif args.dir is not None:
        subdirs = get_immediate_subdirectories(args.dir)
        # Fall back to the directory itself when it has no subdirectories;
        # the trailing '/' is stripped before it is passed on.
        to_extract = subdirs if subdirs else [args.dir.rstrip('/')]
        ExtractFeaturesForDirsList(args, to_extract)