-
Notifications
You must be signed in to change notification settings - Fork 0
/
file_identical_check.py
executable file
·160 lines (135 loc) · 4.45 KB
/
file_identical_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/python
import sys
import getopt
import itertools
def get_file_refs_distribution(trace):
    """Print the entry count of every group in the trace, then the mean count.

    The trace is read as a sequence of groups: one header line whose second
    whitespace-separated field is an integer count, followed by that many
    entry lines.  Each count is printed to stdout on its own line; the mean
    count over all groups is written to stderr.

    Args:
        trace: iterable of text lines (for example an open file).

    Raises:
        IndexError: if a header line has fewer than two fields.
        ValueError: if the second header field is not an integer.
    """
    # Accept any iterable of lines; the header loop and the skip below must
    # share one iterator so that they advance through the trace together.
    trace = iter(trace)
    total_refs = 0
    total_files = 0
    for header in trace:
        fnum = int(header.split()[1])
        # Only the count matters here: advance past the group's entry lines
        # without building a list of them.
        next(itertools.islice(trace, fnum, fnum), None)
        print(fnum)
        total_files += 1
        total_refs += fnum
    # An empty trace has no groups; report 0 instead of dividing by zero.
    mean = 1.0 * total_refs / total_files if total_files else 0.0
    sys.stderr.write("mean = %8.5f\n" % mean)
def get_file_size(trace):
    """Print the size field of every entry in the trace, then the mean size.

    The trace is read as a sequence of groups: one header line whose second
    whitespace-separated field is an integer count, followed by that many
    entry lines.  The third field of each entry line is parsed as an integer
    size and printed to stdout; the mean over all entries is written to
    stderr.

    Args:
        trace: iterable of text lines (for example an open file).

    Raises:
        IndexError: if a header or entry line has too few fields.
        ValueError: if a count or size field is not an integer.
    """
    # Accept any iterable of lines; header and entry reads must share one
    # iterator so that they advance through the trace together.
    trace = iter(trace)
    total_size = 0
    count = 0
    for header in trace:
        fnum = int(header.split()[1])
        for entry in itertools.islice(trace, fnum):
            size = int(entry.split()[2])
            total_size += size
            count += 1
            print(size)
    # With no entries there is nothing to average; report 0 instead of
    # dividing by zero.
    mean = 1.0 * total_size / count if count else 0.0
    sys.stderr.write("mean = %.2f\n" % mean)
def check_each_identical_pair(sufs, names):
    """Count ordered pairs of distinct entries sharing a name or a suffix.

    Entry ``i`` is described by ``sufs[i]`` and ``names[i]``.  Every ordered
    pair ``(i, j)`` with ``i != j`` is considered, so each unordered pair is
    counted twice.

    Args:
        sufs: sequence of suffixes, one per entry (hashable values).
        names: sequence of names, one per entry (hashable values), the same
            length as ``sufs``.

    Returns:
        A tuple ``(same_name, same_suffix, pair_num)``: the number of ordered
        pairs with equal names, the number with equal suffixes, and the total
        number of ordered pairs.

    Raises:
        ValueError: if ``sufs`` and ``names`` differ in length.
    """
    from collections import Counter

    total = len(sufs)
    if len(names) != total:
        raise ValueError("sufs and names must have the same length")

    def matching_pairs(values):
        # A value occurring c times yields c * (c - 1) ordered pairs of
        # distinct positions, so counting occurrences replaces the
        # quadratic comparison of every position against every other.
        return sum(c * (c - 1) for c in Counter(values).values())

    pair_num = total * (total - 1)
    return (matching_pairs(names), matching_pairs(sufs), pair_num)
def check_name_and_type(trace):
    """Summarise how often entries of the same group share a name or suffix.

    The trace is read as a sequence of groups: one header line whose second
    whitespace-separated field is an integer count, followed by that many
    entry lines.  The fourth field of an entry line is taken as its name and
    the last field as its suffix.

    Three lines are written to stderr: a column header, the totals
    (``Logical`` = sum of the header counts, ``Physical`` = number of groups,
    ``Names`` / ``Suffix`` = distinct names / suffixes summed per group), and
    the fraction of within-group ordered pairs that share a name and that
    share a suffix.

    Args:
        trace: iterable of text lines (for example an open file).

    Raises:
        IndexError: if a header or entry line has too few fields.
        ValueError: if a header count is not an integer.
    """
    # Accept any iterable of lines; header and entry reads must share one
    # iterator so that they advance through the trace together.
    trace = iter(trace)
    logical_files = 0
    physical_files = 0
    total_names = 0
    total_suffix = 0
    total_same_name = 0
    total_same_suffix = 0
    total_pair_num = 0
    for header in trace:
        fnum = int(header.split()[1])
        physical_files += 1
        logical_files += fnum
        entries = [entry.split() for entry in itertools.islice(trace, fnum)]
        names = [fields[3] for fields in entries]
        sufs = [fields[-1] for fields in entries]
        same_name, same_suffix, pair_num = check_each_identical_pair(sufs, names)
        total_same_name += same_name
        total_same_suffix += same_suffix
        total_pair_num += pair_num
        # Distinct names / suffixes are counted per group, then summed.
        total_names += len(set(names))
        total_suffix += len(set(sufs))

    # No pairs exist when the trace is empty or every group holds a single
    # entry; report 0 for both fractions instead of dividing by zero.
    if total_pair_num:
        name_fraction = 1.0 * total_same_name / total_pair_num
        suffix_fraction = 1.0 * total_same_suffix / total_pair_num
    else:
        name_fraction = suffix_fraction = 0.0

    report = sys.stderr.write
    report("%10s %10s %10s %10s\n" % ("Logical", "Physical", "Names", "Suffix"))
    report("%10d %10d %10d %10d\n" % (logical_files, physical_files, total_names, total_suffix))
    report("Probability of same name and same type: %10f and %10f in %d pairs\n" % (
        name_fraction, suffix_fraction, total_pair_num))
def get_popular_types(trace):
    """Print the ten most common suffixes by entry count and by total size.

    The trace is read as a sequence of groups: one header line whose second
    whitespace-separated field is an integer count, followed by that many
    entry lines.  The last field of an entry line is taken as its suffix and
    the third field as its integer size.

    Two rankings are printed to stdout, each as a list of at most ten
    ``(value, suffix)`` tuples in descending order: first by number of
    entries ("In number:"), then by summed size ("In size:").  Ties on the
    value are broken by descending suffix.

    Args:
        trace: iterable of text lines (for example an open file).

    Raises:
        IndexError: if a header or entry line has too few fields.
        ValueError: if a count or size field is not an integer.
    """
    # Accept any iterable of lines; header and entry reads must share one
    # iterator so that they advance through the trace together.
    trace = iter(trace)
    # suffix -> [number of entries, total size]
    stats = {}
    for header in trace:
        fnum = int(header.split()[1])
        for entry in itertools.islice(trace, fnum):
            fields = entry.split()
            suffix = fields[-1]
            size = int(fields[2])
            if suffix in stats:
                stats[suffix][0] += 1
                stats[suffix][1] += size
            else:
                stats[suffix] = [1, size]

    def top_ten(column):
        # One sort over all suffixes, then keep the first ten; this gives
        # the same ranking as re-sorting and trimming after every insert.
        ranked = sorted(((stat[column], suf) for suf, stat in stats.items()),
                        reverse=True)
        return ranked[:10]

    print("In number:")
    print(top_ten(0))
    print("In size:")
    print(top_ten(1))
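# Analyse a trace of identical-file groups.
# Usage: file_identical_check.py [-n] [-p] [-s] [-r] TRACE_FILE
# -n check how often files in a group share a name or a type (suffix)
# -s output the file size distribution
# -p output the most popular suffixes
# -r output the distribution of the number of files per group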
if __name__ == "__main__":
    # Map each command-line flag to the analysis it runs.
    analyses = {
        "-n": check_name_and_type,
        "-p": get_popular_types,
        "-s": get_file_size,
        "-r": get_file_refs_distribution,
    }
    usage = "usage: %s [-n] [-p] [-s] [-r] TRACE_FILE\n" % sys.argv[0]
    try:
        (opts, args) = getopt.gnu_getopt(sys.argv[1:], "npsr")
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)
    if not args:
        # Without a trace path there is nothing to open.
        sys.stderr.write(usage)
        sys.exit(2)
    # The with-block closes the trace even if an analysis raises.
    with open(args[0], "r") as trace:
        for o, _ in opts:
            # Every analysis reads the trace to its end, so rewind before
            # each one; otherwise a second flag would see an empty trace.
            trace.seek(0)
            analyses[o](trace)