-
Notifications
You must be signed in to change notification settings - Fork 3
/
makedata.py
65 lines (48 loc) · 1.73 KB
/
makedata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# this is PyFIM, available from http://www.borgelt.net/pyfim.html
from fim import fpgrowth
from numpy import loadtxt
# Routines taken from BRL_code.py (Reformatted for me.)
#Read in the .tab file -- generate sets of attributes and array of outcomes
def load_data(fname):
# Load sample attributes (one set of attributes per line)
with open(fname+'.tab','r') as fin:
A = fin.readlines()
data = []
for ln in A:
data.append(ln.split())
# Load outcomes (binary), one per line
Y = loadtxt(fname+'.Y')
if len(Y.shape) == 1:
Y = array([Y])
return data,Y
# Frequent itemset mining
# minsupport is an integer percentage (e.g. 10 for 10%)
# maxlhs is the maximum size of the lhs
def get_freqitemsets(fname, minsupport=10, maxlhs = 2):
# Load the data
data,Y = load_data(fname)
# Open output file
fout = open (fname+".out", "w")
# Now find frequent itemsets, mining separately for each class
data_pos = [x for i,x in enumerate(data) if Y[i,0]==0]
data_neg = [x for i,x in enumerate(data) if Y[i,0]==1]
assert len(data_pos)+len(data_neg) == len(data)
print "About to calculate positive itemsets"
itemsets = [r[0] for r in fpgrowth(data_pos,supp=minsupport,zmax=maxlhs)]
print "About to calculate negative itemsets"
itemsets.extend([r[0] for r in fpgrowth(data_neg,supp=minsupport,zmax=maxlhs)])
print "Done"
n_rules = len(itemsets)
# Now for each rule we want to write out a line of output
# containing the rule and a bit for each training sample
# indicating if the sample satisfies the rule or not.
for lhs in itemsets :
print lhs
fout.write(''.join(lhs) + '\t')
for (j, attrs) in enumerate(data) :
if set(lhs).issubset(attrs) :
fout.write('1 ')
else :
fout.write('0 ')
fout.write('\n')
fout.close()