-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_data.py
81 lines (70 loc) · 2.29 KB
/
split_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sklearn.model_selection as ms
import pandas as pd
import numpy as np
import os
# Root directory of the dataset. getTrainDataset() reads '<DIR>/train.csv' and
# builds image paths under '<DIR>/train/'. The value already ends with '/', and
# the suffixes appended to it start with '/', so the resulting paths contain a
# double slash.
DIR="D:/cells/";
def getTrainDataset(base_dir=None, num_classes=28):
    """Load image paths and multi-hot label vectors from ``train.csv``.

    The CSV must have an ``Id`` column (image name) and a ``Target`` column
    holding space-separated integer class indices.

    Args:
        base_dir: Dataset root containing ``train.csv`` and the ``train/``
            image folder. Defaults to the module-level ``DIR``.
        num_classes: Width of each label vector. Defaults to 28.

    Returns:
        A ``(paths, labels)`` tuple: ``paths`` is a 1-D string array with one
        entry per CSV row, ``labels`` is a float array of shape
        ``(rows, num_classes)`` with 1 at every class index listed in
        ``Target`` and 0 elsewhere.

    Raises:
        IndexError: If a class index in ``Target`` is >= ``num_classes``.
        ValueError: If a ``Target`` token is not an integer.
    """
    if base_dir is None:
        base_dir = DIR
    base_dir = os.fspath(base_dir)

    # Plain concatenation (rather than normalising the path) keeps the
    # returned strings identical to what this function has always produced,
    # so they still match any previously saved path lists.
    path_to_train = base_dir + '/train/'

    # Force string dtype: without it a file whose rows all carry a single
    # label is parsed as integers and the .str accessor below fails.
    data = pd.read_csv(base_dir + '/train.csv',
                       dtype={'Id': str, 'Target': str})

    paths = [os.path.join(path_to_train, name) for name in data['Id']]

    # Pre-allocating keeps the (rows, num_classes) shape even for an empty CSV.
    labels = np.zeros((len(data), num_classes))
    for row, keys in enumerate(data['Target'].str.split(' ')):
        labels[row, [int(key) for key in keys]] = 1

    return np.array(paths, dtype=str), labels
def split(X, y):
    """Split a multi-label dataset into train and test parts.

    Two modes, chosen by whether ``./train.txt`` exists in the working
    directory:

    * File mode: every sample whose ``X`` entry appears as a (stripped) line
      of ``./train.txt`` goes to train; every other sample goes to test.
    * Random mode: classes are visited from rarest to most frequent. The
      samples of each class are shuffled and handed out so that, per class,
      roughly one sample goes to test for every five that go to train. A
      sample already placed by an earlier class keeps its side. Samples with
      no positive label are placed in neither subset.

    Args:
        X: 1-D NumPy array of sample identifiers (e.g. image paths).
        y: 2-D NumPy array of shape ``(samples, classes)``; an entry > 0
            marks the class as present.

    Returns:
        ``([trainX, trainY], [testX, testY])``. Rows of each subset are in
        ascending order of their original index.

    Side effects:
        Seeds NumPy's global random generator with 12 and prints per-class
        counts to stdout.
    """
    n_samples, n_classes = y.shape

    # Count positives per class so the rarest classes are distributed first.
    class_sizes = []
    for c in range(n_classes):
        size = len(np.where(y[:, c] > 0)[0])
        class_sizes.append(size)
        print(size, c)
    rarest_first = np.argsort(np.array(class_sizes))

    train_set = set()
    test_set = set()

    # NOTE: this reseeds the process-wide generator; kept because callers may
    # rely on the resulting reproducibility after this function returns.
    np.random.seed(12)

    if os.path.exists("./train.txt"):
        # A set makes each membership test O(1) instead of scanning a list.
        with open("./train.txt", "r") as f:
            train_names = {line.strip() for line in f}
        for v in range(n_samples):
            if X[v] in train_names:
                train_set.add(v)
            else:
                test_set.add(v)
    else:
        for cls in rarest_first:
            members = np.where(y[:, cls] > 0)[0]
            np.random.shuffle(members)
            test = 0
            train = 0
            for idx in members:
                idx = int(idx)
                if test * 5 < train:
                    # Test side is under-filled for this class.
                    if idx not in train_set:
                        test_set.add(idx)
                        test += 1
                    else:
                        # Already a training sample via an earlier class.
                        train += 1
                else:
                    if idx not in test_set:
                        train_set.add(idx)
                        train += 1
                    else:
                        # Already a test sample via an earlier class.
                        test += 1
            print(test, train)

    # Explicit integer dtype so an empty subset still yields a valid index,
    # and a plain 1-D index array so X keeps its 1-D shape.
    train_idx = np.array(sorted(train_set), dtype=np.intp)
    test_idx = np.array(sorted(test_set), dtype=np.intp)

    trainX, trainY = X[train_idx], y[train_idx, :]
    testX, testY = X[test_idx], y[test_idx, :]
    return ([trainX, trainY], [testX, testY])