-
Notifications
You must be signed in to change notification settings - Fork 0
/
my_csv_dataset.py
173 lines (140 loc) · 5.34 KB
/
my_csv_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# -*- coding: utf-8 -*-
"""
A simple general csv dataset wrapper for pylearn2.
Can do automatic one-hot encoding based on labels present in a file.
"""
__authors__ = "Zygmunt Zając"
__copyright__ = "Copyright 2013, Zygmunt Zając"
__credits__ = ["Zygmunt Zając", "Nicholas Dronen"]
__license__ = "3-clause BSD"
__maintainer__ = "?"
__email__ = "[email protected]"
import csv
import numpy as np
import os
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.utils import serial
from pylearn2.utils.string_utils import preprocess
class MyCSVDataset(DenseDesignMatrix):
    """A generic pylearn2 dataset backed by a numeric CSV file.

    Labels, if present, must be in the *last* column (the stock pylearn2
    CSVDataset expects them in the first column). Each label is converted
    with ``int(label) - 1``, so 1-based labels in the file become 0-based
    class indices.

    If there are no labels, set expect_labels to False.
    If there is no header line in your file, set expect_headers to False.

    Parameters
    ----------
    path : str
        The path to the CSV file. It is passed through pylearn2's
        ``preprocess`` before loading and must end in ``.csv``.
    task : str
        The type of task in which the dataset will be used -- either
        "classification" or "regression". For classification with labels,
        ``y_labels`` (``max(y) + 1``) is passed to ``DenseDesignMatrix``;
        for regression it is not. In both cases the target is stored as a
        column of shape ``(n_rows, 1)``.
    expect_labels : bool
        Whether the CSV file contains a target variable in the last column.
    expect_headers : bool
        Whether the CSV file contains a header line (skipped when loading).
    delimiter : str
        The CSV file's delimiter.
    start : int
        Zero-based index of the first data row to load (header excluded).
    stop : int
        Index one past the last data row to load (slice semantics).
    start_fraction : float
        The fraction of rows, starting at the beginning of the file, to load.
        Must be strictly between 0 and 1.
    end_fraction : float
        The fraction of rows, starting at the end of the file, to load.
        Must be strictly between 0 and 1.
    """

    def __init__(self,
                 path='train.csv',
                 task='classification',
                 expect_labels=True,
                 expect_headers=True,
                 delimiter=',',
                 start=None,
                 stop=None,
                 start_fraction=None,
                 end_fraction=None):
        """Validate the options, load the CSV file and build the dataset.

        Raises
        ------
        ValueError
            If ``task`` is not a supported value, if a fraction is not
            strictly between 0 and 1, or if row-index options
            (``start``/``stop``) are combined with fraction options, or
            both fraction options are given.
        """
        self.path = path
        self.task = task
        self.expect_labels = expect_labels
        self.expect_headers = expect_headers
        self.delimiter = delimiter
        self.start = start
        self.stop = stop
        self.start_fraction = start_fraction
        self.end_fraction = end_fraction

        self.view_converter = None

        if task not in ['classification', 'regression']:
            raise ValueError('task must be either "classification" or '
                             '"regression"; got ' + str(task))

        if start_fraction is not None:
            if end_fraction is not None:
                raise ValueError("Use start_fraction or end_fraction, "
                                 " not both.")
            if start_fraction <= 0:
                raise ValueError("start_fraction should be > 0")
            if start_fraction >= 1:
                raise ValueError("start_fraction should be < 1")

        if end_fraction is not None:
            if end_fraction <= 0:
                raise ValueError("end_fraction should be > 0")
            if end_fraction >= 1:
                raise ValueError("end_fraction should be < 1")

        uses_fraction = start_fraction is not None or end_fraction is not None
        if start is not None and uses_fraction:
            raise ValueError("Use start, start_fraction, or end_fraction,"
                             " just not together.")
        if stop is not None and uses_fraction:
            raise ValueError("Use stop, start_fraction, or end_fraction,"
                             " just not together.")

        # and go
        self.path = preprocess(self.path)
        X, y = self._load_data()

        # Without labels there is nothing to count classes from, so an
        # unlabeled classification set (e.g. a test file) is built the same
        # way as a regression set instead of crashing on np.max(None).
        if self.task == 'regression' or y is None:
            super(MyCSVDataset, self).__init__(X=X, y=y)
        else:
            # NOTE(review): the class count comes from the loaded subset, so
            # a subset that lacks the highest label yields a smaller
            # y_labels than the full file would -- confirm this is intended.
            super(MyCSVDataset, self).__init__(X=X, y=y,
                                               y_labels=np.max(y) + 1)

    def _load_data(self):
        """Read the CSV file and split it into features and targets.

        Returns
        -------
        X : ndarray
            Two-dimensional array of feature columns for the selected rows.
        y : ndarray or None
            Integer targets of shape ``(n_rows, 1)`` taken from the last
            column, or None when ``expect_labels`` is False.
        """
        assert self.path.endswith('.csv')

        # ndmin=2 keeps single-row and single-column files two-dimensional,
        # so the column slicing below works for them as well.
        data = np.loadtxt(self.path,
                          delimiter=self.delimiter,
                          skiprows=1 if self.expect_headers else 0,
                          ndmin=2)

        if self.expect_labels:
            # Python's int() is used on purpose: it raises on NaN/inf labels
            # instead of silently producing a garbage integer.
            # NOTE(review): this truncation and "- 1" shift is applied to
            # regression targets too -- confirm that is intended.
            y = np.array([int(elem) - 1 for elem in data[:, -1]])
            X = data[:, :-1]
            y = y.reshape((y.shape[0], 1))
        else:
            X = data
            y = None

        return self._take_subset(X, y)

    def _take_subset(self, X, y):
        """Return the rows of X (and y, if not None) chosen by the options.

        Precedence is start_fraction, then end_fraction, then start/stop;
        with none of them set the inputs are returned unchanged.
        """
        n_rows = X.shape[0]

        if self.start_fraction is not None:
            rows = slice(0, int(self.start_fraction * n_rows))
        elif self.end_fraction is not None:
            rows = slice(int((1 - self.end_fraction) * n_rows), None)
        elif self.start is not None or self.stop is not None:
            # Honour `stop` even when `start` was left at None.
            rows = slice(self.start, self.stop)
        else:
            return X, y

        X = X[rows, :]
        if y is not None:
            y = y[rows]
        return X, y