-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_preprocess.py
111 lines (90 loc) · 4.2 KB
/
data_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
def load_data(file_path, sep=',', header=None):
"""
Load the dataset from a CSV file.
Parameters:
file_path (str): Path to the CSV file.
sep (str): Separator used in the CSV file. Default is ','.
header (int or None): Row number to use as the column names. Defaults to None.
Returns:
DataFrame: A pandas DataFrame containing the loaded data.
"""
data = pd.read_csv(file_path, sep=sep, header=header).astype(float)
return data
def check_and_clean_data(data):
"""
Check for null and infinite values in the dataset and remove rows with such values.
Parameters:
data (DataFrame): The dataset to check and clean.
Returns:
DataFrame: The cleaned dataset.
"""
# Check for and handle infinite values by removing rows
if np.isinf(data.values).any():
print("The dataset contains infinite values.")
data = data.replace([np.inf, -np.inf], np.nan) # Optionally replace infinities with NaN
# Now remove rows with NaN values, which include those previously marked as infinities
data = data.dropna()
initial_row_count = data.shape[0]
# Further clean if there are any remaining null values
data = data.dropna()
cleaned_row_count = data.shape[0]
print(f"Removed {initial_row_count - cleaned_row_count} rows with null or infinite values.")
return data
def split_data(data, target_column=0, split=False):
"""
Optionally split the dataset into features and target.
Parameters:
data (DataFrame): The dataset to split.
target_column (int): The index of the target column.
split (bool): Whether to split the dataset into features and targets.
Returns:
DataFrame or (DataFrame, Series): If split is True, returns a tuple containing
the features DataFrame and the target Series.
If split is False, returns the unmodified dataset.
"""
if split:
target = data.loc[:, target_column]
features = data.drop(columns=[target_column])
return features, target
else:
return data
def preprocess_data(file_path, sep=',', header=None, target_column=0, split=False):
"""
Complete preprocessing workflow for loading, checking, cleaning,
and optionally splitting a time series dataset from a CSV file.
Parameters:
file_path (str): Path to the CSV file.
sep (str): Separator used in the CSV file. Default is ','.
header (int or None): Row number to use as the column names, defaults to None.
target_column (int): The index of the target column (only used if split is True).
split (bool): Whether to split the dataset into features and targets.
Returns:
DataFrame or (DataFrame, Series): Depending on the split parameter, returns
either the cleaned dataset or a tuple with
features and targets.
"""
data = load_data(file_path, sep=sep, header=header)
data = check_and_clean_data(data)
if split:
return split_data(data, target_column=target_column, split=True)
else:
return data
def prepare_for_conv1d_training(features, labels, num_classes):
"""
Prepares the dataset for Conv1D training: reshapes features and one-hot encodes labels.
Parameters:
features (DataFrame or ndarray): The features of the dataset.
labels (Series or ndarray): The labels of the dataset.
num_classes (int): The total number of classes in the dataset.
Returns:
tuple: A tuple containing the reshaped features and one-hot encoded labels.
"""
# Reshape input data to be 3D [samples, timesteps, features] for Conv1D
X_reshaped = np.expand_dims(features, axis=2)
# Adjust labels to be zero-based if not already and one-hot encode
labels_adjusted = labels - 1 # Adjust labels if they're not zero-based
y_one_hot = to_categorical(labels_adjusted, num_classes=num_classes)
return X_reshaped, y_one_hot