-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
74 lines (56 loc) · 2.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
print("Loading data. . .")
train = pd.read_csv("train.csv")
# Separate target
y = train['is_fraud']
X = train.drop('is_fraud', axis=1)
def extract_time_features(df):
# Combine date and time into a single datetime for easier feature extraction
dt = pd.to_datetime(df['trans_date'] + ' ' + df['trans_time'])
df['trans_hour'] = dt.dt.hour
df['trans_dayofweek'] = dt.dt.dayofweek
return df
X = extract_time_features(X)
# Drop columns that are not likely useful or are too unique (IDs, names, etc.)
cols_to_drop = ['id', 'trans_num', 'trans_date', 'trans_time', 'dob', 'first', 'last', 'street', 'merchant']
X = X.drop(cols_to_drop, axis=1, errors='ignore')
# Identify categorical and numerical features
numeric_features = ['unix_time', 'amt', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'trans_hour', 'trans_dayofweek']
categorical_features = ['category', 'gender', 'city', 'state', 'job']
numeric_features = [f for f in numeric_features if f in X.columns]
categorical_features = [f for f in categorical_features if f in X.columns]
print("Building preprocessing Pipelines. . .")
# Preprocessing pipeline for numeric and categorical data
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features),
],
remainder='drop' # Drop any columns not specified
)
print("Building model. . .")
model = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier(n_estimators=100, random_state=42, verbose=True))])
print("Training model. . .")
model.fit(X, y)
test = pd.read_csv("test.csv")
test_ids = test['id'].copy()
test = extract_time_features(test)
test = test.drop(cols_to_drop, axis=1, errors='ignore')
predictions = model.predict(test)
submission = pd.DataFrame({'id': test_ids, 'is_fraud': predictions})
submission.to_csv("submission.csv", index=False)
print("Submission file created: submission.csv")