forked from haruharuru16/abc-xyz-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpandera_validate.py
98 lines (81 loc) · 3.23 KB
/
pandera_validate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandera as pa
import pandas as pd
from abc_functions import abc_classification, load_data, filter_dataset
from xyz_functions import xyz_classifier
#---------- Validation 1 ----------#
"""
Validating functions:
(1) load_data(data.csv) -> loading csv data
(2) filter_dataset(parameters) -> filtering dataset and converting Date attribute into datetime datatype
"""
# Loading the data
data = load_data('Product Demand 6 Months.csv')
dataset = filter_dataset(data, 'Last 3 Months')
# Taking a subsample
data_sample = dataset.sample(n=20, random_state=999)
# defining the schema
schema_1 = pa.DataFrameSchema({
# this column should be a string and not null
"Product_Code": pa.Column(pa.String, nullable=False),
# this column should be a datetime and not null
"Date": pa.Column(pa.DateTime, nullable=False),
# this column should be an integer and not null
"Order_Demand": pa.Column(pa.Int,
nullable=False,
checks=pa.Check(lambda s: (s >= 0)))
})
# Validating the data
print(schema_1.validate(data_sample, lazy=True))
#---------- End of Validation 1 ----------#
print("--------------------")
#---------- Validation 2 ----------#
"""
Validating functions:
abc_classification(parameters) -> classifying products based on demand volume
"""
# making abc classification
data_abc, abc_class = abc_classification(dataset, 5, 15)
# taking sample
data_abc_sample = data_abc.sample(n=20, random_state=999)
# defining the schema
schema_2 = pa.DataFrameSchema({
# this column should be a string and not null
"Product_Code": pa.Column(pa.String,
nullable=False,
checks=pa.Check(lambda s: s.duplicated().sum() == 0)), # make sure that all product code is unique from each other
"Order_Demand": pa.Column(pa.Int,
nullable=False,
checks=pa.Check(lambda s: (s >= 0))),
"rank": pa.Column(pa.Int, nullable=False),
"total": pa.Column(pa.Int, nullable=False),
"rank_cumsum": pa.Column(pa.Float, nullable=False),
"class": pa.Column(pa.String, nullable=False)
})
# validating the data
print(schema_2(data_abc_sample, lazy=True))
#---------- End of Validation 2 ----------#
print("--------------------")
#---------- Validation 3 ----------#
"""
Validating functions:
xyz_classifier(parameters)
"""
# making xyz classification
data_xyz, xyz_monthly, xyz_demand = xyz_classifier(dataset, 10, 15)
# taking sample
data_xyz_sample = data_xyz.sample(n=20, random_state=999)
# defining the schema
schema_3 = pa.DataFrameSchema({
"Product_Code": pa.Column(pa.String,
nullable=False,
checks=pa.Check(lambda s: s.duplicated().sum() == 0)), # make sure that all product code is unique from each other
"std": pa.Column(pa.Float, nullable=False),
"total": pa.Column(pa.Float, nullable=False),
"avg": pa.Column(pa.Float, nullable=False),
"cov": pa.Column(pa.Float, nullable=False),
"total_product": pa.Column(pa.Int, nullable=False),
"rank_cumsum": pa.Column(pa.Float, nullable=False),
"xyz_class": pa.Column(pa.String, nullable=False)
})
# validating the data
print(schema_3(data_xyz_sample, lazy=True))