-
Notifications
You must be signed in to change notification settings - Fork 5
/
data_cleaning.py
86 lines (72 loc) · 1.93 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# %%
# from numpy.core.fromnumeric import size
import pandas as pd
# Load the raw scrape and keep only rows whose "Salary Estimate" is not the
# string "-1" (presumably the scraper's missing-value marker - confirm).
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = pd.read_csv("glassdoor_jobs.csv")
df = df[df["Salary Estimate"] != "-1"].copy()
df
# TO DO IN DATA CLEANING
# -------------------
# salary parsing
# %%
# Keep only the text in front of the first "(" of every salary estimate.
salary = df["Salary Estimate"].apply(lambda x: x.split("(")[0])
# Remove the "K" suffix, the rupee sign and thousands separators; the code
# below expects what is left to have the form "<min>-<max>".
minus_Kd = salary.apply(
    lambda x: x.replace("K", "").replace("₹", "").replace(",", ""))
# Peek at the first remaining row by position. A label lookup (minus_Kd[0])
# raises KeyError whenever the row labelled 0 is no longer in the frame.
minus_Kd.iloc[0]
# Lower and upper bound of the range as integers (int() raises ValueError on
# anything that is not a plain number).
df["min_salary"] = minus_Kd.apply(lambda x: int(x.split("-")[0]))
df["max_salary"] = minus_Kd.apply(lambda x: int(x.split("-")[1]))
# Midpoint of the two bounds.
df["average-salary"] = (df["min_salary"] + df["max_salary"]) / 2
# Constant label stored alongside the salary columns.
df["currency"] = "LAKh"
df
# company name text only
# %%
# Keep just the first line of each "Company Name" value; everything from the
# first newline onwards is discarded.
company_names = df["Company Name"]
df["company_txt"] = company_names.apply(lambda name: name.partition("\n")[0])
df
# state field
# %%
# Count how many rows share each Location value.
df["Location"].value_counts()
# %%
# Two ways to delete an undesired column from the data frame:
# 1. del df["Headquarters"]
# 2. df = df.drop(columns="Headquarters")
# The `columns=` keyword is used because passing the axis positionally
# (df.drop(label, 1)) raises TypeError on pandas 2.0 and later.
df = df.drop(columns="Competitors")
df
# age of company
# %%
# Founded values below 1 are passed through unchanged; every other value is
# converted to an age relative to the year 2020.
founded = df["Founded"]
df["age"] = founded.apply(lambda year: 2020 - year if year >= 1 else year)
df
# parsing of job description (PYTHON)
# %%
# Keyword flag over all job descriptions: 1 when the lower-cased description
# contains "analysis", 0 otherwise.
# NOTE(review): the section header mentions Python, but the keyword actually
# checked is "analysis" - confirm which one is intended.
descriptions = df["Job Description"]
df["analysis"] = descriptions.apply(
    lambda text: int("analysis" in text.lower()))
df["analysis"].value_counts()
# %%
# Show the first remaining job description. Positional .iloc[0] is used
# because a label lookup ([0]) raises KeyError whenever the row labelled 0
# was removed by the "Salary Estimate" filter at the top of the script.
df["Job Description"].iloc[0]
# Draft (disabled): flag rows whose salary estimate mentions "per hour".
# df["hourly"] = df["Salary Estimate"].apply(lambda x: 1
# if "per hour" in x.lower() else 0)
# df
# %%
# Display the current state of the frame.
df
# Draft (disabled): flag rows whose salary estimate mentions
# "employer provided".
# df["employer_provided"] = df["Salary Estimate"].apply(lambda x: 1
# if "employer provided" in x.lower() else 0)
# df
# Draft (disabled): strip the "per hour" / "employer provided salary:" text
# from the salary strings.
# min_hr = minus_Kd.apply(lambda x: x.lower().replace("per hour", '').replace('employer provided salary:', ''))
# %%
# *df cleaned*
# Export step: alias the cleaned frame, write it to CSV without the index
# column, then read the written file back.
CLEANED_CSV = "GL_sal_data_cleaned.csv"
df_out = df
df_out
# %%
df_out.to_csv(CLEANED_CSV, index=False)
# %%
pd.read_csv(CLEANED_CSV)
# %%