Skip to content

Commit

Permalink
<fix> recoding of values in related category (2-->0)
Browse files Browse the repository at this point in the history
fixes issue #1
  • Loading branch information
msrlab committed Sep 23, 2019
1 parent 83244dd commit 5de58b6
Showing 1 changed file with 9 additions and 9 deletions.
18 changes: 9 additions & 9 deletions data/process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def load_data(messages_filepath, categories_filepath):
#merge
df = messages.merge(categories, on = 'id')
return df



def clean_data(df):
Expand All @@ -38,20 +38,20 @@ def clean_data(df):
# convert column from string to numeric
categories[column] = pd.to_numeric(categories[column])
# in column related, replace 2 with 0 (was replaced with 1 before this fix)
categories['related'].replace(2,1, inplace=True)
categories['related'].replace(2,0, inplace=True)
#remove "child_alone" column because there is no positive case
categories.drop(columns=['child_alone'], inplace=True)
# drop the original categories column from `df`
df.drop(columns='categories',inplace=True)
# concatenate the original dataframe with the new `categories` dataframe
df = pd.concat([df,categories],axis=1)

# check number of duplicates
# convert the 'message' and 'original' columns to string
df['original'] = df['original'].astype(str)
df['message'] = df['message'].astype(str)
# drop duplicates in 'message'
#df_dupl_removed =
#df_dupl_removed =
df_dupl = df[df.duplicated(subset='message')]
print(f"removing {df_dupl.shape[0]} duplicates in columns 'message'")
df_dupl_removed = df.copy()
Expand All @@ -74,7 +74,7 @@ def save_data(df, database_filename):
saves dataframe to sql database
'''
engine = create_engine('sqlite:///'+database_filename)
df.to_sql('messages', engine, index=False)
df.to_sql('messages', engine, index=False)


def main():
Expand All @@ -88,12 +88,12 @@ def main():

print('Cleaning data...')
df = clean_data(df)

print('Saving data...\n DATABASE: {}'.format(database_filepath))
save_data(df, database_filepath)

print('Cleaned data saved to database!')

else:
print('Please provide the filepaths of the messages and categories '\
'datasets as the first and second argument respectively, as '\
Expand All @@ -104,4 +104,4 @@ def main():


if __name__ == '__main__':
main()
main()

0 comments on commit 5de58b6

Please sign in to comment.