From b4a91d07924867cdd8c1e859b3f9f5998a74cbba Mon Sep 17 00:00:00 2001
From: dknoma
Date: Tue, 12 May 2020 15:55:28 -0700
Subject: [PATCH] dataset

---
 data/combine.py | 19 +++++++++++++++++++
 data/reduce.py  | 11 +++++++++++
 dataset.html    | 13 +++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 data/combine.py
 create mode 100644 data/reduce.py

diff --git a/data/combine.py b/data/combine.py
new file mode 100644
index 0000000..186cb68
--- /dev/null
+++ b/data/combine.py
@@ -0,0 +1,19 @@
+import pandas as pd
+# NOTE: error_bad_lines was removed in pandas 2.0; use on_bad_lines='skip' there.
+df1 = pd.read_csv('steam_games_titles.csv', error_bad_lines=False)
+df2 = pd.read_csv('steam_reviews.csv')
+
+reviewed_titles = df2['title'].drop_duplicates().to_numpy()
+# Keep only the games whose title appears in the reviews file.
+df1 = df1.loc[df1['title'].isin(reviewed_titles)]
+
+df1.drop(["url", "languages", "popular_tags", "game_details", "release_date", "developer", "publisher", "game_description", "achievements", "mature_content", "recent_reviews", "minimum_requirements", "recommended_requirements", "types", "desc_snippet", "original_price", "discount_price"], axis=1, inplace=True)
+df2.drop(["funny", "is_early_access_review", "review"], axis=1, inplace=True)
+
+# read_csv loads date_posted as plain text, so convert it before using .dt.
+df2 = df2[pd.to_datetime(df2['date_posted']).dt.year != 2019]
+
+# Inner-join each remaining game to its reviews on the shared title column.
+df_final = df1.merge(df2, left_on='title', right_on='title')
+
+df_final.to_csv('steam_combined_final.csv')
\ No newline at end of file
diff --git a/data/reduce.py b/data/reduce.py
new file mode 100644
index 0000000..1e2347c
--- /dev/null
+++ b/data/reduce.py
@@ -0,0 +1,11 @@
+import pandas as pd
+# NOTE: error_bad_lines was removed in pandas 2.0; use on_bad_lines='skip' there.
+df1 = pd.read_csv('steam_games_titles.csv', error_bad_lines=False)
+df2 = pd.read_csv('steam_reviews.csv')
+title = df2['title'].drop_duplicates().to_numpy()
+# Keep only the games whose title appears in the reviews file.
+df1 = df1.loc[df1['title'].isin(title)]
+
+df1.drop(["url", "all_reviews", "genre", "languages", "popular_tags", "game_details", "release_date", "developer", "publisher", "game_description", "achievements", "mature_content", "recent_reviews", "minimum_requirements", "recommended_requirements", "types", "desc_snippet", "original_price", "discount_price"], axis=1, inplace=True)
+
+df1.to_csv('steam_reduced_final.csv')
\ No newline at end of file
diff --git a/dataset.html b/dataset.html
index 2d85da4..1ed4a68 100644
--- a/dataset.html
+++ b/dataset.html
@@ -143,6 +143,19 @@

Steam Games Complete Dataset

  • The title of the game.
  • + +
    +

    + "This data is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: www.opendatacommons.org/licenses/pddl/1.0/" +

    + Resource retrieved from + https://www.kaggle.com/trolukovich/steam-games-complete-dataset, accessed May 5, 2020 +
    +
    +

    Note

    +

    I included the reduced and combined datasets (steam_combined_final.csv and + steam_reduced_with_genre_no_reviews.json). Also included are the scripts made to do the wrangling.

    +