From b4a91d07924867cdd8c1e859b3f9f5998a74cbba Mon Sep 17 00:00:00 2001
From: dknoma <djknoma@gmail.com>
Date: Tue, 12 May 2020 15:55:28 -0700
Subject: [PATCH] dataset

---
 data/combine.py | 19 +++++++++++++++++++
 data/reduce.py  | 11 +++++++++++
 dataset.html    | 13 +++++++++++++
 3 files changed, 43 insertions(+)
 create mode 100644 data/combine.py
 create mode 100644 data/reduce.py
diff --git a/data/combine.py b/data/combine.py
new file mode 100644
index 0000000..186cb68
--- /dev/null
+++ b/data/combine.py
@@ -0,0 +1,19 @@
+"""Join Steam game metadata with its reviews on `title` and write one combined CSV."""
+import pandas as pd
+
+# NOTE(review): error_bad_lines=False drops malformed rows; pandas >= 1.3 spells it on_bad_lines='skip'.
+games = pd.read_csv('steam_games_titles.csv', error_bad_lines=False)
+reviews = pd.read_csv('steam_reviews.csv')
+
+# Keep only the games whose title appears at least once in the reviews table.
+games = games.loc[games['title'].isin(reviews['title'].unique())]
+
+# Drop the columns that are not carried into the combined dataset.
+games.drop(["url", "languages", "popular_tags", "game_details", "release_date", "developer", "publisher", "game_description", "achievements", "mature_content", "recent_reviews", "minimum_requirements", "recommended_requirements", "types", "desc_snippet", "original_price", "discount_price"], axis=1, inplace=True)
+reviews.drop(["funny", "is_early_access_review", "review"], axis=1, inplace=True)
+
+# read_csv loads date_posted as plain text, so parse it before using the .dt accessor.
+reviews = reviews[pd.to_datetime(reviews['date_posted']).dt.year != 2019]
+
+combined = games.merge(reviews, on='title')
+combined.to_csv('steam_combined_final.csv')
\ No newline at end of file
diff --git a/data/reduce.py b/data/reduce.py
new file mode 100644
index 0000000..1e2347c
--- /dev/null
+++ b/data/reduce.py
@@ -0,0 +1,11 @@
+"""Reduce the Steam games table to reviewed titles and write it without the dropped columns."""
+import pandas as pd
+
+games = pd.read_csv('steam_games_titles.csv', error_bad_lines=False)
+reviewed_titles = pd.read_csv('steam_reviews.csv')['title'].unique()
+
+# Keep only games whose title appears in the reviews file, then drop the listed columns.
+games = games.loc[games['title'].isin(reviewed_titles)]
+games.drop(["url", "all_reviews", "genre", "languages", "popular_tags", "game_details", "release_date", "developer", "publisher", "game_description", "achievements", "mature_content", "recent_reviews", "minimum_requirements", "recommended_requirements", "types", "desc_snippet", "original_price", "discount_price"], axis=1, inplace=True)
+
+games.to_csv('steam_reduced_final.csv')
\ No newline at end of file
diff --git a/dataset.html b/dataset.html
index 2d85da4..1ed4a68 100644
--- a/dataset.html
+++ b/dataset.html
@@ -143,6 +143,19 @@ <h1>Steam Games Complete Dataset</h1>
               <li>The title of the game.</li>
             </ul></li>
         </ol>
+
+        <blockquote>
+          <p>
+            "This data is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: <a href="http://www.opendatacommons.org/licenses/pddl/1.0/">www.opendatacommons.org/licenses/pddl/1.0/</a>"
+          </p>
+          <cite>Resource retrieved from <a href="https://www.kaggle.com/trolukovich/steam-games-complete-dataset">
+            https://www.kaggle.com/trolukovich/steam-games-complete-dataset</a>, accessed May 5, 2020</cite>
+        </blockquote>
+        <div>
+          <h3>Note</h3>
+          <p>I included the reduced and combined datasets (steam_combined_final.csv and
+            steam_reduced_with_genre_no_reviews.json). The scripts used to do the wrangling are also included.</p>
+        </div>
       </div>
       <!-- End page content -->
     </div>