From fb064219a1dae3fb3626c6b4a8feaec3fc99ad0d Mon Sep 17 00:00:00 2001 From: "myhema05@gmail.com" Date: Sun, 17 Mar 2024 11:58:17 +0530 Subject: [PATCH] dataset cleaned #7 --- data cleaning.ipynb | 461 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 461 insertions(+) create mode 100644 data cleaning.ipynb diff --git a/data cleaning.ipynb b/data cleaning.ipynb new file mode 100644 index 0000000..2f76c1b --- /dev/null +++ b/data cleaning.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1e50d82d", + "metadata": {}, + "source": [ + "# Importing libraries and the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9a47bc31", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e3072f38", + "metadata": {}, + "outputs": [], + "source": [ + "df=pd.read_csv('wines_SPA.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0ee54770", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
winerywineyearratingnum_reviewscountryregionpricetypebodyacidity
0Teso La MonjaTinto20134.958EspanaToro995.00Toro Red5.03.0
1ArtadiVina El Pison20184.931EspanaVino de Espana313.50Tempranillo4.02.0
2Vega SiciliaUnico20094.81793EspanaRibera del Duero324.95Ribera Del Duero Red5.03.0
3Vega SiciliaUnico19994.81705EspanaRibera del Duero692.96Ribera Del Duero Red5.03.0
4Vega SiciliaUnico19964.81309EspanaRibera del Duero778.06Ribera Del Duero Red5.03.0
....................................
7495ContinoReserva20164.2392EspanaRioja19.98Rioja Red4.03.0
7496Conreria d'Scala DeiLes Brugueres20184.2390EspanaPriorato16.76Priorat Red4.03.0
7497MustiguilloFinca Terrerazo20174.2390EspanaEl Terrerazo24.45Red4.03.0
7498MatarromeraGran Reserva20114.2389EspanaRibera del Duero64.50Ribera Del Duero Red5.03.0
7499Sei SoloPreludio20164.2388EspanaRibera del Duero31.63Ribera Del Duero Red5.03.0
\n", + "

7500 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " winery wine year rating num_reviews \\\n", + "0 Teso La Monja Tinto 2013 4.9 58 \n", + "1 Artadi Vina El Pison 2018 4.9 31 \n", + "2 Vega Sicilia Unico 2009 4.8 1793 \n", + "3 Vega Sicilia Unico 1999 4.8 1705 \n", + "4 Vega Sicilia Unico 1996 4.8 1309 \n", + "... ... ... ... ... ... \n", + "7495 Contino Reserva 2016 4.2 392 \n", + "7496 Conreria d'Scala Dei Les Brugueres 2018 4.2 390 \n", + "7497 Mustiguillo Finca Terrerazo 2017 4.2 390 \n", + "7498 Matarromera Gran Reserva 2011 4.2 389 \n", + "7499 Sei Solo Preludio 2016 4.2 388 \n", + "\n", + " country region price type body acidity \n", + "0 Espana Toro 995.00 Toro Red 5.0 3.0 \n", + "1 Espana Vino de Espana 313.50 Tempranillo 4.0 2.0 \n", + "2 Espana Ribera del Duero 324.95 Ribera Del Duero Red 5.0 3.0 \n", + "3 Espana Ribera del Duero 692.96 Ribera Del Duero Red 5.0 3.0 \n", + "4 Espana Ribera del Duero 778.06 Ribera Del Duero Red 5.0 3.0 \n", + "... ... ... ... ... ... ... \n", + "7495 Espana Rioja 19.98 Rioja Red 4.0 3.0 \n", + "7496 Espana Priorato 16.76 Priorat Red 4.0 3.0 \n", + "7497 Espana El Terrerazo 24.45 Red 4.0 3.0 \n", + "7498 Espana Ribera del Duero 64.50 Ribera Del Duero Red 5.0 3.0 \n", + "7499 Espana Ribera del Duero 31.63 Ribera Del Duero Red 5.0 3.0 \n", + "\n", + "[7500 rows x 11 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "de098392", + "metadata": {}, + "source": [ + "# checking for missing data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f65bfb0e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "winery 0.000000\n", + "wine 0.000000\n", + "year 0.026667\n", + "rating 0.000000\n", + "num_reviews 0.000000\n", + "country 0.000000\n", + "region 0.000000\n", + "price 0.000000\n", + "type 7.266667\n", + "body 15.586667\n", + "acidity 15.586667\n", + "dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().mean()*100" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f6be9860", + "metadata": {}, + "outputs": [], + "source": [ + "missing_columns = df.columns[df.isnull().any()].tolist()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6af543f5", + "metadata": {}, + "outputs": [], + "source": [ + "for col in missing_columns:\n", + " most_frequent_category = df[col].mode()[0]\n", + " df[col].fillna(most_frequent_category, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "365e3ad6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "winery 0.0\n", + "wine 0.0\n", + "year 0.0\n", + "rating 0.0\n", + "num_reviews 0.0\n", + "country 0.0\n", + "region 0.0\n", + "price 0.0\n", + "type 0.0\n", + "body 0.0\n", + "acidity 0.0\n", + "dtype: float64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().mean()*100" + ] + }, + { + "cell_type": "markdown", + "id": "d9ada2a3", + "metadata": {}, + "source": [ + "# checking for duplicate data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e01219e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "yes\n" + ] + } + ], + "source": [ + "duplicates_exist = df.duplicated().any()\n", + "\n", + "if duplicates_exist:\n", + " print(\"yes\")\n", + "else:\n", + " print(\"No\")" + ] + }, + { + "cell_type": "markdown", + "id": "c9d4c4aa", + "metadata": {}, + "source": [ + "# removing duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6e72b1f6", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop_duplicates(inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5d83f3b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No\n" + ] + } + ], + "source": [ + "duplicates_exist = df.duplicated().any()\n", + "\n", + "if duplicates_exist:\n", + " print(\"yes\")\n", + "else:\n", + " print(\"No\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}