From e57aa042932d2976942e7ecf0dfe63f613e1cd72 Mon Sep 17 00:00:00 2001 From: Diwakar Gupta <39624018+Diwakar-Gupta@users.noreply.github.com> Date: Tue, 3 May 2022 12:10:56 +0530 Subject: [PATCH] Pandas assignment_solution --- 22-04-30-Pandas/AssignmentSolution.ipynb | 2387 ++++++++++++++++++++++ 1 file changed, 2387 insertions(+) create mode 100644 22-04-30-Pandas/AssignmentSolution.ipynb diff --git a/22-04-30-Pandas/AssignmentSolution.ipynb b/22-04-30-Pandas/AssignmentSolution.ipynb new file mode 100644 index 0000000..e986046 --- /dev/null +++ b/22-04-30-Pandas/AssignmentSolution.ipynb @@ -0,0 +1,2387 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Excercise1_pandas_Solution.ipynb", + "provenance": [], + "collapsed_sections": [], + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "**import pandas as pd and numpy as np**" + ], + "metadata": { + "id": "zoRkvutsEsv9" + } + }, + { + "cell_type": "code", + "metadata": { + "id": "L8z0tBlXt1mC" + }, + "source": [ + "import pandas as pd\n", + "import numpy as np" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "read csv from this url https://raw.githubusercontent.com/Kokkalo4/Kaggle-SF-Salaries/master/Salaries.csv using pandas in df variable.\n", + "\n", + "\n", + "If any error is printed ignore that." 
+ ], + "metadata": { + "id": "1F1WQpscEhqQ" + } + }, + { + "cell_type": "code", + "metadata": { + "id": "LO98v31ct6iw", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2ad0cf2c-4687-4ffe-cdc1-318541613c69" + }, + "source": [ + "df = pd.read_csv('https://raw.githubusercontent.com/Kokkalo4/Kaggle-SF-Salaries/master/Salaries.csv')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (3,4,5,6,12) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZYHMdf1OuoCF" + }, + "source": [ + "**check the head of DataFrame**\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "id": "shypO4MfPuN9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 389 + }, + "outputId": "3f7b4068-4241-4442-c851-449ada3f7bce" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesAgencyStatus
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY1674110400184NaN567595.43567595.432011NaNSan FranciscoNaN
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)155966245132137811NaN538909.28538909.282011NaNSan FranciscoNaN
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)21273910608816452.6NaN335279.91335279.912011NaNSan FranciscoNaN
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC7791656120.7198307NaN332343.61332343.612011NaNSan FranciscoNaN
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)1344029737182235NaN326373.19326373.192011NaNSan FranciscoNaN
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " Id EmployeeName ... Agency Status\n", + "0 1 NATHANIEL FORD ... San Francisco NaN\n", + "1 2 GARY JIMENEZ ... San Francisco NaN\n", + "2 3 ALBERT PARDINI ... San Francisco NaN\n", + "3 4 CHRISTOPHER CHONG ... San Francisco NaN\n", + "4 5 PATRICK GARDNER ... San Francisco NaN\n", + "\n", + "[5 rows x 13 columns]" + ] + }, + "metadata": {}, + "execution_count": 3 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "3Hs476kddqd_", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 389 + }, + "outputId": "1b5511c0-ccaf-46bd-b495-d6fb0a622235" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdEmployeeNameJobTitleBasePayOvertimePayOtherPayBenefitsTotalPayTotalPayBenefitsYearNotesAgencyStatus
01NATHANIEL FORDGENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY1674110400184NaN567595.43567595.432011NaNSan FranciscoNaN
12GARY JIMENEZCAPTAIN III (POLICE DEPARTMENT)155966245132137811NaN538909.28538909.282011NaNSan FranciscoNaN
23ALBERT PARDINICAPTAIN III (POLICE DEPARTMENT)21273910608816452.6NaN335279.91335279.912011NaNSan FranciscoNaN
34CHRISTOPHER CHONGWIRE ROPE CABLE MAINTENANCE MECHANIC7791656120.7198307NaN332343.61332343.612011NaNSan FranciscoNaN
45PATRICK GARDNERDEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)1344029737182235NaN326373.19326373.192011NaNSan FranciscoNaN
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " Id EmployeeName ... Agency Status\n", + "0 1 NATHANIEL FORD ... San Francisco NaN\n", + "1 2 GARY JIMENEZ ... San Francisco NaN\n", + "2 3 ALBERT PARDINI ... San Francisco NaN\n", + "3 4 CHRISTOPHER CHONG ... San Francisco NaN\n", + "4 5 PATRICK GARDNER ... San Francisco NaN\n", + "\n", + "[5 rows x 13 columns]" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "q4guqLVUu3s4" + }, + "source": [ + "**use the info method to find out how many entries there are.**" + ] + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "id": "sCf-_N-7Pwpb", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c6c29e0f-57af-4aa2-98eb-20e8e6eaacab" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148049 non-null object \n", + " 4 OvertimePay 148654 non-null object \n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 38119 non-null object \n", + "dtypes: float64(3), int64(2), object(8)\n", + "memory usage: 14.7+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9ETc2lOqdrYN", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "47f965c7-4f81-4774-c628-0738d555daa4" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", 
+ "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148049 non-null object \n", + " 4 OvertimePay 148654 non-null object \n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 38119 non-null object \n", + "dtypes: float64(3), int64(2), object(8)\n", + "memory usage: 14.7+ MB\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The BasePay and OvertimePay fields are both of **object** type and contain **missing data**. Convert them to numeric using\n", + "\n", + " `pd.to_numeric(df[column_name], errors='coerce')`\n", + "\n", + " errors='coerce' will replace all non-numeric values with NaN\n", + "\n", + " Then check the datatypes using the info function." + ], + "metadata": { + "id": "55lbeP7q4Kt7" + } + }, + { + "cell_type": "code", + "source": [ + "df['BasePay'] = pd.to_numeric(df['BasePay'], errors = 'coerce')\n", + "df['OvertimePay'] = pd.to_numeric(df['OvertimePay'], errors = 'coerce')" + ], + "metadata": { + "id": "MCs3nTMEPzmA" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dSKsf7WbE3fP", + "outputId": "aa08dbf0-bc16-45f9-ae9a-804b29efb29d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null 
Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 38119 non-null object \n", + "dtypes: float64(5), int64(2), object(6)\n", + "memory usage: 14.7+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "yV_JbMJH4ds3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d9313c24-991e-42a1-8692-592958978065" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Notes 0 non-null float64\n", + " 11 Agency 148654 non-null object \n", + " 12 Status 38119 non-null object \n", + "dtypes: float64(5), int64(2), object(6)\n", + "memory usage: 14.7+ MB\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uMlELt6pvGRv" + }, + "source": [ + "**What is the average BasePay?**" + ] + }, + { + "cell_type": "code", + 
"metadata": { + "id": "TqFqU5v_dsYK", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "16db4a87-11b4-4f4a-961e-4df333b9a928" + }, + "source": [ + "df['BasePay'].mean()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "66325.44884050643" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_cb6KuDk2cwV", + "outputId": "6379652c-7589-4ccc-d2dc-8d1d4fc9bbf6" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "66325.44884050643" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nJWtU2L9zaDq" + }, + "source": [ + "**What is the highest amount of OvertimePay in the dataset?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "OCN9UhwqdtLk", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "6765ce4c-cd18-4741-f35b-8f4aacbd41b9" + }, + "source": [ + "df['OvertimePay'].max()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "245131.88" + ] + }, + "metadata": {}, + "execution_count": 12 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gtSR1ZAszYzT", + "outputId": "4786818d-923a-467b-e3c6-3493de21aba3" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "245131.88" + ] + }, + "metadata": {}, + "execution_count": 49 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rYI-Oix34iEi" + }, + "source": [ + "**What is the Job Title of JOSEPH DRISCOLL?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "abIYqLy1dt8Y", + "colab": 
{ + "base_uri": "https://localhost:8080/" + }, + "outputId": "830af414-0b08-48be-962f-7234932bd179" + }, + "source": [ + "df[df['EmployeeName'] == 'JOSEPH DRISCOLL']['JobTitle']" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "24 CAPTAIN, FIRE SUPPRESSION\n", + "Name: JobTitle, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 16 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nBWc2QjlzxXL", + "outputId": "0a8d83ea-4e4c-4aac-fc2d-a2e1fa9fd76a" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "24 CAPTAIN, FIRE SUPPRESSION\n", + "Name: JobTitle, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 55 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "655t31sq5FsP" + }, + "source": [ + "**How much does JOSEPH DRISCOLL make (including benefits)**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7y0404xYdvG-", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "2a0bba35-87a7-47ad-899c-d5d2933914fa" + }, + "source": [ + "df[df['EmployeeName'] == 'JOSEPH DRISCOLL']['TotalPayBenefits']" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "24 270324.91\n", + "Name: TotalPayBenefits, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kEi-kCQk5M1u", + "outputId": "458f15e4-6c62-403d-b433-09a8d457697b" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "24 270324.91\n", + "Name: TotalPayBenefits, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 59 + } + ] + }, + { 
+ "cell_type": "markdown", + "metadata": { + "id": "7S8IJPSq5xvV" + }, + "source": [ + "**What is the name of highest paid person**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vBk2SnBRdx13", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "0ecb0edc-4871-41a1-b906-283d35fdd045" + }, + "source": [ + "df[df['TotalPayBenefits'].max() == df['TotalPayBenefits']]['EmployeeName']" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 NATHANIEL FORD\n", + "Name: EmployeeName, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iwG4BhNb51Yn", + "outputId": "d45a2036-a7d5-4464-dae4-0e1de12e6c89" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0 NATHANIEL FORD\n", + "Name: EmployeeName, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3KPozhE66y_S" + }, + "source": [ + "**What is the name of lowest paid person (including benefits)?**\n", + "\n", + "find his row index then use iloc to access data" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "z1FpptNody88", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c12b9bcd-5166-4a0c-a55a-893d019b1215" + }, + "source": [ + "idx = np.where(df['TotalPayBenefits'].min() == df['TotalPayBenefits'])[0][0]\n", + "\n", + "df.iloc[idx]" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Id 148654\n", + "EmployeeName Joe Lopez\n", + "JobTitle Counselor, Log Cabin Ranch\n", + "BasePay 0\n", + "OvertimePay 0\n", + "OtherPay -618.13\n", + "Benefits 0.00\n", + "TotalPay -618.13\n", + "TotalPayBenefits -618.13\n", + "Year 
2014\n", + "Notes NaN\n", + "Agency San Francisco\n", + "Status PT\n", + "Name: 148653, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AS0x187B6fHH", + "outputId": "8ac24798-55f9-4315-d437-250b67a125d8" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Id 148654\n", + "EmployeeName Joe Lopez\n", + "JobTitle Counselor, Log Cabin Ranch\n", + "BasePay 0\n", + "OvertimePay 0\n", + "OtherPay -618.13\n", + "Benefits 0.00\n", + "TotalPay -618.13\n", + "TotalPayBenefits -618.13\n", + "Year 2014\n", + "Notes NaN\n", + "Agency San Francisco\n", + "Status PT\n", + "Name: 148653, dtype: object" + ] + }, + "metadata": {}, + "execution_count": 70 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "use info to find column with **0 non-null** value, then drop that column and print info()" + ], + "metadata": { + "id": "au-0Eu1M-UCv" + } + }, + { + "cell_type": "code", + "source": [ + "df.drop( columns = ['Notes'], inplace = True)\n", + "df.info()" + ], + "metadata": { + "id": "4V3T7O1lQFR6", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "9db7fc99-7c0a-462d-d0d9-16903ec65c8a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 
148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Agency 148654 non-null object \n", + " 11 Status 38119 non-null object \n", + "dtypes: float64(4), int64(2), object(6)\n", + "memory usage: 13.6+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "SZRxohSr-QFw", + "outputId": "414af7be-ef95-4cdc-8bce-243052f409e7", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 148654 entries, 0 to 148653\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Id 148654 non-null int64 \n", + " 1 EmployeeName 148654 non-null object \n", + " 2 JobTitle 148654 non-null object \n", + " 3 BasePay 148045 non-null float64\n", + " 4 OvertimePay 148650 non-null float64\n", + " 5 OtherPay 148654 non-null object \n", + " 6 Benefits 112495 non-null object \n", + " 7 TotalPay 148654 non-null float64\n", + " 8 TotalPayBenefits 148654 non-null float64\n", + " 9 Year 148654 non-null int64 \n", + " 10 Agency 148654 non-null object \n", + " 11 Status 38119 non-null object \n", + "dtypes: float64(4), int64(2), object(6)\n", + "memory usage: 13.6+ MB\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RojthL1u7hNf" + }, + "source": [ + "**What was the average (mean) BasePay of all employees per year?(2011, 2014)**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5MWZEkT1d1LV", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "outputId": "8946f74f-ccda-4127-d82e-097d715a31f8" + }, + "source": [ + "df.groupby('Year')[['BasePay']].mean().reset_index()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearBasePay
0201163595.956517
1201265436.406857
2201369630.030216
3201466564.421924
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " Year BasePay\n", + "0 2011 63595.956517\n", + "1 2012 65436.406857\n", + "2 2013 69630.030216\n", + "3 2014 66564.421924" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 173 + }, + "id": "LW1Fcpay7vza", + "outputId": "76443d97-b044-4963-dc83-8cf2dc4547c1" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearBasePay
0201163595.956517
1201265436.406857
2201369630.030216
3201466564.421924
\n", + "
" + ], + "text/plain": [ + " Year BasePay\n", + "0 2011 63595.956517\n", + "1 2012 65436.406857\n", + "2 2013 69630.030216\n", + "3 2014 66564.421924" + ] + }, + "metadata": {}, + "execution_count": 83 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8ODaELxwLwe8" + }, + "source": [ + "**How many unique job titles are there?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "_MUO7JTOd2sL", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "11085b78-b4c0-4341-9959-4aad388cac73" + }, + "source": [ + "# df['JobTitle'].unique().size\n", + "df['JobTitle'].nunique()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2159" + ] + }, + "metadata": {}, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b2U4RvnCLzXe", + "outputId": "9f78cde2-5eae-446f-f3ab-d55e067a2908" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "2159" + ] + }, + "metadata": {}, + "execution_count": 87 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gobw6W0qMHnZ" + }, + "source": [ + "**What are the top 5 most common jobs?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "KCwVAPxHd3-B", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "3739f319-7b67-49aa-fc94-ba898fd5323e" + }, + "source": [ + "df['JobTitle'].value_counts().head(5)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Transit Operator 7036\n", + "Special Nurse 4389\n", + "Registered Nurse 3736\n", + "Public Svc Aide-Public Works 2518\n", + "Police Officer 3 2421\n", + "Name: JobTitle, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 44 + } + ] + }, + { + "cell_type": "code", + "metadata": { + 
"colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XrcEvD3MMGN7", + "outputId": "9a266581-c392-44c6-8a03-84d0b6b51d41" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Transit Operator 7036\n", + "Special Nurse 4389\n", + "Registered Nurse 3736\n", + "Public Svc Aide-Public Works 2518\n", + "Police Officer 3 2421\n", + "Name: JobTitle, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 89 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DYo0zcxBMx8T" + }, + "source": [ + "**How many JobTitles have only one occurrence in 2013?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "5Zm7x9-zd5Hu", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "0afc5092-4c3c-4843-a78e-9e1b2c8a2f72" + }, + "source": [ + "(df[df['Year'] == 2013]['JobTitle'].value_counts() == 1).sum()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "202" + ] + }, + "metadata": {}, + "execution_count": 48 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "i3XJLC_jM3aq", + "outputId": "f72fe225-218d-4239-a713-e8731c04a1fa" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "202" + ] + }, + "metadata": {}, + "execution_count": 96 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AxNuuKlVOqQp" + }, + "source": [ + "**How many people have the word chief in their job title?**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "uh8Oay3Yd6Eq", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "8c8ff4a8-0cf9-41cd-d9e9-a63912c0651e" + }, + "source": [ + "# count = 0\n", + "\n", + "# for x in df['JobTitle']:\n", + "# if 'chief' in x.lower().split(' '):\n", + "# count += 
1\n", + "\n", + "# print(count)\n", + "\n", + "# def hasChief(x):\n", + "# return 'chief' in x.lower().split(' ')\n", + "\n", + "# df['JobTitle'].apply(hasChief).sum()\n", + "\n", + "df['JobTitle'].apply(lambda x: 'chief' in x.lower().split(' ')).sum()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "477" + ] + }, + "metadata": {}, + "execution_count": 60 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b-2w3Xp0N68f", + "outputId": "f94ae9e0-1bfe-4636-a302-e60b5952671c" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "477" + ] + }, + "metadata": {}, + "execution_count": 101 + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wPz_hb0NP0Tf" + }, + "source": [ + "**Find correlation of TotalPay with other fields**" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "27TcvkAET8iz", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 238 + }, + "outputId": "9e2d84d7-9f23-49a9-87e9-53486a717eed" + }, + "source": [ + "df.corr()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdBasePayOvertimePayTotalPayTotalPayBenefitsYear
Id1.000000-0.204878-0.081505-0.211478-0.0921900.968171
BasePay-0.2048781.0000000.2667400.9544940.9465950.033751
OvertimePay-0.0815050.2667401.0000000.5048590.4679810.027887
TotalPay-0.2114780.9544940.5048591.0000000.9773130.032090
TotalPayBenefits-0.0921900.9465950.4679810.9773131.0000000.151947
Year0.9681710.0337510.0278870.0320900.1519471.000000
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " Id BasePay ... TotalPayBenefits Year\n", + "Id 1.000000 -0.204878 ... -0.092190 0.968171\n", + "BasePay -0.204878 1.000000 ... 0.946595 0.033751\n", + "OvertimePay -0.081505 0.266740 ... 0.467981 0.027887\n", + "TotalPay -0.211478 0.954494 ... 0.977313 0.032090\n", + "TotalPayBenefits -0.092190 0.946595 ... 1.000000 0.151947\n", + "Year 0.968171 0.033751 ... 0.151947 1.000000\n", + "\n", + "[6 rows x 6 columns]" + ] + }, + "metadata": {}, + "execution_count": 61 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "2Cxq8qE8d7Fi", + "outputId": "2dcce7cb-bac6-4e07-a1ed-1137dd6c1f05", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Id -0.211478\n", + "BasePay 0.954494\n", + "OvertimePay 0.504859\n", + "Benefits 0.632202\n", + "TotalPay 1.000000\n", + "TotalPayBenefits 0.977313\n", + "Year 0.032090\n", + "Notes NaN\n", + "Name: TotalPay, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 54 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Exploring correlation" + ], + "metadata": { + "id": "KjzBUTE6LHFr" + } + }, + { + "cell_type": "code", + "source": [ + "df2 = pd.DataFrame({\n", + " 'x': np.arange(1, 11),\n", + " 'y': np.arange(10, 0, -1),\n", + " 'z': np.arange(11, 21),\n", + " 'r': np.random.rand(10)\n", + "})" + ], + "metadata": { + "id": "aMVtNQqpLGiv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df2.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "muclJ2G5LZkD", + "outputId": "44e0faa3-8013-426a-9d00-e3d422870966" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xyzr
0110110.542859
129120.322000
238130.921930
347140.471896
456150.597488
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " x y z r\n", + "0 1 10 11 0.542859\n", + "1 2 9 12 0.322000\n", + "2 3 8 13 0.921930\n", + "3 4 7 14 0.471896\n", + "4 5 6 15 0.597488" + ] + }, + "metadata": {}, + "execution_count": 71 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df2.corr()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "IaQySagKLatT", + "outputId": "361cf01e-f8f1-4b74-95d0-56a636bf4904" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xyzr
x1.000000-1.0000001.000000-0.219664
y-1.0000001.000000-1.0000000.219664
z1.000000-1.0000001.000000-0.219664
r-0.2196640.219664-0.2196641.000000
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ], + "text/plain": [ + " x y z r\n", + "x 1.000000 -1.000000 1.000000 -0.219664\n", + "y -1.000000 1.000000 -1.000000 0.219664\n", + "z 1.000000 -1.000000 1.000000 -0.219664\n", + "r -0.219664 0.219664 -0.219664 1.000000" + ] + }, + "metadata": {}, + "execution_count": 72 + } + ] + } + ] +} \ No newline at end of file