From 459c79a9fc45fbc154a60464b25960714a245204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Sch=C3=A4tz?=
<61615369+martinschatz-cz@users.noreply.github.com>
Date: Thu, 1 Feb 2024 11:17:04 +0100
Subject: [PATCH] Update 02_Pandas_operations.ipynb
Fixes #12: load Results.csv in the exercise from the repository's raw
GitHub URL instead of a relative path, so the notebook also runs on
Google Colab. A few typos in the markdown cells are fixed along the way.
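The exercise cell previously read the data with a relative path that
only resolves inside a local clone of the repository:

    dataframe = pd.read_csv('../../data/Results.csv', index_col=0, delimiter=';')

Reading the same file from the repository's raw GitHub URL works both
locally and on Colab:

    dataframe = pd.read_csv('https://github.com/vmcf-konfmi/MB100T01/raw/main/data/Results.csv', index_col=0, delimiter=';')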
---
.../01_pandas_statistics/02_Pandas_operations.ipynb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/advanced_image_analysis_mb100t01/01_pandas_statistics/02_Pandas_operations.ipynb b/advanced_image_analysis_mb100t01/01_pandas_statistics/02_Pandas_operations.ipynb
index d773bfb..a6c8c09 100644
--- a/advanced_image_analysis_mb100t01/01_pandas_statistics/02_Pandas_operations.ipynb
+++ b/advanced_image_analysis_mb100t01/01_pandas_statistics/02_Pandas_operations.ipynb
@@ -1 +1 @@
-{"cells":[{"cell_type":"markdown","source":["
"],"metadata":{"id":"_kbesz9rsBo5"}},{"cell_type":"markdown","metadata":{"id":"Vkf1B-vMwpVB"},"source":["# Basic Operations"]},{"cell_type":"markdown","metadata":{"id":"fpPtZgqIvuXz"},"source":["Inspiration and some of the parts came from: Python Data Science [GitHub repository](https://github.com/jakevdp/PythonDataScienceHandbook/tree/master), [MIT License](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/LICENSE-CODE) and [Introduction to Pandas](https://colab.research.google.com/notebooks/mlcc/intro_to_pandas.ipynb) by Google, [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)\n","\n","If running this from Google Colab, uncomment the cell below and run it. Otherwise, just skip it."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"5saSBc40voZF"},"outputs":[],"source":["#!pip install seaborn\n","#!pip install watermark"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"TL7zwFk5sA2o"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import seaborn as sns"]},{"cell_type":"markdown","metadata":{"id":"ZkUd2sa-yP5e"},"source":["## Learning Objectives:\n","\n"," * operations\n"," * selection\n"," * filtering\n"," * concat\n"," * NaNs\n"]},{"cell_type":"markdown","metadata":{"id":"oa5wfZT7VHJl"},"source":["For this notebook, we will continue using the cities and california housing dataframes."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Hn50qDhssA2p","outputId":"b7f1c937-012c-430a-b4d8-1beca9dcd07f"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population\n","0 San Francisco 852469\n","1 San Jose 1015785\n","2 Sacramento 485199"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n","population = pd.Series([852469, 1015785, 485199])\n","\n","cities_dataframe = pd.DataFrame({ 'City name': city_names, 'Population': population })\n","cities_dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":400,"status":"ok","timestamp":1692082096397,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"av6RYOraVG1V","outputId":"acbcb340-2789-428a-c4dc-9c92b57d52c6"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," longitude | \n"," latitude | \n"," housing_median_age | \n"," total_rooms | \n"," total_bedrooms | \n"," population | \n"," households | \n"," median_income | \n"," median_house_value | \n","
\n"," \n"," \n"," \n"," 0 | \n"," -114.31 | \n"," 34.19 | \n"," 15.0 | \n"," 5612.0 | \n"," 1283.0 | \n"," 1015.0 | \n"," 472.0 | \n"," 1.4936 | \n"," 66900.0 | \n","
\n"," \n"," 1 | \n"," -114.47 | \n"," 34.40 | \n"," 19.0 | \n"," 7650.0 | \n"," 1901.0 | \n"," 1129.0 | \n"," 463.0 | \n"," 1.8200 | \n"," 80100.0 | \n","
\n"," \n"," 2 | \n"," -114.56 | \n"," 33.69 | \n"," 17.0 | \n"," 720.0 | \n"," 174.0 | \n"," 333.0 | \n"," 117.0 | \n"," 1.6509 | \n"," 85700.0 | \n","
\n"," \n"," 3 | \n"," -114.57 | \n"," 33.64 | \n"," 14.0 | \n"," 1501.0 | \n"," 337.0 | \n"," 515.0 | \n"," 226.0 | \n"," 3.1917 | \n"," 73400.0 | \n","
\n"," \n"," 4 | \n"," -114.57 | \n"," 33.57 | \n"," 20.0 | \n"," 1454.0 | \n"," 326.0 | \n"," 624.0 | \n"," 262.0 | \n"," 1.9250 | \n"," 65500.0 | \n","
\n"," \n","
\n","
"],"text/plain":[" longitude latitude housing_median_age total_rooms total_bedrooms \\\n","0 -114.31 34.19 15.0 5612.0 1283.0 \n","1 -114.47 34.40 19.0 7650.0 1901.0 \n","2 -114.56 33.69 17.0 720.0 174.0 \n","3 -114.57 33.64 14.0 1501.0 337.0 \n","4 -114.57 33.57 20.0 1454.0 326.0 \n","\n"," population households median_income median_house_value \n","0 1015.0 472.0 1.4936 66900.0 \n","1 1129.0 463.0 1.8200 80100.0 \n","2 333.0 117.0 1.6509 85700.0 \n","3 515.0 226.0 3.1917 73400.0 \n","4 624.0 262.0 1.9250 65500.0 "]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n","california_housing_dataframe.head()"]},{"cell_type":"markdown","metadata":{"id":"OzPb3cEJsA2r"},"source":["## Manipulating Data\n","\n","### Applying functions\n","\n","You may apply Python's basic arithmetic operations to `Series`. For example:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZPvYGXKisA2r","outputId":"f13f9c0e-cafd-4407-da38-e6d738a7b6b2"},"outputs":[{"data":{"text/plain":["0 852.469\n","1 1015.785\n","2 485.199\n","dtype: float64"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["population / 1000"]},{"cell_type":"markdown","metadata":{"id":"Z9QVl3qksA2s"},"source":["*pandas* `Series` can be used as arguments to most NumPy functions:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"FXAIj5tUsA2s","outputId":"72ac6c7b-c545-4a6b-ef05-5ded8445d398"},"outputs":[{"data":{"text/plain":["0 13.655892\n","1 13.831172\n","2 13.092314\n","dtype: float64"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["np.log(population)"]},{"cell_type":"markdown","metadata":{"id":"hRofbzevsA2s"},"source":["For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map),\n","`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n","\n","The example below creates a new `Series` that indicates whether `population` is over one million:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2eN3aG_xsA2s","outputId":"36b769eb-77e3-441e-d8d8-5fe0a91b8d83"},"outputs":[{"data":{"text/plain":["0 False\n","1 True\n","2 False\n","dtype: bool"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["big_city = population.apply(lambda val: val > 1000000)\n","big_city"]},{"cell_type":"markdown","metadata":{"id":"XfzCh1qT1Pn3"},"source":["### Filtering"]},{"cell_type":"markdown","metadata":{"id":"sh7L0H2HsA2t"},"source":["One can use this result as a binary mask to make a sub-dataframe."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XzNvFVotsA2t","outputId":"754ca69f-10c5-454d-cbd5-75fbf0825ec5"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n","
\n"," \n"," \n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population\n","1 San Jose 1015785"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["cities_dataframe[big_city]"]},{"cell_type":"markdown","metadata":{"id":"6KDilny_sA2t"},"source":["Here is another way of generating a binary mask without explicitly using a `lamba` function."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"VpA39aYVsA2t","outputId":"0948b7dd-fc97-444a-fbb7-d9858a0e26fc"},"outputs":[{"data":{"text/plain":["0 False\n","1 True\n","2 False\n","Name: Population, dtype: bool"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["big_city = cities_dataframe['Population'] > 1000000\n","big_city"]},{"cell_type":"markdown","metadata":{"id":"5S6iUV5WsA2t"},"source":["### Adding new columns\n","\n","Modifying `DataFrames` is also straightforward. For example, the following code adds two `Series` to an existing `DataFrame`. One of them is the result of a computation of 2 existing columns."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"WOGb96mosA2t","outputId":"1721868d-eb6b-457b-881e-59662cd14f7f"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density\n","0 San Francisco 852469 46.87 18187.945381\n","1 San Jose 1015785 176.53 5754.177760\n","2 Sacramento 485199 97.92 4955.055147"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["cities_dataframe['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n","cities_dataframe['Population density'] = cities_dataframe['Population'] / cities_dataframe['Area square miles']\n","cities_dataframe"]},{"cell_type":"markdown","metadata":{"id":"ZvqbT9tP030O"},"source":["### Concatenating DataFrames\n","\n","Let's imagine we collect another similar data sample, like the one below."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7tYdJEJOsA2u","outputId":"845945fb-71c9-4be2-8674-f371d9c6132d"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n","
\n"," \n"," 1 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n","
\n"," \n"," 2 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density\n","0 Sao Paulo 12400232 587.34 21112.527667\n","1 Sao Luis 1108975 319.36 3472.491859\n","2 Salvador 2886698 268.00 10771.261194"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["city_names = pd.Series(['Sao Paulo', 'Sao Luis', 'Salvador'])\n","population = pd.Series([12400232, 1108975, 2886698])\n","\n","another_cities_dataframe = pd.DataFrame({ 'City name': city_names, 'Population': population })\n","\n","another_cities_dataframe['Area square miles'] = pd.Series([587.34, 319.36, 268])\n","another_cities_dataframe['Population density'] = another_cities_dataframe['Population'] / another_cities_dataframe['Area square miles']\n","another_cities_dataframe"]},{"cell_type":"markdown","metadata":{"id":"mFZVreVhsA2u"},"source":["Before concatenating, it is probably a good idea to insert an identifier column so that we keep track where data came from.\n","\n","We can easily do that by creating a new column in each dataframe **before** concatenating."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"iIVZNMQTsA2u"},"outputs":[],"source":["cities_dataframe['Country'] = 'USA'\n","another_cities_dataframe['Country'] = 'Brazil'"]},{"cell_type":"markdown","metadata":{"id":"OO5Iw5TYsA2u"},"source":["We can now concatenate similar dataframes with the `pandas.concat` functions."]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096398,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"wlPXyLYw06UV","outputId":"76547221-538c-4d71-e86c-9871e7318ee2"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n"," Country | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n"," USA | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n"," USA | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n"," USA | \n","
\n"," \n"," 0 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n"," Brazil | \n","
\n"," \n"," 1 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n"," Brazil | \n","
\n"," \n"," 2 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n"," Brazil | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density Country\n","0 San Francisco 852469 46.87 18187.945381 USA\n","1 San Jose 1015785 176.53 5754.177760 USA\n","2 Sacramento 485199 97.92 4955.055147 USA\n","0 Sao Paulo 12400232 587.34 21112.527667 Brazil\n","1 Sao Luis 1108975 319.36 3472.491859 Brazil\n","2 Salvador 2886698 268.00 10771.261194 Brazil"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["result = pd.concat([cities_dataframe, another_cities_dataframe])\n","result"]},{"cell_type":"markdown","metadata":{"id":"WICecJHvsA2u"},"source":["We now have a longer dataframe with some repeated indices. To have unique indices, we can use `.reset_index`."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"BvRjm2vUsA2u","outputId":"0fe65ba1-8b8c-4ca4-d3b3-7a0674385dfa"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n"," Country | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n"," USA | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n"," USA | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n"," USA | \n","
\n"," \n"," 3 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n"," Brazil | \n","
\n"," \n"," 4 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n"," Brazil | \n","
\n"," \n"," 5 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n"," Brazil | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density Country\n","0 San Francisco 852469 46.87 18187.945381 USA\n","1 San Jose 1015785 176.53 5754.177760 USA\n","2 Sacramento 485199 97.92 4955.055147 USA\n","3 Sao Paulo 12400232 587.34 21112.527667 Brazil\n","4 Sao Luis 1108975 319.36 3472.491859 Brazil\n","5 Salvador 2886698 268.00 10771.261194 Brazil"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["result = result.reset_index(drop=True)\n","result"]},{"cell_type":"markdown","metadata":{"id":"uy2tR0XR3vbE"},"source":["### NaNs\n","\n","`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. We cannot assume what these values are, because that would distort th results. So we need to deal with these NaNs values."]},{"cell_type":"markdown","metadata":{"id":"bGYpAX7t4bky"},"source":["We can test the missing values using `isnull()` function."]},{"cell_type":"markdown","metadata":{"id":"P0BwiPFhIbfV"},"source":["We can work with one of the `seaborn` training datasets *Penguins*"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9Spm3StwImkL"},"outputs":[],"source":["penguins = sns.load_dataset(\"penguins\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096400,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"GYJkXKke42gp","outputId":"ab0c346a-952e-4fa1-aca9-e0d96c964520"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," species | \n"," island | \n"," bill_length_mm | \n"," bill_depth_mm | \n"," flipper_length_mm | \n"," body_mass_g | \n"," sex | \n","
\n"," \n"," \n"," \n"," 0 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 1 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 2 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 3 | \n"," False | \n"," False | \n"," True | \n"," True | \n"," True | \n"," True | \n"," True | \n","
\n"," \n"," 4 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 339 | \n"," False | \n"," False | \n"," True | \n"," True | \n"," True | \n"," True | \n"," True | \n","
\n"," \n"," 340 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 341 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 342 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 343 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n","
\n","
344 rows × 7 columns
\n","
"],"text/plain":[" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n","0 False False False False False \n","1 False False False False False \n","2 False False False False False \n","3 False False True True True \n","4 False False False False False \n",".. ... ... ... ... ... \n","339 False False True True True \n","340 False False False False False \n","341 False False False False False \n","342 False False False False False \n","343 False False False False False \n","\n"," body_mass_g sex \n","0 False False \n","1 False False \n","2 False False \n","3 True True \n","4 False False \n",".. ... ... \n","339 True True \n","340 False False \n","341 False False \n","342 False False \n","343 False False \n","\n","[344 rows x 7 columns]"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull()"]},{"cell_type":"markdown","metadata":{"id":"m9jBF0A55OBn"},"source":["But it is more practical to test if there are any NaNs, than looking for it. We can use `.isnull().values.any()` approach."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":18,"status":"ok","timestamp":1692082096401,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"PucyKyW87geJ","outputId":"ba37a65a-dc0c-47a6-8bba-ed09be8ee1c7"},"outputs":[{"data":{"text/plain":["True"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull().values.any()"]},{"cell_type":"markdown","metadata":{"id":"C5dD79015mYf"},"source":["Or we can explore each column using `.isnull().sum()`."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096401,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"GWIZ4l_z5tYQ","outputId":"bf5f4e7b-9e1c-406d-fade-2e543db8f0ff"},"outputs":[{"data":{"text/plain":["species 0\n","island 0\n","bill_length_mm 2\n","bill_depth_mm 2\n","flipper_length_mm 2\n","body_mass_g 2\n","sex 11\n","dtype: int64"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull().sum()"]},{"cell_type":"markdown","metadata":{"id":"PcKxJ_vNLW3X"},"source":["We will want to drop all rows with unknown entries with `.dropna()` function."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096402,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"Vfqe-aMDMBpi","outputId":"6cd4d58f-78d6-47ff-bc4c-17423e4bd683"},"outputs":[{"data":{"text/plain":["species 0\n","island 0\n","bill_length_mm 0\n","bill_depth_mm 0\n","flipper_length_mm 0\n","body_mass_g 0\n","sex 0\n","dtype: int64"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["penguins_cleaned = penguins.dropna()\n","penguins_cleaned.isnull().sum()"]},{"cell_type":"markdown","metadata":{"id":"VksvzteBsA2z"},"source":["## Exercise\n","\n","The table below contains shape and intensity measurements from a biological sample. Make a subset with the columns `Area` and `Mean`. Remove all rows that contain NaNs from it and count the remaining rows.\n","\n","Afterwards, take the initial table again and make a new subset with the columns `Major` and `Minor`. 
Remove NaNs and count the remaining rows again.\n","\n","What do you conclude?"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ftJB1JvXsA20","outputId":"e888baf3-5fdb-45a9-d860-f7f570e709e5"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," Area | \n"," Mean | \n"," StdDev | \n"," Min | \n"," Max | \n"," X | \n"," Y | \n"," XM | \n"," YM | \n"," Major | \n"," Minor | \n"," Angle | \n"," %Area | \n"," Type | \n","
\n"," \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n","
\n"," \n"," \n"," \n"," 1 | \n"," 18.0 | \n"," 730.389 | \n"," 103.354 | \n"," 592.0 | \n"," 948.0 | \n"," 435.000 | \n"," 4.722 | \n"," 434.962 | \n"," 4.697 | \n"," 5.987 | \n"," 3.828 | \n"," 168.425 | \n"," 100 | \n"," A | \n","
\n"," \n"," 2 | \n"," 126.0 | \n"," 718.333 | \n"," 90.367 | \n"," 556.0 | \n"," 1046.0 | \n"," 388.087 | \n"," 8.683 | \n"," 388.183 | \n"," 8.687 | \n"," 16.559 | \n"," 9.688 | \n"," 175.471 | \n"," 100 | \n"," A | \n","
\n"," \n"," 3 | \n"," NaN | \n"," NaN | \n"," NaN | \n"," 608.0 | \n"," 964.0 | \n"," NaN | \n"," NaN | \n"," NaN | \n"," 7.665 | \n"," 7.359 | \n"," NaN | \n"," 101.121 | \n"," 100 | \n"," A | \n","
\n"," \n"," 4 | \n"," 68.0 | \n"," 686.985 | \n"," 61.169 | \n"," 571.0 | \n"," 880.0 | \n"," 126.147 | \n"," 8.809 | \n"," 126.192 | \n"," 8.811 | \n"," 15.136 | \n"," 5.720 | \n"," 168.133 | \n"," 100 | \n"," A | \n","
\n"," \n"," 5 | \n"," NaN | \n"," NaN | \n"," 69.438 | \n"," 566.0 | \n"," 792.0 | \n"," 348.500 | \n"," 7.500 | \n"," NaN | \n"," 7.508 | \n"," NaN | \n"," 3.088 | \n"," NaN | \n"," 100 | \n"," A | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 387 | \n"," 152.0 | \n"," 801.599 | \n"," 111.328 | \n"," 582.0 | \n"," 1263.0 | \n"," 348.487 | \n"," 497.632 | \n"," 348.451 | \n"," 497.675 | \n"," 17.773 | \n"," 10.889 | \n"," 11.829 | \n"," 100 | \n"," A | \n","
\n"," \n"," 388 | \n"," 17.0 | \n"," 742.706 | \n"," 69.624 | \n"," 620.0 | \n"," 884.0 | \n"," 420.500 | \n"," 496.382 | \n"," 420.513 | \n"," NaN | \n"," NaN | \n"," 3.663 | \n"," 49.457 | \n"," 100 | \n"," A | \n","
\n"," \n"," 389 | \n"," 60.0 | \n"," 758.033 | \n"," 77.309 | \n"," 601.0 | \n"," 947.0 | \n"," 259.000 | \n"," 499.300 | \n"," 258.990 | \n"," 499.289 | \n"," 9.476 | \n"," 8.062 | \n"," 90.000 | \n"," 100 | \n"," A | \n","
\n"," \n"," 390 | \n"," 12.0 | \n"," 714.833 | \n"," 67.294 | \n"," 551.0 | \n"," 785.0 | \n"," 240.167 | \n"," 498.167 | \n"," 240.179 | \n"," 498.148 | \n"," 4.606 | \n"," 3.317 | \n"," 168.690 | \n"," 100 | \n"," A | \n","
\n"," \n"," 391 | \n"," 23.0 | \n"," 695.043 | \n"," 67.356 | \n"," 611.0 | \n"," 846.0 | \n"," 49.891 | \n"," 503.022 | \n"," 49.882 | \n"," 502.979 | \n"," 6.454 | \n"," 4.537 | \n"," 73.243 | \n"," 100 | \n"," A | \n","
\n"," \n","
\n","
391 rows × 14 columns
\n","
"],"text/plain":[" Area Mean StdDev Min Max X Y XM \\\n"," \n","1 18.0 730.389 103.354 592.0 948.0 435.000 4.722 434.962 \n","2 126.0 718.333 90.367 556.0 1046.0 388.087 8.683 388.183 \n","3 NaN NaN NaN 608.0 964.0 NaN NaN NaN \n","4 68.0 686.985 61.169 571.0 880.0 126.147 8.809 126.192 \n","5 NaN NaN 69.438 566.0 792.0 348.500 7.500 NaN \n",".. ... ... ... ... ... ... ... ... \n","387 152.0 801.599 111.328 582.0 1263.0 348.487 497.632 348.451 \n","388 17.0 742.706 69.624 620.0 884.0 420.500 496.382 420.513 \n","389 60.0 758.033 77.309 601.0 947.0 259.000 499.300 258.990 \n","390 12.0 714.833 67.294 551.0 785.0 240.167 498.167 240.179 \n","391 23.0 695.043 67.356 611.0 846.0 49.891 503.022 49.882 \n","\n"," YM Major Minor Angle %Area Type \n"," \n","1 4.697 5.987 3.828 168.425 100 A \n","2 8.687 16.559 9.688 175.471 100 A \n","3 7.665 7.359 NaN 101.121 100 A \n","4 8.811 15.136 5.720 168.133 100 A \n","5 7.508 NaN 3.088 NaN 100 A \n",".. ... ... ... ... ... ... \n","387 497.675 17.773 10.889 11.829 100 A \n","388 NaN NaN 3.663 49.457 100 A \n","389 499.289 9.476 8.062 90.000 100 A \n","390 498.148 4.606 3.317 168.690 100 A \n","391 502.979 6.454 4.537 73.243 100 A \n","\n","[391 rows x 14 columns]"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["dataframe = pd.read_csv('../../data/Results.csv', index_col=0, delimiter=';')\n","dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7IPdbT2NsA20"},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"id":"fH1zusN7GKCx"},"source":["**Watermark**"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1692082096620,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"iH1jL0baGMJW","outputId":"8639a749-8bab-4fd8-d6be-29230edccdea"},"outputs":[{"name":"stdout","output_type":"stream","text":["Last updated: 2023-08-24T14:25:40.097708+02:00\n","\n","Python implementation: CPython\n","Python version : 3.9.17\n","IPython version : 8.14.0\n","\n","Compiler : MSC v.1929 64 bit (AMD64)\n","OS : Windows\n","Release : 10\n","Machine : AMD64\n","Processor : Intel64 Family 6 Model 165 Stepping 2, GenuineIntel\n","CPU cores : 16\n","Architecture: 64bit\n","\n","watermark : 2.4.3\n","numpy : 1.23.5\n","pandas : 2.0.3\n","seaborn : 0.12.2\n","pivottablejs: 0.9.0\n","\n"]}],"source":["from watermark import watermark\n","watermark(iversions=True, globals_=globals())\n","print(watermark())\n","print(watermark(packages=\"watermark,numpy,pandas,seaborn,pivottablejs\"))"]}],"metadata":{"colab":{"provenance":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.17"}},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file
+{"cells":[{"cell_type":"markdown","source":["
"],"metadata":{"id":"_kbesz9rsBo5"}},{"cell_type":"markdown","metadata":{"id":"Vkf1B-vMwpVB"},"source":["# Basic Operations"]},{"cell_type":"markdown","metadata":{"id":"fpPtZgqIvuXz"},"source":["Inspiration and some of the parts came from: Python Data Science [GitHub repository](https://github.com/jakevdp/PythonDataScienceHandbook/tree/master), [MIT License](https://github.com/jakevdp/PythonDataScienceHandbook/blob/master/LICENSE-CODE) and [Introduction to Pandas](https://colab.research.google.com/notebooks/mlcc/intro_to_pandas.ipynb) by Google, [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0)\n","\n","If running this from Google Colab, uncomment the cell below and run it. Otherwise, just skip it."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"5saSBc40voZF"},"outputs":[],"source":["#!pip install seaborn\n","#!pip install watermark"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"TL7zwFk5sA2o"},"outputs":[],"source":["import pandas as pd\n","import numpy as np\n","import seaborn as sns"]},{"cell_type":"markdown","metadata":{"id":"ZkUd2sa-yP5e"},"source":["## Learning Objectives:\n","\n"," * operations\n"," * selection\n"," * filtering\n"," * concat\n"," * NaNs\n"]},{"cell_type":"markdown","metadata":{"id":"oa5wfZT7VHJl"},"source":["For this notebook, we will continue using the cities and california housing dataframes."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Hn50qDhssA2p","outputId":"b7f1c937-012c-430a-b4d8-1beca9dcd07f"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population\n","0 San Francisco 852469\n","1 San Jose 1015785\n","2 Sacramento 485199"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["city_names = pd.Series(['San Francisco', 'San Jose', 'Sacramento'])\n","population = pd.Series([852469, 1015785, 485199])\n","\n","cities_dataframe = pd.DataFrame({ 'City name': city_names, 'Population': population })\n","cities_dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"executionInfo":{"elapsed":400,"status":"ok","timestamp":1692082096397,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"av6RYOraVG1V","outputId":"acbcb340-2789-428a-c4dc-9c92b57d52c6"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," longitude | \n"," latitude | \n"," housing_median_age | \n"," total_rooms | \n"," total_bedrooms | \n"," population | \n"," households | \n"," median_income | \n"," median_house_value | \n","
\n"," \n"," \n"," \n"," 0 | \n"," -114.31 | \n"," 34.19 | \n"," 15.0 | \n"," 5612.0 | \n"," 1283.0 | \n"," 1015.0 | \n"," 472.0 | \n"," 1.4936 | \n"," 66900.0 | \n","
\n"," \n"," 1 | \n"," -114.47 | \n"," 34.40 | \n"," 19.0 | \n"," 7650.0 | \n"," 1901.0 | \n"," 1129.0 | \n"," 463.0 | \n"," 1.8200 | \n"," 80100.0 | \n","
\n"," \n"," 2 | \n"," -114.56 | \n"," 33.69 | \n"," 17.0 | \n"," 720.0 | \n"," 174.0 | \n"," 333.0 | \n"," 117.0 | \n"," 1.6509 | \n"," 85700.0 | \n","
\n"," \n"," 3 | \n"," -114.57 | \n"," 33.64 | \n"," 14.0 | \n"," 1501.0 | \n"," 337.0 | \n"," 515.0 | \n"," 226.0 | \n"," 3.1917 | \n"," 73400.0 | \n","
\n"," \n"," 4 | \n"," -114.57 | \n"," 33.57 | \n"," 20.0 | \n"," 1454.0 | \n"," 326.0 | \n"," 624.0 | \n"," 262.0 | \n"," 1.9250 | \n"," 65500.0 | \n","
\n"," \n","
\n","
"],"text/plain":[" longitude latitude housing_median_age total_rooms total_bedrooms \\\n","0 -114.31 34.19 15.0 5612.0 1283.0 \n","1 -114.47 34.40 19.0 7650.0 1901.0 \n","2 -114.56 33.69 17.0 720.0 174.0 \n","3 -114.57 33.64 14.0 1501.0 337.0 \n","4 -114.57 33.57 20.0 1454.0 326.0 \n","\n"," population households median_income median_house_value \n","0 1015.0 472.0 1.4936 66900.0 \n","1 1129.0 463.0 1.8200 80100.0 \n","2 333.0 117.0 1.6509 85700.0 \n","3 515.0 226.0 3.1917 73400.0 \n","4 624.0 262.0 1.9250 65500.0 "]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["california_housing_dataframe = pd.read_csv(\"https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv\", sep=\",\")\n","california_housing_dataframe.head()"]},{"cell_type":"markdown","metadata":{"id":"OzPb3cEJsA2r"},"source":["## Manipulating Data\n","\n","### Applying functions\n","\n","You may apply Python's basic arithmetic operations to `Series`. For example:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ZPvYGXKisA2r","outputId":"f13f9c0e-cafd-4407-da38-e6d738a7b6b2"},"outputs":[{"data":{"text/plain":["0 852.469\n","1 1015.785\n","2 485.199\n","dtype: float64"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}],"source":["population / 1000"]},{"cell_type":"markdown","metadata":{"id":"Z9QVl3qksA2s"},"source":["*pandas* `Series` can be used as arguments to most NumPy functions:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"FXAIj5tUsA2s","outputId":"72ac6c7b-c545-4a6b-ef05-5ded8445d398"},"outputs":[{"data":{"text/plain":["0 13.655892\n","1 13.831172\n","2 13.092314\n","dtype: float64"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["np.log(population)"]},{"cell_type":"markdown","metadata":{"id":"hRofbzevsA2s"},"source":["For more complex single-column transformations, you can use `Series.apply`. Like the Python [map function](https://docs.python.org/2/library/functions.html#map),\n","`Series.apply` accepts as an argument a [lambda function](https://docs.python.org/2/tutorial/controlflow.html#lambda-expressions), which is applied to each value.\n","\n","The example below creates a new `Series` that indicates whether `population` is over one million:"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2eN3aG_xsA2s","outputId":"36b769eb-77e3-441e-d8d8-5fe0a91b8d83"},"outputs":[{"data":{"text/plain":["0 False\n","1 True\n","2 False\n","dtype: bool"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}],"source":["big_city = population.apply(lambda val: val > 1000000)\n","big_city"]},{"cell_type":"markdown","metadata":{"id":"XfzCh1qT1Pn3"},"source":["### Filtering"]},{"cell_type":"markdown","metadata":{"id":"sh7L0H2HsA2t"},"source":["One can use this result as a binary mask to make a sub-dataframe."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"XzNvFVotsA2t","outputId":"754ca69f-10c5-454d-cbd5-75fbf0825ec5"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n","
\n"," \n"," \n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population\n","1 San Jose 1015785"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}],"source":["cities_dataframe[big_city]"]},{"cell_type":"markdown","metadata":{"id":"6KDilny_sA2t"},"source":["Here is another way of generating a binary mask without explicitly using a `lamba` function."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"VpA39aYVsA2t","outputId":"0948b7dd-fc97-444a-fbb7-d9858a0e26fc"},"outputs":[{"data":{"text/plain":["0 False\n","1 True\n","2 False\n","Name: Population, dtype: bool"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}],"source":["big_city = cities_dataframe['Population'] > 1000000\n","big_city"]},{"cell_type":"markdown","metadata":{"id":"5S6iUV5WsA2t"},"source":["### Adding new columns\n","\n","Modifying `DataFrames` is also straightforward. For example, the following code adds two `Series` to an existing `DataFrame`. One of them is the result of a computation of 2 existing columns."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"WOGb96mosA2t","outputId":"1721868d-eb6b-457b-881e-59662cd14f7f"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density\n","0 San Francisco 852469 46.87 18187.945381\n","1 San Jose 1015785 176.53 5754.177760\n","2 Sacramento 485199 97.92 4955.055147"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}],"source":["cities_dataframe['Area square miles'] = pd.Series([46.87, 176.53, 97.92])\n","cities_dataframe['Population density'] = cities_dataframe['Population'] / cities_dataframe['Area square miles']\n","cities_dataframe"]},{"cell_type":"markdown","metadata":{"id":"ZvqbT9tP030O"},"source":["### Concatenating DataFrames\n","\n","Let's imagine we collect another similar data sample, like the one below."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7tYdJEJOsA2u","outputId":"845945fb-71c9-4be2-8674-f371d9c6132d"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n","
\n"," \n"," \n"," \n"," 0 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n","
\n"," \n"," 1 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n","
\n"," \n"," 2 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density\n","0 Sao Paulo 12400232 587.34 21112.527667\n","1 Sao Luis 1108975 319.36 3472.491859\n","2 Salvador 2886698 268.00 10771.261194"]},"execution_count":11,"metadata":{},"output_type":"execute_result"}],"source":["city_names = pd.Series(['Sao Paulo', 'Sao Luis', 'Salvador'])\n","population = pd.Series([12400232, 1108975, 2886698])\n","\n","another_cities_dataframe = pd.DataFrame({ 'City name': city_names, 'Population': population })\n","\n","another_cities_dataframe['Area square miles'] = pd.Series([587.34, 319.36, 268])\n","another_cities_dataframe['Population density'] = another_cities_dataframe['Population'] / another_cities_dataframe['Area square miles']\n","another_cities_dataframe"]},{"cell_type":"markdown","metadata":{"id":"mFZVreVhsA2u"},"source":["Before concatenating, it is probably a good idea to insert an identifier column so that we keep track where data came from.\n","\n","We can easily do that by creating a new column in each dataframe **before** concatenating."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"iIVZNMQTsA2u"},"outputs":[],"source":["cities_dataframe['Country'] = 'USA'\n","another_cities_dataframe['Country'] = 'Brazil'"]},{"cell_type":"markdown","metadata":{"id":"OO5Iw5TYsA2u"},"source":["We can now concatenate similar dataframes with the `pandas.concat` functions."]},{"cell_type":"code","execution_count":null,"metadata":{"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096398,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"wlPXyLYw06UV","outputId":"76547221-538c-4d71-e86c-9871e7318ee2"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n"," Country | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n"," USA | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n"," USA | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n"," USA | \n","
\n"," \n"," 0 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n"," Brazil | \n","
\n"," \n"," 1 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n"," Brazil | \n","
\n"," \n"," 2 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n"," Brazil | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density Country\n","0 San Francisco 852469 46.87 18187.945381 USA\n","1 San Jose 1015785 176.53 5754.177760 USA\n","2 Sacramento 485199 97.92 4955.055147 USA\n","0 Sao Paulo 12400232 587.34 21112.527667 Brazil\n","1 Sao Luis 1108975 319.36 3472.491859 Brazil\n","2 Salvador 2886698 268.00 10771.261194 Brazil"]},"execution_count":13,"metadata":{},"output_type":"execute_result"}],"source":["result = pd.concat([cities_dataframe, another_cities_dataframe])\n","result"]},{"cell_type":"markdown","metadata":{"id":"WICecJHvsA2u"},"source":["We now have a longer dataframe with some repeated indices. To have unique indices, we can use `.reset_index`."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"BvRjm2vUsA2u","outputId":"0fe65ba1-8b8c-4ca4-d3b3-7a0674385dfa"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," City name | \n"," Population | \n"," Area square miles | \n"," Population density | \n"," Country | \n","
\n"," \n"," \n"," \n"," 0 | \n"," San Francisco | \n"," 852469 | \n"," 46.87 | \n"," 18187.945381 | \n"," USA | \n","
\n"," \n"," 1 | \n"," San Jose | \n"," 1015785 | \n"," 176.53 | \n"," 5754.177760 | \n"," USA | \n","
\n"," \n"," 2 | \n"," Sacramento | \n"," 485199 | \n"," 97.92 | \n"," 4955.055147 | \n"," USA | \n","
\n"," \n"," 3 | \n"," Sao Paulo | \n"," 12400232 | \n"," 587.34 | \n"," 21112.527667 | \n"," Brazil | \n","
\n"," \n"," 4 | \n"," Sao Luis | \n"," 1108975 | \n"," 319.36 | \n"," 3472.491859 | \n"," Brazil | \n","
\n"," \n"," 5 | \n"," Salvador | \n"," 2886698 | \n"," 268.00 | \n"," 10771.261194 | \n"," Brazil | \n","
\n"," \n","
\n","
"],"text/plain":[" City name Population Area square miles Population density Country\n","0 San Francisco 852469 46.87 18187.945381 USA\n","1 San Jose 1015785 176.53 5754.177760 USA\n","2 Sacramento 485199 97.92 4955.055147 USA\n","3 Sao Paulo 12400232 587.34 21112.527667 Brazil\n","4 Sao Luis 1108975 319.36 3472.491859 Brazil\n","5 Salvador 2886698 268.00 10771.261194 Brazil"]},"execution_count":14,"metadata":{},"output_type":"execute_result"}],"source":["result = result.reset_index(drop=True)\n","result"]},{"cell_type":"markdown","metadata":{"id":"uy2tR0XR3vbE"},"source":["### NaNs\n","\n","`DataFrame` objects can be created by passing a `dict` mapping `string` column names to their respective `Series`. If the `Series` don't match in length, missing values are filled with special [NA/NaN](http://pandas.pydata.org/pandas-docs/stable/missing_data.html) values. We cannot assume what these values are, because that would distort th results. So we need to deal with these NaNs values."]},{"cell_type":"markdown","metadata":{"id":"bGYpAX7t4bky"},"source":["We can test the missing values using `isnull()` function."]},{"cell_type":"markdown","metadata":{"id":"P0BwiPFhIbfV"},"source":["We can work with one of the `seaborn` training datasets *Penguins*"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9Spm3StwImkL"},"outputs":[],"source":["penguins = sns.load_dataset(\"penguins\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":424},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096400,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"GYJkXKke42gp","outputId":"ab0c346a-952e-4fa1-aca9-e0d96c964520"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," species | \n"," island | \n"," bill_length_mm | \n"," bill_depth_mm | \n"," flipper_length_mm | \n"," body_mass_g | \n"," sex | \n","
\n"," \n"," \n"," \n"," 0 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 1 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 2 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 3 | \n"," False | \n"," False | \n"," True | \n"," True | \n"," True | \n"," True | \n"," True | \n","
\n"," \n"," 4 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 339 | \n"," False | \n"," False | \n"," True | \n"," True | \n"," True | \n"," True | \n"," True | \n","
\n"," \n"," 340 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 341 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 342 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n"," 343 | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n"," False | \n","
\n"," \n","
\n","
344 rows × 7 columns
\n","
"],"text/plain":[" species island bill_length_mm bill_depth_mm flipper_length_mm \\\n","0 False False False False False \n","1 False False False False False \n","2 False False False False False \n","3 False False True True True \n","4 False False False False False \n",".. ... ... ... ... ... \n","339 False False True True True \n","340 False False False False False \n","341 False False False False False \n","342 False False False False False \n","343 False False False False False \n","\n"," body_mass_g sex \n","0 False False \n","1 False False \n","2 False False \n","3 True True \n","4 False False \n",".. ... ... \n","339 True True \n","340 False False \n","341 False False \n","342 False False \n","343 False False \n","\n","[344 rows x 7 columns]"]},"execution_count":16,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull()"]},{"cell_type":"markdown","metadata":{"id":"m9jBF0A55OBn"},"source":["But it is more practical to test if there are any NaNs, than looking for it. We can use `.isnull().values.any()` approach."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":18,"status":"ok","timestamp":1692082096401,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"PucyKyW87geJ","outputId":"ba37a65a-dc0c-47a6-8bba-ed09be8ee1c7"},"outputs":[{"data":{"text/plain":["True"]},"execution_count":17,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull().values.any()"]},{"cell_type":"markdown","metadata":{"id":"C5dD79015mYf"},"source":["Or we can explore each column using `.isnull().sum()`."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096401,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"GWIZ4l_z5tYQ","outputId":"bf5f4e7b-9e1c-406d-fade-2e543db8f0ff"},"outputs":[{"data":{"text/plain":["species 0\n","island 0\n","bill_length_mm 2\n","bill_depth_mm 2\n","flipper_length_mm 2\n","body_mass_g 2\n","sex 11\n","dtype: int64"]},"execution_count":18,"metadata":{},"output_type":"execute_result"}],"source":["penguins.isnull().sum()"]},{"cell_type":"markdown","metadata":{"id":"PcKxJ_vNLW3X"},"source":["We will want to drop all rows with unknown entries with `.dropna()` function."]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17,"status":"ok","timestamp":1692082096402,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"Vfqe-aMDMBpi","outputId":"6cd4d58f-78d6-47ff-bc4c-17423e4bd683"},"outputs":[{"data":{"text/plain":["species 0\n","island 0\n","bill_length_mm 0\n","bill_depth_mm 0\n","flipper_length_mm 0\n","body_mass_g 0\n","sex 0\n","dtype: int64"]},"execution_count":19,"metadata":{},"output_type":"execute_result"}],"source":["penguins_cleaned = penguins.dropna()\n","penguins_cleaned.isnull().sum()"]},{"cell_type":"markdown","metadata":{"id":"VksvzteBsA2z"},"source":["## Exercise\n","\n","The table below contains shape and intensity measurements from a biological sample. Make a subset with the columns `Area` and `Mean`. Remove all rows that contain NaNs from it and count the remaining rows.\n","\n","Afterwards, take the initial table again and make a new subset with the columns `Major` and `Minor`. 
Remove NaNs and count the remaining rows again.\n","\n","What do you conclude?"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"ftJB1JvXsA20","outputId":"e888baf3-5fdb-45a9-d860-f7f570e709e5"},"outputs":[{"data":{"text/html":["\n","\n","
\n"," \n"," \n"," | \n"," Area | \n"," Mean | \n"," StdDev | \n"," Min | \n"," Max | \n"," X | \n"," Y | \n"," XM | \n"," YM | \n"," Major | \n"," Minor | \n"," Angle | \n"," %Area | \n"," Type | \n","
\n"," \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n"," | \n","
\n"," \n"," \n"," \n"," 1 | \n"," 18.0 | \n"," 730.389 | \n"," 103.354 | \n"," 592.0 | \n"," 948.0 | \n"," 435.000 | \n"," 4.722 | \n"," 434.962 | \n"," 4.697 | \n"," 5.987 | \n"," 3.828 | \n"," 168.425 | \n"," 100 | \n"," A | \n","
\n"," \n"," 2 | \n"," 126.0 | \n"," 718.333 | \n"," 90.367 | \n"," 556.0 | \n"," 1046.0 | \n"," 388.087 | \n"," 8.683 | \n"," 388.183 | \n"," 8.687 | \n"," 16.559 | \n"," 9.688 | \n"," 175.471 | \n"," 100 | \n"," A | \n","
\n"," \n"," 3 | \n"," NaN | \n"," NaN | \n"," NaN | \n"," 608.0 | \n"," 964.0 | \n"," NaN | \n"," NaN | \n"," NaN | \n"," 7.665 | \n"," 7.359 | \n"," NaN | \n"," 101.121 | \n"," 100 | \n"," A | \n","
\n"," \n"," 4 | \n"," 68.0 | \n"," 686.985 | \n"," 61.169 | \n"," 571.0 | \n"," 880.0 | \n"," 126.147 | \n"," 8.809 | \n"," 126.192 | \n"," 8.811 | \n"," 15.136 | \n"," 5.720 | \n"," 168.133 | \n"," 100 | \n"," A | \n","
\n"," \n"," 5 | \n"," NaN | \n"," NaN | \n"," 69.438 | \n"," 566.0 | \n"," 792.0 | \n"," 348.500 | \n"," 7.500 | \n"," NaN | \n"," 7.508 | \n"," NaN | \n"," 3.088 | \n"," NaN | \n"," 100 | \n"," A | \n","
\n"," \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n"," ... | \n","
\n"," \n"," 387 | \n"," 152.0 | \n"," 801.599 | \n"," 111.328 | \n"," 582.0 | \n"," 1263.0 | \n"," 348.487 | \n"," 497.632 | \n"," 348.451 | \n"," 497.675 | \n"," 17.773 | \n"," 10.889 | \n"," 11.829 | \n"," 100 | \n"," A | \n","
\n"," \n"," 388 | \n"," 17.0 | \n"," 742.706 | \n"," 69.624 | \n"," 620.0 | \n"," 884.0 | \n"," 420.500 | \n"," 496.382 | \n"," 420.513 | \n"," NaN | \n"," NaN | \n"," 3.663 | \n"," 49.457 | \n"," 100 | \n"," A | \n","
\n"," \n"," 389 | \n"," 60.0 | \n"," 758.033 | \n"," 77.309 | \n"," 601.0 | \n"," 947.0 | \n"," 259.000 | \n"," 499.300 | \n"," 258.990 | \n"," 499.289 | \n"," 9.476 | \n"," 8.062 | \n"," 90.000 | \n"," 100 | \n"," A | \n","
\n"," \n"," 390 | \n"," 12.0 | \n"," 714.833 | \n"," 67.294 | \n"," 551.0 | \n"," 785.0 | \n"," 240.167 | \n"," 498.167 | \n"," 240.179 | \n"," 498.148 | \n"," 4.606 | \n"," 3.317 | \n"," 168.690 | \n"," 100 | \n"," A | \n","
\n"," \n"," 391 | \n"," 23.0 | \n"," 695.043 | \n"," 67.356 | \n"," 611.0 | \n"," 846.0 | \n"," 49.891 | \n"," 503.022 | \n"," 49.882 | \n"," 502.979 | \n"," 6.454 | \n"," 4.537 | \n"," 73.243 | \n"," 100 | \n"," A | \n","
\n"," \n","
\n","
391 rows × 14 columns
\n","
"],"text/plain":[" Area Mean StdDev Min Max X Y XM \\\n"," \n","1 18.0 730.389 103.354 592.0 948.0 435.000 4.722 434.962 \n","2 126.0 718.333 90.367 556.0 1046.0 388.087 8.683 388.183 \n","3 NaN NaN NaN 608.0 964.0 NaN NaN NaN \n","4 68.0 686.985 61.169 571.0 880.0 126.147 8.809 126.192 \n","5 NaN NaN 69.438 566.0 792.0 348.500 7.500 NaN \n",".. ... ... ... ... ... ... ... ... \n","387 152.0 801.599 111.328 582.0 1263.0 348.487 497.632 348.451 \n","388 17.0 742.706 69.624 620.0 884.0 420.500 496.382 420.513 \n","389 60.0 758.033 77.309 601.0 947.0 259.000 499.300 258.990 \n","390 12.0 714.833 67.294 551.0 785.0 240.167 498.167 240.179 \n","391 23.0 695.043 67.356 611.0 846.0 49.891 503.022 49.882 \n","\n"," YM Major Minor Angle %Area Type \n"," \n","1 4.697 5.987 3.828 168.425 100 A \n","2 8.687 16.559 9.688 175.471 100 A \n","3 7.665 7.359 NaN 101.121 100 A \n","4 8.811 15.136 5.720 168.133 100 A \n","5 7.508 NaN 3.088 NaN 100 A \n",".. ... ... ... ... ... ... \n","387 497.675 17.773 10.889 11.829 100 A \n","388 NaN NaN 3.663 49.457 100 A \n","389 499.289 9.476 8.062 90.000 100 A \n","390 498.148 4.606 3.317 168.690 100 A \n","391 502.979 6.454 4.537 73.243 100 A \n","\n","[391 rows x 14 columns]"]},"execution_count":21,"metadata":{},"output_type":"execute_result"}],"source":["dataframe = pd.read_csv('https://github.com/vmcf-konfmi/MB100T01/raw/main/data/Results.csv', index_col=0, delimiter=';')\n","dataframe"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7IPdbT2NsA20"},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"id":"fH1zusN7GKCx"},"source":["**Watermark**"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":3,"status":"ok","timestamp":1692082096620,"user":{"displayName":"Martin Schätz","userId":"14609383414092679868"},"user_tz":-120},"id":"iH1jL0baGMJW","outputId":"8639a749-8bab-4fd8-d6be-29230edccdea"},"outputs":[{"name":"stdout","output_type":"stream","text":["Last updated: 2023-08-24T14:25:40.097708+02:00\n","\n","Python implementation: CPython\n","Python version : 3.9.17\n","IPython version : 8.14.0\n","\n","Compiler : MSC v.1929 64 bit (AMD64)\n","OS : Windows\n","Release : 10\n","Machine : AMD64\n","Processor : Intel64 Family 6 Model 165 Stepping 2, GenuineIntel\n","CPU cores : 16\n","Architecture: 64bit\n","\n","watermark : 2.4.3\n","numpy : 1.23.5\n","pandas : 2.0.3\n","seaborn : 0.12.2\n","pivottablejs: 0.9.0\n","\n"]}],"source":["from watermark import watermark\n","watermark(iversions=True, globals_=globals())\n","print(watermark())\n","print(watermark(packages=\"watermark,numpy,pandas,seaborn,pivottablejs\"))"]}],"metadata":{"colab":{"provenance":[],"toc_visible":true},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.9.17"}},"nbformat":4,"nbformat_minor":0}