From 1f779210625f9a6b24e30408be573333228baf0d Mon Sep 17 00:00:00 2001 From: Skipper Seabold Date: Wed, 1 Nov 2017 06:25:28 -0500 Subject: [PATCH] Exercise updates --- 1 - Reading Data.ipynb | 167 ++++-- 2 - Data Wrangling with Pandas.ipynb | 456 ++++++++++++++-- 3 - Exploratory Analysis with Pandas.ipynb | 350 +++++++++++- 4 - Exploratory Plotting.ipynb | 93 +++- 5 - Modeling with scikit-learn.ipynb | 607 +++++++++++++++++---- load_data.py | 34 ++ solutions/clustering_comments.py | 49 ++ solutions/frequent_word.py | 22 + solutions/month_visits.py | 34 ++ solutions/numpy_reshape.py | 4 + solutions/object_hook_json.py | 23 + solutions/pandas_read_csv.py | 39 ++ solutions/plot_clusters.py | 10 + solutions/random_poisson.py | 6 + solutions/read_json.py | 7 + solutions/read_json_twice.py | 11 + solutions/read_url_json.py | 5 + solutions/top_words_loadings.py | 14 + solutions/violation_distribution.py | 38 ++ 19 files changed, 1766 insertions(+), 203 deletions(-) create mode 100644 load_data.py create mode 100644 solutions/clustering_comments.py create mode 100644 solutions/frequent_word.py create mode 100644 solutions/month_visits.py create mode 100644 solutions/numpy_reshape.py create mode 100644 solutions/object_hook_json.py create mode 100644 solutions/pandas_read_csv.py create mode 100644 solutions/plot_clusters.py create mode 100644 solutions/random_poisson.py create mode 100644 solutions/read_json.py create mode 100644 solutions/read_json_twice.py create mode 100644 solutions/read_url_json.py create mode 100644 solutions/top_words_loadings.py create mode 100644 solutions/violation_distribution.py diff --git a/1 - Reading Data.ipynb b/1 - Reading Data.ipynb index dbfab27..85b1b8e 100644 --- a/1 - Reading Data.ipynb +++ b/1 - Reading Data.ipynb @@ -34,14 +34,14 @@ "metadata": {}, "outputs": [], "source": [ - "csv_file = open(\"data/health_inspection_sample.csv\")" + "csv_file = open(\"data/health_inspection_chi_sample.csv\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "File objects are lazy **iterators**. Lazy means that they only do things, in this case read data, when you ask them to. You can call **next** on iterator objects to explicitly get the next item." + "File objects are lazy **iterators** (here, *stream objects*). Lazy means that they only do things, in this case read data, when you ask them to. You can call **next** on iterator objects to explicitly get the next item." ] }, { @@ -119,7 +119,7 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"data/health_inspection_sample.csv\") as csv_file:\n", + "with open(\"data/health_inspection_chi_sample.csv\") as csv_file:\n", " for line in csv_file:\n", " pass" ] @@ -131,6 +131,33 @@ "By using the `open` function as a context manager, we get an automatic call to close the open file when we exit the context (determined by the indentation level). When working with files non-interactively, you'll almost always want to use open as a context manager." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Write some code that iterates through the file `data/health_inspection_chi_sample.json` twice. Only call `open` once, however, then close the file. Can you find out, programatically, how many characters are in the file?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/read_json_twice.py" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -168,7 +195,7 @@ "metadata": {}, "outputs": [], "source": [ - "csv_file = open(\"data/health_inspection_sample.csv\")\n", + "csv_file = open(\"data/health_inspection_chi_sample.csv\")\n", "\n", "reader = csv.reader(csv_file)" ] @@ -222,7 +249,7 @@ "source": [ "The biggest difference in using `csv.reader` vs. iterating through the file is that it automatically splits the csv on commas and returns the line of the file split into a list.\n", "\n", - "You can control this behavior through a `Dialect` object. By default, `csv.reader` uses a Dialect object called \"excel.\" " + "You can control this behavior through a `Dialect` object. By default, `csv.reader` uses a Dialect object called \"excel.\" Here let's look at the attributes of the excel dialect. Don't worry too much about the code used to achieve this. We'll look more at this later." ] }, { @@ -292,7 +319,7 @@ "metadata": {}, "outputs": [], "source": [ - "file_name = \"data/health_inspection_sample.csv\"\n", + "file_name = \"data/health_inspection_chi_sample.csv\"\n", "\n", "with open(file_name) as csv_file:\n", " \n", @@ -365,7 +392,7 @@ "source": [ "The final thing to note in the block above is the use of `print` to provide some information about what went wrong. Logging is another really good habit to get into, and print statements are the dead simplest way to log the behavior of your code.\n", "\n", - "In practice, you probably don't want to use `print`. You want to use the logging module (TODO: link)." + "In practice, you probably don't want to use `print`. You want to use the [logging](https://docs.python.org/3/library/logging.html) module, but we're not going to talk about best practices in logging anymore today." ] }, { @@ -386,7 +413,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Each line in the file `data/health_inspection_sample.json` is a single json object that represents the same data above. " + "Each line in the file `data/health_inspection_chi_sample.json` is a single json object that represents the same data above. " ] }, { @@ -418,7 +445,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since each line is a json object here, we need to iterate over the file and parse each line. We use the `json.loads` function here for \"load string.\" The function `json.load` will take a file-like object." + "Since each line is a json object here, we need to iterate over the file and parse each line. We use the `json.loads` function here for \"load string.\" The similar function `json.load` takes a file-like object." ] }, { @@ -443,48 +470,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`json.loads` places each json object into a Python dictionary, helpfully filling in `None` for `null` for missing values and otherwise preserving types. It also, works recursively as we see in the `location` field.\n", - "\n", - "We can take further control over how the data is read in by using the `object_hook` argument. Say we wanted to remove the `location` field above. We don't need the `geoJSON` formatted information. We could do so with the `object_hook`." 
+ "`json.loads` places each json object into a Python dictionary, helpfully filling in `None` for `null` for missing values and otherwise preserving types. It also, works recursively as we see in the `location` field." ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "def remove_entry(record):\n", - " try:\n", - " del record['location']\n", - " # this is called recursively on objects so not all have it\n", - " except KeyError:\n", - " pass\n", - " \n", - " return record\n", - "\n", - "\n", - "def parse_json(record):\n", - " return json.loads(record, object_hook=remove_entry)" + "## Aside: List Comprehensions" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "with open(\"data/health_inspection_chi_sample.json\") as json_file:\n", - " dta = [parse_json(line) for line in json_file]\n", - " \n", - "pprint(dta[0])" + "Let's take a look at another Pythonic concept, introduced a bit above, called a **list comprehension**. This is what's called *syntactic sugar*. It's a concise way to create a list." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "You'll notice two things in the above code. First is the line within the context manager. This is another Pythonic concept called a **list comprehension**. This is what's called *syntactic sugar*. It's a concise way to createa list." + "[i for i in range(1, 6)]" ] }, { @@ -570,6 +579,33 @@ "{key: value for key, value in pairs}" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Returning to the code that we introduced above, we can take further control over how a file with json objects is read in by using the `object_hook` argument. Say we wanted to remove the `location` field above. We don't need the `geoJSON` formatted information. We could do so with the `object_hook`. Write a function called `remove_entry` that removes the `'location'` field from each record in the `'data/health_inspection_chi_sample.json'` file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/object_hook_json.py" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -590,7 +626,7 @@ "source": [ "#### Introducing Pandas\n", "\n", - "First, a few words of introduction for **pandas**. Pandas is a Python package providing fast, flexible, and expressive data structures designed to work with relational or labeled data both. It is a high-level tool for doing practical, real world data analysis in Python.\n", + "First, a few words of introduction for **pandas**. Pandas is a Python package providing fast, flexible, and expressive data structures designed to work with relational or labeled data. It is a high-level tool for doing practical, real world data analysis in Python.\n", "\n", "You reach for pandas when you have:\n", "\n", @@ -677,17 +713,31 @@ "The JSON counterpart to `read_csv` is `read_json`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Use `pd.read_json` to read in the Chicago health inspections json sample in the `data` folder." 
+ ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "pd.read_json(\n", - " \"data/health_inspection_chi_sample.json\", \n", - " orient=\"records\",\n", - " lines=True,\n", - ")" + "# Type your solution Here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/read_json.py" ] }, { @@ -703,7 +753,7 @@ "source": [ "So far, we've seen some ways that we can read data from disk. As Data Scientists, we often need to go out and grab data from the Internet.\n", "\n", - "Generally Python is \"batteries included\" and reading data from the Internet is no exception, but there are some *great* packages out there. [requests]() is one of them for making HTTP requests. Use it. (TODO: link)\n", + "Generally Python is \"batteries included\" and reading data from the Internet is no exception, but there are some *great* packages out there. [requests](http://docs.python-requests.org/en/master/) is one of them for making HTTP requests.\n", "\n", "Let's look at how we can use the [Chicago Data Portal](https://data.cityofchicago.org/) API to get this data in the first place. (I originally used San Francisco for this, but the data was just too clean to be terribly interesting.)" ] @@ -736,6 +786,13 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Requests returns a [Reponse](http://docs.python-requests.org/en/master/api/#requests.Response) object with many helpful methods and attributes." + ] + }, { "cell_type": "code", "execution_count": null, @@ -788,15 +845,31 @@ "Of course, pandas can also load data directly from a URL, but I encourage you to reach for `requests` as often as you need it." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Try passing the URL above to `pd.read_json`. What happens?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "url = ('https://data.cityofchicago.org/'\n", - " 'resource/cwig-ma7x.json?$limit=5')\n", - "pd.read_json(url, orient='records')" + "%load solutions/read_url_json.py" ] }, { @@ -858,7 +931,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Sometimes we need to be resourceful in order to get data. Knowing how to scrape the web can really come in handy. We're not going to go into details here, but you'll likely find libraries like [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/), [lxml](http://lxml.de/), and [mechanize](https://mechanize.readthedocs.io/en/latest/) to be helpful." + "Sometimes we need to be resourceful in order to get data. Knowing how to scrape the web can really come in handy. We're not going to go into details today, but you'll likely find libraries like [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/), [lxml](http://lxml.de/), and [mechanize](https://mechanize.readthedocs.io/en/latest/) to be helpful. There's also a `read_html` function in pandas that will quickly scrape HTML tables for you and put them into a DataFrame. 
" ] } ], diff --git a/2 - Data Wrangling with Pandas.ipynb b/2 - Data Wrangling with Pandas.ipynb index 4904ae8..e041171 100644 --- a/2 - Data Wrangling with Pandas.ipynb +++ b/2 - Data Wrangling with Pandas.ipynb @@ -85,7 +85,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For example, with this data set we have a natural unique identifier in the `inspection_id` column. We might wish to make this out index." + "For example, with this data set we have a natural unique identifier in the `inspection_id` column. We might wish to make this our index." ] }, { @@ -281,21 +281,182 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Reading Data and Dealing with Types" + "## Cleaning Data for Types" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We saw above that `csv` reads everything in as strings, `json` does some type conversion with facility for doing more, and `pandas` does a bit more type conversion (but it isn't always what we want. We want the zip codes to stay strings)." + "So far, we've explicitly made an index. We may next want to convert to the dates to datetime types. Here we'll use the **apply** function to apply a function to each row of a Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.inspection_date = dta.inspection_date.apply(pd.to_datetime)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's cast zip code from a float to a string. Some zip codes can start with 0 (not in Chicago), and we need to account for that." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "def float_to_zip(zip_code):\n", + " if np.isnan(zip_code):\n", + " return np.nan\n", + " \n", + " # 0 makes sure to left-pad with zero\n", + " # zip codes have 5 digits\n", + " # .0 means, we don't want anything after the decimal\n", + " # f is for float\n", + " zip_code = \"{:05.0f}\".format(zip_code)\n", + " return zip_code" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, we can use the `parse_dates` argument to read in the larger inspections data sample and tell pandas that one of our columns is a date column. We'll also go ahead and make `inspection_id` the index." + "Here we use Python's **string formatting** facilities to convert from a numeric type to a string. Some of the zip codes are empty strings in the file. Pandas uses numpy's `NaN` to indicate missingness, so we'll return it here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.zip = dta.zip.apply(float_to_zip)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataFrames have a `dtypes` attribute for checking the data types. Pandas relies on NumPy's dtypes objects. Here we see that the `object` dtype is used to hold strings. This for technical reasons." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.dtypes[['inspection_date', 'zip']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also convert variables' types, using `astype`. Here, we'll explicitly cast to pandas Categorical type, which is the only non-native numpy type." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.results = dta.results.astype('category')\n", + "dta.risk = dta.risk.astype('category')\n", + "dta.inspection_type = dta.inspection_type.astype('category')\n", + "dta.facility_type = dta.facility_type.astype('category')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we only select the categorical types, we can see some categorical variables descriptions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the `select_dtypes` method to pull out a DataFrame with only the asked for types." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.select_dtypes(['category'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we might want to exclude a column like `location` since we have the separate `latitude` and `longitude` columns. We can delete columns in a DataFrame using Python's built-in `del` statement." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "del dta['location']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dealing with Types using csv Reader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can do everything that we did above by providing options to `pd.read_csv`.\n", + "\n", + "We saw before that `csv` reads everything in as strings, `json` does some type conversion with facility for doing more, and `pandas` does a bit more type conversion but it isn't always what we want. For example, we want the zip codes to stay strings." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at how to do with pandas `read_csv`. First, we can use the `parse_dates` argument to read in the larger inspections data sample and tell pandas that one of our columns is a date column. We'll also go ahead and make `inspection_id` the index." ] }, { @@ -315,7 +476,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And let's cast zip code from a float to a string." + "Next, we want to turn the zip codes into strings. Here, we need to assume that the input (from the file) is a string as opposed to the above." ] }, { @@ -326,6 +487,7 @@ "source": [ "import numpy as np\n", "\n", + "\n", "def float_to_zip(zip_code):\n", " # convert from the string in the file to a float\n", " try:\n", @@ -341,13 +503,6 @@ " return zip_code" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we use Python's **string formatting** facilities to convert from a numeric type to a string. Some of the zip codes are empty strings. Pandas uses numpy's `NaN` to indicate missingness, so we'll return it here." - ] - }, { "cell_type": "code", "execution_count": null, @@ -366,6 +521,13 @@ "float_to_zip('123456')" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As another example of defensive programming, we have to make sure that empty strings are handled." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -390,8 +552,6 @@ "source": [ "dta = pd.read_csv(\n", " \"data/health_inspection_chi.csv\",\n", - " index_col='inspection_id',\n", - " parse_dates=['inspection_date'],\n", " converters={\n", " 'zip': float_to_zip\n", " },\n", @@ -411,7 +571,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we might want to exclude a column like `location` since we have the separate `latitude` and `longitude` columns. We can take advantage of the fact that the `usecols` argument accepts a function to exclude `location`." + "To exclude location, we can take advantage of the fact that the `usecols` argument accepts a function to exclude `location`." ] }, { @@ -422,11 +582,6 @@ "source": [ "dta = pd.read_csv(\n", " \"data/health_inspection_chi.csv\",\n", - " index_col='inspection_id',\n", - " parse_dates=['inspection_date'],\n", - " converters={\n", - " 'zip': float_to_zip\n", - " },\n", " usecols=lambda col: col != 'location'\n", ")" ] @@ -435,14 +590,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we are using a **lambda function** that returns `False` for the location parameter. Lambda functions are what are known as anonymous functions, because they don't have a name. This kind of thing is precisely their intended use." + "Here we are using a **lambda function** that returns `False` for the location parameter. Lambda functions are what are known as anonymous functions, because they don't have a name. This kind of thing is precisely their intended use.\n", + "\n", + "Here we use a function `lambda x: x` to map the identity function over a list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list(map(lambda x: x, [1, 2, 3]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Of course, you don't have to let `read_csv` have all the fun. You can do all of this on-the-fly with the DataFrames themselves." + "Finally, in a few cases we may want to take advantage of the pandas native `categorical` type. We can use the `dtype` argument for this, passing a dictionary of type mappings." ] }, { @@ -451,14 +617,33 @@ "metadata": {}, "outputs": [], "source": [ - "dta = pd.read_csv(\"data/health_inspection_chi.csv\")" + "dta = pd.read_csv(\n", + " \"data/health_inspection_chi.csv\",\n", + " dtype={\n", + " 'results': 'category',\n", + " 'risk': 'category',\n", + " 'inspection_type': 'category',\n", + " 'facility_type': 'category'\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.risk.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can set the index. Note the use of `inplace`." + "## Exercise\n", + "\n", + "Put all of the above `read_csv` options together in a single call to `read_csv`." ] }, { @@ -467,14 +652,37 @@ "metadata": {}, "outputs": [], "source": [ - "dta.set_index(\"inspection_id\", inplace=True)" + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/pandas_read_csv.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## String Cleaning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, let's start to dig into the data a little bit more. One of the things we're going to be really interested in exploring is the free text of the violations field." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Convert to datetime types. Here we'll use the **apply** function to apply a function to each row of a Series." + "The first thing to notice is that the violations field has null values in it." ] }, { @@ -483,14 +691,14 @@ "metadata": {}, "outputs": [], "source": [ - "dta.inspection_date = dta.inspection_date.apply(pd.to_datetime)" + "dta.info()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "And finally, convert the zip code data." + "We may want to ask ourselves if these values are missing at random or if there is some reason there's no written violation field." ] }, { @@ -499,7 +707,14 @@ "metadata": {}, "outputs": [], "source": [ - "dta.zip = dta.zip.apply(float_to_zip)" + "dta.loc[dta.violations.isnull()].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It looks like we're ok. The next thing to notice is that the violation field actually has a lot of violations in the same field for the same visit." ] }, { @@ -508,14 +723,28 @@ "metadata": {}, "outputs": [], "source": [ - "dta.head()" + "with pd.option_context(\"display.max_colwidth\", 500):\n", + " print(dta.violations.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "DataFrames have a `dtypes` attribute for checking the data types. Pandas relies on NumPy's dtypes objects. Here we see that the `object` dtype is used to hold strings. This for technical reasons." + "Let's split these out to make a longer DataFrame where each violation is a single row. Pandas provides a nice way to munge string data through the `str` accessor on string columns.\n", + "\n", + "```python\n", + "dta.violations.str.\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Let's see how many violations we have per visit. What does the distribution of visits look like? Explore the methods on the `str` accessor and, perhaps, the `quantile` method." ] }, { @@ -524,14 +753,23 @@ "metadata": {}, "outputs": [], "source": [ - "dta.dtypes[['inspection_date', 'zip']]" + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/violation_distribution.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In a few cases, we may want to take advantage of the pandas native `categorical` type. We can convert these variables, using `astype`." + "Ok, we have a manageable number of violations. Let's split the violations and then turn them into a long DataFrame with a single row for each violation within each visit." ] }, { @@ -540,7 +778,15 @@ "metadata": {}, "outputs": [], "source": [ - "dta.info()" + "violations = dta.violations.str.split(\"\\|\", expand=True)\n", + "violations.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When we `unstack` the DataFrame, we're left with what's called a `MultiIndex`. This index has two *levels* now. One is the original `inspection_id`. The other is the, rather meaningless, column names." 
] }, { @@ -549,24 +795,30 @@ "metadata": {}, "outputs": [], "source": [ - "dta.results = dta.results.astype('category')\n", - "dta.risk = dta.risk.astype('category')\n", - "dta.inspection_type = dta.inspection_type.astype('category')\n", - "dta.facility_type = dta.facility_type.astype('category')" + "violations.unstack().head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we only select the categorical types, we can see some categorical variables descriptions." + "Let's get rid of the empty rows first." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violations = violations.unstack().dropna()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the `select_dtypes` method to pull out a DataFrame with only the asked for types." + "Now we can drop the column name level, which we don't need." ] }, { @@ -575,14 +827,23 @@ "metadata": {}, "outputs": [], "source": [ - "dta.select_dtypes(['category'])" + "violations.reset_index(level=0, drop=True, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violations.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Finally, we can delete columns in a DataFrame using Python's built-in `del` statement." + "One last cleaning step may be helpful here. When we split on the pipe ('`|`'), we likely kept some surrounding whitespace. We can remove that." ] }, { @@ -591,7 +852,116 @@ "metadata": {}, "outputs": [], "source": [ - "del dta['location']" + "violations.str.startswith(\" \").any()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violations.str.strip().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violations = violations.str.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(violations.str.startswith(\" \").any()) | (violations.str.endswith(\" \").any())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Later, we'll see how to combine these violations back with our original data to do some analysis." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Working with Dates and Categoricals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Above, we used the `str` accessor on the DataFrame. This isn't the only convenient accessor that pandas provides. There is also the `dt` accessor for datetime types and the `cat` accessor for categorical types." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "dta.inspection_date.dt.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.inspection_date.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.inspection_date.dt.month.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's take a look at the categorical types." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```python\n", + "dta.risk.cat.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.risk.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.risk.cat.codes.head()" ] } ], diff --git a/3 - Exploratory Analysis with Pandas.ipynb b/3 - Exploratory Analysis with Pandas.ipynb index 0618fae..6eeb51b 100644 --- a/3 - Exploratory Analysis with Pandas.ipynb +++ b/3 - Exploratory Analysis with Pandas.ipynb @@ -6,7 +6,16 @@ "source": [ "## Exploratory Analysis\n", "\n", - "That's it for some preliminary cleaning. Don't worry, there will be more. Let's start to look in a bit more detail at the data, though." + "That's it for some preliminary cleaning. Don't worry, there will be more. Let's start to look in a bit more detail at the data, though. In this section, we're going to start to write some code that's typical for day-to-day data cleaning tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from load_data import dta" ] }, { @@ -29,7 +38,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "And `describe` goes in to a bit more detail for the *numeric* types, of which we don't have many here." + "And `describe` goes into a bit more detail for the *numeric* types, of which we don't have many here." ] }, { @@ -61,21 +70,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### GroupBy" + "### GroupBy" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now, let's ask the most obvious question. Which are the best and the worst restaurants? We'll want to use pandas `GroupBy` functionality to implement the `split-apply-combine` pattern." + "Now, let's ask the most obvious question. Which are the best and the worst restaurants? We'll want to use pandas `GroupBy` functionality to implement the `split-apply-combine` pattern.\n", + "\n", + "The idea here is that we **split** the data by some key or set of keys then **apply** a function to each group and then **combine** the outputs back into a single DataFrame." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "First, how many result categories are there?" + "First, let's see how many result categories there are. We can use `value_counts` to answer this question. " ] }, { @@ -92,7 +103,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Ok, let's group on the inspection results and see who the best and worst are.\n", + "Ok, let's group on the inspection `results` column and see who the best and worst are.\n", "\n", "When we call the `groupby` method we get back a `DataFrameGroupBy` object." ] @@ -119,7 +130,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can access the variables on this object, the same as a DataFrame, and any code called will execute within the groups. Here we use `value_counts` to count the number of observed for each level of the categorical." + "You can access the variables on this object, the same as a DataFrame, and any code called will execute within the groups." 
] }, { @@ -160,7 +171,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can index on the first element in a `MultiIndex` using square brackets and then use `sort_values` to find the " + "We can index on the first element in a `MultiIndex` using square brackets and then use `sort_values` to find those restaurants that had a result of Fail the most." ] }, { @@ -512,7 +523,328 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Often, I'll try to place a method chain to get the data ready for more exploratory work at the top of a notebook, so I can proceed with any analyses." + "Often, I'll try to place a method chain to get the data ready for more exploratory work at the top of a notebook, so I can proceed with any analyses. Let's fold back in" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's go back and add in the unstacked violations. Recall that we unstack the violations as follows. There are two new things to note here though. We add in a `to_frame` method to turn the unstacked Series into a DataFrame, and we `rename` the unnamed column in the resulting DataFrame back to `violations`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(dta.violations\n", + " .str.split(\"|\", expand=True)\n", + " .unstack()\n", + " .dropna()\n", + " .reset_index(level=0, drop=True)\n", + " .str.strip()\n", + " .to_frame()\n", + " .rename(columns={0: 'violations'}))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we need to drop the violations from the original DataFrame, then we need to merge it with the unstacked violations Series that we created before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.drop([\"violations\"], axis='columns').head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we drop the original violations from the DataFrame, unstack them as before, turn this result into a DataFrame for joining, rename the unnamed column to `violations` and perform a **right join**.\n", + "\n", + "We use **join** here rather than merge. Join uses merge under the hood but conveniently allows us to join on the indices of the two DataFrames by default. One other difference is that join uses an inner merge by default, but that's not what we want here. Since we drop the null violations on the right-hand side DataFrame, we want to do a right join. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta = dta.drop([\"violations\"], axis='columns').join(\n", + " dta.violations.str.split(\"|\", expand=True)\n", + " .unstack()\n", + " .dropna()\n", + " .str.strip()\n", + " .reset_index(level=0, drop=True)\n", + " .to_frame()\n", + " .rename(columns={0: 'violations'}),\n", + " how='right'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a relatively clean DataFrame, let's ask a few more questions. \n", + "\n", + "First, how many unique violations do we have?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.violations.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.violations.unique().shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Is this true? Do we really think there are this many violation numbers? Probably not. We can use the `str` accessor and some more munging to answer this. Here we pass a **regular expression** to `str.extract`. Extract expects a *capture group*, indicated by `()`. The regular expression `(\\d+\\)(?=\\.)` means capture 1 or more (`+`) digits (`\\d`) that is followed by (`(?=)`) a period `\\.`. We escape the period because a plain `.` is a wildcard for any character." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(dta.violations\n", + " .str.extract(\"(\\d+)(?=\\.)\", expand=False)\n", + " .astype(int))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.sort(\n", + " dta.violations\n", + " .str.extract(\"(\\d+)(?=\\.)\", expand=False)\n", + " .astype(int)\n", + " .unique()\n", + ").shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Second, can we figure out how many times previous did an establishment fail an inspection (within the sample we have)?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "visits = dta.drop_duplicates([\"address\", \"dba_name\", \"inspection_date\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to ensure that the inspection dates are sorted within each group. GroupBy will preserve this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "visits = visits.sort_values([\"address\", \"dba_name\", \"inspection_date\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grouper = visits.groupby((visits.address, visits.dba_name))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ok, we might ask, \"now what?\" Remember the trick to pull out groups?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_key = list(grouper.groups.keys())[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group_key" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group = grouper.get_group(group_key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group[['inspection_date', 'results']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group.merge()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since, we need this to be backwards looking, we have to **shift** the data by one visit. 
Shifting will move the data around by either a number of periods or a frequency. In this case, we use a number of periods and shift forward by 1 period." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "group.shift(1)[['inspection_date', 'results']]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we take the cumulative sum of this, we'll have an accurate picture of previous failures." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(group.shift(1).results == 'Fail').cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "visit_num = grouper.apply(lambda df: (df.shift(1).results == 'Fail').cumsum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "visit_num.head(n=15)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "(visit_num.reset_index(level=[0, 1], drop=True)\n", + " .to_frame()\n", + " .rename(\n", + " columns={'inspection_date': 'num_fails'}\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta.join((visit_num.reset_index(level=[0, 1], drop=True)\n", + " .to_frame()\n", + " .rename(\n", + " columns={'inspection_date': 'visit_number'}\n", + ")))" ] } ], diff --git a/4 - Exploratory Plotting.ipynb b/4 - Exploratory Plotting.ipynb index d3fe9e6..99f852e 100644 --- a/4 - Exploratory Plotting.ipynb +++ b/4 - Exploratory Plotting.ipynb @@ -36,13 +36,20 @@ "%matplotlib inline" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is canonical import for matplotlib." + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plot" + "import matplotlib.pyplot as plt" ] }, { @@ -69,7 +76,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "I dont *really* recommend using the interactive interface in practice or for anything serious. Alternatively, you can use the object-oriented interface. This usually entails a call to **plt.subplots**.\n", + "I don't *really* recommend using the interactive interface in practice or for anything serious. Alternatively, you can use the object-oriented interface. This usually entails a call to **plt.subplots**.\n", "\n", "This call returns instances of two objects -- a Figure object and an Axes object. Understanding the [anatomy of a matplotlib plot](http://matplotlib.org/faq/usage_faq.html#parts-of-a-figure) and these two underlying concepts will help unlock the power of **matplotlib**." 
] @@ -374,7 +381,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Plotting on Maps" + "## Health Inspection Data" ] }, { @@ -390,13 +397,85 @@ "metadata": {}, "outputs": [], "source": [ - "dta = pd.read_csv(\n", - " \"data/health_inspection_chi.csv\",\n", - " parse_dates=[\"inspection_date\"],\n", - " usecols=lambda x: x != \"location\",\n", + "from load_dta import dta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dta = dta.drop([\"violations\"], axis='columns').join(\n", + " dta.violations.str.split(\"|\", expand=True)\n", + " .unstack()\n", + " .dropna()\n", + " .str.strip()\n", + " .reset_index(level=0, drop=True)\n", + " .to_frame()\n", + " .rename(columns={0: 'violations'}),\n", + " how='right'\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One thing we may wonder is if the violations are fairly evenly distributed or if some are much more common than others." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "violation_num = dta.violations.str.extract(\"(\\d+)(?=\\.)\", expand=False).astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(14, 10))\n", + "\n", + "ax = violation_num.groupby(violation_num).size().plot.bar(ax=ax)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise:\n", + "\n", + "See if there are any meaningful differences in the number of violations by quarter. Recall that you can use the `dt` accessor for datetime functionality on a Series." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "quarter = (dta.inspection_date.dt.month - 1) // 3\n", + "\n", + "quarter_size = dta.groupby((quarter, violation_num)).size()\n", + "\n", + "axes = quarter_size.unstack(level=0).plot.bar(\n", + " figsize=(14, 8), \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting Geographic Data" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/5 - Modeling with scikit-learn.ipynb b/5 - Modeling with scikit-learn.ipynb index 68dd84f..b4094e2 100644 --- a/5 - Modeling with scikit-learn.ipynb +++ b/5 - Modeling with scikit-learn.ipynb @@ -208,6 +208,192 @@ "x.dot(y)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Array Creation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Both the `zeros` and `ones` functions can be useful for creating data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.zeros(5, dtype=float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.zeros(5, dtype=int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.zeros(5, dtype=complex)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.ones(5, dtype=float)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Two other handy functions to know about are `arange`, `linspace`, and `logspace`.\n", + "\n", + "`np.arange` creates an array of a range of integers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.arange(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`np.linspace` and `np.logspace` create linearly and logarithmically-spaced grids, respectively, with a fixed number of points." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.linspace(0, 1, num=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.logspace(-1, 3, num=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise\n", + "\n", + "Create an array with 1000 numbers, 0 to 999, using `arange`. Have a look at the `reshape` method on arrays to turn this into an array with 10 columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/numpy_reshape.py" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, it is often useful to create arrays with random numbers that follow a specific distribution. The np.random module contains a number of functions that can be used to this effect, for example this will produce an array of 5 random samples taken from a standard normal distribution (0 mean and variance 1) $ X \\sim N(0, 1) $:\n", + "\n", + "$$f(x \\mid \\mu = 0, \\sigma=1) = \\sqrt{\\frac{1}{2\\pi\\sigma^2}}\\exp {-\\frac{x^2}{2\\sigma^2} }$$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.randn(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "$X \\sim N(9, 3)$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "norm10 = np.random.normal(loc=9, scale=3, size=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate a NumPy array of 1000 random numbers sampled from a Poisson distribution, with parameter `lam=5`. What is the modal value in the sample? You maybe interested in using `np.bincounts`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load solutions/random_poisson.py" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -401,9 +587,9 @@ "source": [ "For a given scikit-learn estimator object named model, several methods are available. Irrespective of the type of estimator, there will be a fit method:\n", "\n", - "* model.fit : fit training data. For supervised learning applications, this accepts two arguments: the data X and the labels y (e.g. model.fit(X, y)). For unsupervised learning applications, this accepts only a single argument, the data X (e.g. model.fit(X)).\n", + "* model.fit : fit training data. For supervised learning applications, this accepts two arguments: the data X and the labels y (e.g. `model.fit(X, y)`). For unsupervised learning applications, this accepts only a single argument, the data X (e.g. 
`model.fit(X)`).\n", "\n", - "> During the fitting process, the state of the **estimator** is stored in attributes of the estimator instance named with a trailing underscore character (_). For example, the sequence of regression trees `sklearn.tree.DecisionTreeRegressor` is stored in `estimators_` attribute." + "> During the fitting process, the state of the **estimator** is stored in attributes of the estimator instance named with a trailing underscore character (`_`). For example, the sequence of regression trees `sklearn.tree.DecisionTreeRegressor` is stored in `estimators_` attribute." ] }, { @@ -456,49 +642,34 @@ }, "outputs": [], "source": [ - "def float_to_zip(zip_code):\n", - " # convert from the string in the file to a float\n", - " try:\n", - " zip_code = float(zip_code)\n", - " except ValueError: # some of them are empty\n", - " return np.nan\n", - " \n", - " # 0 makes sure to left-pad with zero\n", - " # zip codes have 5 digits\n", - " # .0 means, we don't want anything after the decimal\n", - " # f is for float\n", - " zip_code = \"{:05.0f}\".format(zip_code)\n", - " return zip_code" + "from load_data import dta" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "dta = pd.read_csv(\n", - " \"data/health_inspection_chi.csv\",\n", - " index_col='inspection_id',\n", - " parse_dates=['inspection_date'],\n", - " converters={\n", - " 'zip': float_to_zip\n", - " },\n", - " usecols=lambda col: col != 'location'\n", + "dta = dta.drop([\"violations\"], axis='columns').join(\n", + " dta.violations.str.split(\"|\", expand=True)\n", + " .unstack()\n", + " .dropna()\n", + " .str.strip()\n", + " .reset_index(level=0, drop=True)\n", + " .to_frame()\n", + " .rename(columns={0: 'violations'}),\n", + " how='right'\n", ")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "dta = dta.loc[~dta.violations.isnull()]" + "dta.info()" ] }, { @@ -663,18 +834,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We might ask, what is the most frequent word?" + "## Exercise\n", + "\n", + "What is the most frequent word in this vocabulary? Explore the `count_vectorizer` method to see if it offers anything helpful in uncovering this." ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ - "inverse_vocabulay = {v: k for k, v in count_vectorizer.vocabulary_.items()}" + "# Type your solution here" ] }, { @@ -685,14 +856,7 @@ }, "outputs": [], "source": [ - "inverse_vocabulay[count_matrix.sum(0).argmax()]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is unsurprising, since almost every violation contains the word comments." + "%load solutions/frequent_word.py" ] }, { @@ -712,8 +876,51 @@ "\n", "$$tf_{ij}=\\frac{w_{ij}}{\\sum_jw_{ij}}$$\n", "\n", - "You might go about computing this.\n", + "You might go about computing this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import normalize\n", + "\n", + "tf = normalize(count_matrix, norm='l1', axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using the `l1`-norm across axis 1 (the column) index, we now have frequencies within the document. 
Each document now sums to 1.\n", "\n", + "One thing to point out here is that summations over a `scipy.sparse` matrix returns a numpy `matrix`. This is mostly for historical reasons, and I *don't* recommend working with the `matrix` data structure if you can avoid it. You can turn this into an array by accessing the matrix's `A` attribute." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.sum(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf.sum(1).A" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "Another important concept is that of inverse document frequency. This is a measure of how important a word is. Words like stop words or words that are otherwise popular in a corpus will still have a high term frequency. Inverse document frequency is a way to downweight the frequent terms but upweight the rare ones. The inverse document frequency is\n", "\n", "$$idf = \\log\\left(\\frac{N_{\\text{documents}}}{N_{\\text{documents with term}}}\\right)$$\n", @@ -816,7 +1023,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "TODO: introduce truncated SVD and why it's useful. Point out the transformer." + "Let's take a look at another kind of transformer in scikit-learn, one that provides dimensionality reduction. Here we'll use Truncated SVD on the tf-idf matrix. Formally, this is known as Latent Semantic Analysis (LSA), because it transforms the documents to a low-dimensional \"semantic\" space. Formally, truncated SVD is a lot like Principle Components Analysis (PCA), except that the decomposition is on the documents rather than the covariance matrix. \n", + "\n", + "\n", + "Mathematically, truncated SVD applied to training samples X produces a low-rank approximation $X_k$:\n", + "\n", + "$$X \\approx X_k = U_k \\Sigma_k V_k^\\top$$\n", + "\n", + "After this operation, $U_k \\Sigma_k^\\top$ is the transformed training set with k features (called `n_components` in the API).\n", + "\n", + "To also transform a test set $X$, we multiply it with $V_k$:\n", + "\n", + "$$X' = X V_k$$\n", + "\n", + "If we were to center the matrix $X$ then TruncatedSVD would be equivalent to PCA. Not doing so allows us to continue to work with sparse matrices as documents almost always produce." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TruncatedSVD` is available under the `decomposition` namespace." ] }, { @@ -834,7 +1061,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Mention random_state" + "Here we'll fit the `TruncatedSVD` transformer using 10 components. This is an arbitrary choice. In practice, you may want to tune the number of components using whatever metric is appropriate for your task. Note that we use `random_state` here to make sure that our results are repeatable. Any of the algorithms in scikit-learn that are non-deterministic will provide a `random_state` keyword. It is really important that you use it to ensure **repeatable** results." ] }, { @@ -857,7 +1084,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Project X" + "We'll use `fit_transform` to perform the singular value decomposition (up to the first $k$ components) and to project the original matrix into the reduced space." 
] }, { @@ -871,6 +1098,24 @@ "X_reduced = svd.fit_transform(X)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_reduced" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_reduced.shape" + ] + }, { "cell_type": "code", "execution_count": null, @@ -897,7 +1142,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's look at the top words in each dimension." + "## Exercise\n", + "\n", + "Write a loop that prints the top ~6 words for each component according to the the magnitude of their loadings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Type your solution here" ] }, { @@ -908,11 +1164,7 @@ }, "outputs": [], "source": [ - "for i in range(n_components):\n", - " idx = svd.components_[i].argsort()[::-1][:6]\n", - " \n", - " top_k = words[idx]\n", - " print(\"{i}: {words}\".format(i=i, words=top_k))" + "%load solutions/top_words_loadings.py" ] }, { @@ -926,7 +1178,22 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Normalize so that k-Means works" + "Now that we've done some dimensionality reduction, we may be interested in clustering the documents in this reduced space. Scikit-Learn has a number of clustering algorithms. Here we'll use K-means.\n", + "\n", + "The K-means algorithm clusters data by separating it into groups of equal variance, choosing them in order to minimize the within-cluster sum of squares. Formally, we divide $n$ samples into $k$ clusters $C$. Each cluster is defined by its mean, or centroid, $u_i$.\n", + "\n", + "$$\\sum_{i=0}^n\\underset{u_j\\in C}\\min (\\|x_j - u_i\\|^2)$$\n", + "\n", + "K-means proceeds as follows:\n", + "\n", + "1. We pick $k$ random points from the dataset and call them the cluster centroids\n", + "2. We assign each data point to its closest centroid.\n", + "3. We recompute the centroids.\n", + "4. The distance between the old and new centroids are computed until they stop moving.\n", + "\n", + "Eventually k-means will converge. However, there is no guarantee that it will converge to a global optimum. One thing we can do to mitigate this is to pick better starting points than $k$ random points in the data. Scikit-learn uses a better choice by default through the `init='k-means++'` argument, which attempts to pick starting centroids that are generally 'far' from each other.\n", + "\n", + "First, let's normalize the data row-wise. If we do this, for documents the euclidean distance above becomes cosine similarity. Now we're performing spherical k-means so that all of our comparisons between documents are equal and indpendent of the size of the document." ] }, { @@ -964,6 +1231,34 @@ "np.linalg.norm(X_norm, axis=1)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we have some guidance on the number of clusters, we might expect, so let's use it.\n", + "\n", + "Recall that we found 45 distinct violation numbers in the data." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_clusters = (dta.violations.str.extract(\"(\d+)(?=\.)\", expand=False)\n",
+    "                 .astype(int).unique().shape[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_clusters"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -974,13 +1269,22 @@
    "source": [
     "from sklearn.cluster import KMeans\n",
     "\n",
-    "n_clusters = 20\n",
-    "\n",
     "kmeans = KMeans(n_clusters=n_clusters, random_state=0)\n",
     "\n",
     "kmeans.fit(X_norm)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise\n",
+    "\n",
+    "Plot a histogram of the k-means cluster labels next to a histogram of the known violation numbers. They don't have to line up exactly; the clusters that k-means finds will generally differ from the extracted violation numbers.\n",
+    "\n",
+    "What are the first five violations in, say, the first three clusters?"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
@@ -989,111 +1293,220 @@
    },
    "outputs": [],
    "source": [
-    "fig, ax = plt.subplots()\n",
+    "# Type your solution here"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load solutions/plot_clusters.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Exercise\n",
     "\n",
-    "ax.hist(kmeans.labels_, bins=n_clusters);"
+    "See if you can strip out the comments and still find (semi-)meaningful clusters. Here's a hint: you'll again want to use regular expressions and the `str` accessor in pandas."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here, we use a *lookbehind* (`(?<=...)`) to capture, via `()`, one or more (`+`) of any character (`.`) that follows the string \"Comments:\"."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "dta.violations[kmeans.labels_ == 0].iloc[0]"
+    "import re\n",
+    "\n",
+    "result = re.search(\"(?<=Comments:)(.+)\", \"1. This is a violation. Comments: This was a really egregious violation.\")\n",
+    "\n",
+    "result"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.group()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "dta.violations[kmeans.labels_ == 0].iloc[1]"
+    "# Type your solution here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load solutions/clustering_comments.py"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Visualizing Clusters"
+    "## Modeling"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Can we predict a pass/fail rating from features in the data? First, let's start to build our modeling set by turning the data back into inspection-level data.\n",
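+    "\n",
+    "Before engineering features, it's worth a quick look at the target we're trying to predict. A rough sketch (the exact categories in `results` depend on the data); `dta` currently has one row per violation, so we drop duplicate inspections first to count each inspection once:\n",
+    "\n",
+    "```python\n",
+    "(dta.reset_index()\n",
+    "    .drop_duplicates([\"inspection_id\"])\n",
+    "    .results\n",
+    "    .value_counts(normalize=True))\n",
+    "```"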
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "import seaborn as sns\n",
-    "import matplotlib.pyplot as plt"
+    "dta.columns"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "palette = np.array(sns.color_palette(\"hls\", n_clusters))\n",
-    "\n",
-    "fig, ax = plt.subplots(figsize=(12, 8))\n",
+    "columns = [\n",
+    "    \"inspection_id\",\n",
+    "    \"inspection_date\", \n",
+    "    \"inspection_type\", \n",
+    "    \"facility_type\", \n",
+    "    \"results\", \n",
+    "    \"risk\",\n",
+    "    \"zip\"\n",
+    "]\n",
     "\n",
-    "ax.scatter(\n",
-    "    tsne.embedding_[:, 0],\n",
-    "    tsne.embedding_[:, 1],\n",
-    "    lw=0,\n",
-    "    s=40,\n",
-    "    c=palette[kmeans.labels_]\n",
+    "modeling_dta = (\n",
+    "    dta.reset_index()\n",
+    "       .drop_duplicates([\"inspection_id\"])\n",
+    "       .loc[:, columns]\n",
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's do some **feature engineering** to derive additional features that we think may be predictive of a passing or failing result."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We might use the number of violations per establishment during an inspection."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "words = pd.DataFrame(X.A, columns=sorted(tfidf_vect.vocabulary_.keys()))"
+    "number_of_violations = dta.groupby(dta.index).size()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The comments are free text."
+    "Maybe violation severity depends on the (average) weather, so let's use the month of the inspection as a rough proxy."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
-    "words.columns"
+    "modeling_dta[\"month\"] = modeling_dta.inspection_date.dt.month"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, let's add in the number of previously failed inspections. We'll ignore the fact that we may care more about the rate of failed inspections."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fail_num = (dta.groupby((dta.address, \n",
+    "                         dta.dba_name,\n",
+    "                         dta.inspection_date))\n",
+    "               .apply(lambda df: (df.shift(1).results == 'Fail').cumsum()))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's join the violations data back together and run truncated SVD."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "words.groupby(kmeans.labels_).get_group(0).mean()#nlargest(10)"
+    "violations = dta.violations.groupby(dta.index).apply(lambda df: \" \".join(df))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we vectorize the re-joined violations text with tf-idf and run `TruncatedSVD` again, just as we did above."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's join this all together to create a modeling dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we create dummy (one-hot) variables for the categorical features. Technically, this isn't quite the right thing to do, but it's a reasonable first pass. See the sketch below for one way it might look.\n",
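+    "\n",
+    "A sketch only: it assumes `modeling_dta` is the joined modeling table from the previous step, that the column names follow the earlier cells, and that treating a `Fail` result as the target is what we want.\n",
+    "\n",
+    "```python\n",
+    "categorical = [\"inspection_type\", \"facility_type\", \"risk\", \"zip\", \"month\"]\n",
+    "\n",
+    "X_model = pd.get_dummies(\n",
+    "    modeling_dta.drop([\"results\", \"inspection_id\", \"inspection_date\"],\n",
+    "                      axis=\"columns\"),\n",
+    "    columns=categorical,\n",
+    "    drop_first=True,  # drop one level per feature to avoid redundant columns\n",
+    ")\n",
+    "\n",
+    "y = (modeling_dta.results == \"Fail\").astype(int)\n",
+    "```"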
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Dealing with Bigger Data"
    ]
   }
  ],
diff --git a/load_data.py b/load_data.py
new file mode 100644
index 0000000..b57cfb3
--- /dev/null
+++ b/load_data.py
@@ -0,0 +1,34 @@
+import numpy as np
+import pandas as pd
+
+
+def float_to_zip(zip_code):
+    # convert from the string in the file to a float
+    try:
+        zip_code = float(zip_code)
+    except ValueError:  # some of them are empty
+        return np.nan
+
+    # 0 makes sure to left-pad with zero
+    # zip codes have 5 digits
+    # .0 means, we don't want anything after the decimal
+    # f is for float
+    zip_code = "{:05.0f}".format(zip_code)
+    return zip_code
+
+
+dta = pd.read_csv(
+    "data/health_inspection_chi.csv",
+    index_col='inspection_id',
+    parse_dates=['inspection_date'],
+    converters={
+        'zip': float_to_zip
+    },
+    usecols=lambda col: col != 'location',
+    dtype={
+        'results': 'category',
+        'risk': 'category',
+        'inspection_type': 'category',
+        'facility_type': 'category'
+    }
+)
diff --git a/solutions/clustering_comments.py b/solutions/clustering_comments.py
new file mode 100644
index 0000000..4184572
--- /dev/null
+++ b/solutions/clustering_comments.py
@@ -0,0 +1,49 @@
+# Let's be a little defensive and make sure there aren't any lower-case "comments" markers.
+
+assert not dta.violations.str.contains("comments").any()
+
+dta["comments"] = dta.violations.str.extract("(?<=Comments:)(.+)", expand=False)
+
+has_comments = dta.comments.notnull()
+
+# TfidfVectorizer, TruncatedSVD, KMeans, normalizer, and n_clusters come from the notebook namespace.
+
+tfidf_vect = TfidfVectorizer(
+    stop_words='english',
+    min_df=5,
+    max_df=.95,
+    token_pattern=r"(?u)\b[A-Za-z_][A-Za-z_]+\b"
+)
+
+X = tfidf_vect.fit_transform(dta.loc[has_comments].comments)
+
+n_components = 10
+
+svd = TruncatedSVD(
+    n_components=n_components,
+    random_state=0
+)
+
+X_reduced = svd.fit_transform(X)
+
+X_norm = normalizer.fit_transform(X_reduced)
+
+# re-fit k-means on the normalized, comments-only reduced matrix
+
+kmeans = KMeans(n_clusters=n_clusters, random_state=0)
+
+kmeans.fit(X_norm)
+
+# Let's look at the top 6 words again
+
+words = np.array(sorted(tfidf_vect.vocabulary_.keys()))
+
+for i in range(n_components):
+    idx = svd.components_[i].argsort()[::-1][:6]
+
+    top_k = words[idx]
+    print("{i}: {words}".format(i=i, words=top_k))
+
+dta.loc[has_comments].violations[kmeans.labels_ == 0].head()
+dta.loc[has_comments].violations[kmeans.labels_ == 1].head()
+dta.loc[has_comments].violations[kmeans.labels_ == 2].head()
diff --git a/solutions/frequent_word.py b/solutions/frequent_word.py
new file mode 100644
index 0000000..e4b40e7
--- /dev/null
+++ b/solutions/frequent_word.py
@@ -0,0 +1,22 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from load_data import dta
+
+
+dta = dta.drop(["violations"], axis='columns').join(
+    dta.violations.str.split("|", expand=True)
+       .unstack()
+       .dropna()
+       .str.strip()
+       .reset_index(level=0, drop=True)
+       .to_frame()
+       .rename(columns={0: 'violations'}),
+    how='right'
+)
+
+count_vectorizer = CountVectorizer(stop_words='english')
+count_vectorizer.fit(dta.violations)
+count_matrix = count_vectorizer.transform(dta.violations)
+
+
+top_idx = count_matrix.sum(0).argmax()
+count_vectorizer.get_feature_names()[top_idx]
diff --git a/solutions/month_visits.py b/solutions/month_visits.py
new file mode 100644
index 0000000..b57cfb3
--- /dev/null
+++ b/solutions/month_visits.py
@@ -0,0 +1,34 @@
+import numpy as np
+import pandas as pd
+
+
+def float_to_zip(zip_code):
+    # convert from the string in the file to a float
+    try:
+        zip_code = float(zip_code)
+    except ValueError:  # some of them are empty
+        return np.nan
+
+ # 0 makes sure to left-pad with zero + # zip codes have 5 digits + # .0 means, we don't want anything after the decimal + # f is for float + zip_code = "{:05.0f}".format(zip_code) + return zip_code + + +dta = pd.read_csv( + "data/health_inspection_chi.csv", + index_col='inspection_id', + parse_dates=['inspection_date'], + converters={ + 'zip': float_to_zip + }, + usecols=lambda col: col != 'location', + dtype={ + 'results': 'category', + 'risk': 'category', + 'inspection_type': 'category', + 'facility_type': 'category' + } +) diff --git a/solutions/numpy_reshape.py b/solutions/numpy_reshape.py new file mode 100644 index 0000000..440eb7c --- /dev/null +++ b/solutions/numpy_reshape.py @@ -0,0 +1,4 @@ +import numpy as np + + +np.arange(1000).reshape(-1, 10) diff --git a/solutions/object_hook_json.py b/solutions/object_hook_json.py new file mode 100644 index 0000000..ea36e58 --- /dev/null +++ b/solutions/object_hook_json.py @@ -0,0 +1,23 @@ +import json +from pprint import pprint + + +def remove_entry(record): + try: + del record['location'] + # this is called recursively on objects so not all have it + except KeyError: + pass + + return record + + +def parse_json(record): + return json.loads(record, object_hook=remove_entry) + + +with open("data/health_inspection_chi_sample.json") as json_file: + dta = [parse_json(line) for line in json_file] + + +pprint(dta[0]) diff --git a/solutions/pandas_read_csv.py b/solutions/pandas_read_csv.py new file mode 100644 index 0000000..e0df739 --- /dev/null +++ b/solutions/pandas_read_csv.py @@ -0,0 +1,39 @@ +import numpy as np +import pandas as pd + + +def float_to_zip(zip_code): + # convert from the string in the file to a float + try: + zip_code = float(zip_code) + except ValueError: # some of them are empty + return np.nan + + # 0 makes sure to left-pad with zero + # zip codes have 5 digits + # .0 means, we don't want anything after the decimal + # f is for float + zip_code = "{:05.0f}".format(zip_code) + return zip_code + + +dta = pd.read_csv( + "data/health_inspection_chi.csv", + index_col='inspection_id', + parse_dates=['inspection_date'], + converters={ + 'zip': float_to_zip + }, + usecols=lambda col: col != 'location', + dtype={ + 'results': 'category', + 'risk': 'category', + 'inspection_type': 'category', + 'facility_type': 'category' + } +) + + +assert float_to_zip('1234') +assert float_to_zip('123456') +assert np.isnan(float_to_zip('')) diff --git a/solutions/plot_clusters.py b/solutions/plot_clusters.py new file mode 100644 index 0000000..57e7947 --- /dev/null +++ b/solutions/plot_clusters.py @@ -0,0 +1,10 @@ +import matplotlib.pyplot as plt + +fig, axes = plt.subplots(nrows=2, figsize=(12, 10)) + +axes[0].hist(kmeans.labels_, bins=n_clusters) +axes[1].hist(dta.violations.str.extract("(\d+)(?=\.)", expand=False).astype(int), bins=n_clusters); + +dta.violations[kmeans.labels_ == 0].iloc[:5].values +dta.violations[kmeans.labels_ == 1].iloc[:5].values +dta.violations[kmeans.labels_ == 2].iloc[:5].values diff --git a/solutions/random_poisson.py b/solutions/random_poisson.py new file mode 100644 index 0000000..8f68158 --- /dev/null +++ b/solutions/random_poisson.py @@ -0,0 +1,6 @@ +import numpy as np + +y = np.random.poisson(lam=5, size=1000) +bins = np.bincount(y) + +print(bins.argmax()) diff --git a/solutions/read_json.py b/solutions/read_json.py new file mode 100644 index 0000000..a5d3999 --- /dev/null +++ b/solutions/read_json.py @@ -0,0 +1,7 @@ +import pandas as pd + +pd.read_json( + "data/health_inspection_chi_sample.json", + 
orient="records",
+    lines=True,
+)
diff --git a/solutions/read_json_twice.py b/solutions/read_json_twice.py
new file mode 100644
index 0000000..98242de
--- /dev/null
+++ b/solutions/read_json_twice.py
@@ -0,0 +1,14 @@
+json_file = open('data/health_inspection_chi_sample.json')
+
+for line in json_file:
+    pass
+
+# tell() now reports how far we've read, in bytes -- for this ASCII file, the number of characters
+print(json_file.tell())
+
+json_file.seek(0)
+
+for line in json_file:
+    pass
+
+json_file.close()
diff --git a/solutions/read_url_json.py b/solutions/read_url_json.py
new file mode 100644
index 0000000..385ad78
--- /dev/null
+++ b/solutions/read_url_json.py
@@ -0,0 +1,5 @@
+import pandas as pd
+
+url = ('https://data.cityofchicago.org/'
+       'resource/cwig-ma7x.json?$limit=5')
+pd.read_json(url, orient='records')
diff --git a/solutions/top_words_loadings.py b/solutions/top_words_loadings.py
new file mode 100644
index 0000000..0133738
--- /dev/null
+++ b/solutions/top_words_loadings.py
@@ -0,0 +1,14 @@
+import numpy as np
+
+words = np.array(sorted(tfidf_vect.vocabulary_.keys()))
+
+# the words are lexicographically ordered
+
+print(words[:15])
+
+
+for i in range(n_components):
+    idx = np.abs(svd.components_[i]).argsort()[::-1][:6]
+
+    top_k = words[idx]
+    print("{i}: {words}".format(i=i, words=top_k))
diff --git a/solutions/violation_distribution.py b/solutions/violation_distribution.py
new file mode 100644
index 0000000..2bae3a4
--- /dev/null
+++ b/solutions/violation_distribution.py
@@ -0,0 +1,38 @@
+import numpy as np
+import pandas as pd
+
+
+def float_to_zip(zip_code):
+    # convert from the string in the file to a float
+    try:
+        zip_code = float(zip_code)
+    except ValueError:  # some of them are empty
+        return np.nan
+
+    # 0 makes sure to left-pad with zero
+    # zip codes have 5 digits
+    # .0 means, we don't want anything after the decimal
+    # f is for float
+    zip_code = "{:05.0f}".format(zip_code)
+    return zip_code
+
+
+dta = pd.read_csv(
+    "data/health_inspection_chi.csv",
+    index_col='inspection_id',
+    parse_dates=['inspection_date'],
+    converters={
+        'zip': float_to_zip
+    },
+    usecols=lambda col: col != 'location',
+    dtype={
+        'results': 'category',
+        'risk': 'category',
+        'inspection_type': 'category',
+        'facility_type': 'category'
+    }
+)
+
+
+quantiles = [0, .05, .25, .50, .75, .95, 1.00]
+(dta.violations.str.count("\|") + 1).quantile(quantiles)