984 ecocounter pull recent outages #1014

Merged · 15 commits · Jul 31, 2024
57 changes: 26 additions & 31 deletions dags/ecocounter_pull.py
@@ -31,9 +31,8 @@
from dags.common_tasks import check_jan_1st, wait_for_weather_timesensor
from dags.custom_operators import SQLCheckOperatorWithReturnValue
from volumes.ecocounter.pull_data_from_api import (
getToken, getSites, getFlowData, siteIsKnownToUs, insertSite,
insertFlow, flowIsKnownToUs, truncateFlowSince, insertFlowCounts,
getKnownSites, getKnownFlows
getToken, getSites, siteIsKnownToUs, insertSite, insertFlow,
flowIsKnownToUs, getKnownSites, getKnownFlows, truncate_and_insert
)
except:
raise ImportError("Cannot import DAG helper functions.")
@@ -85,9 +84,8 @@ def check_partitions():
)

check_jan_1st.override(task_id="check_annual_partition")() >> create_annual_partition

@task(trigger_rule='none_failed')
def update_sites_and_flows(**context):

def get_connections():
api_conn = BaseHook.get_connection('ecocounter_api_key')
token = getToken(
api_conn.host,
@@ -96,7 +94,11 @@ def update_sites_and_flows(**context):
api_conn.extra_dejson['secret_api_hash']
)
eco_postgres = PostgresHook("ecocounter_bot")
return eco_postgres, token

@task(trigger_rule='none_failed')
def update_sites_and_flows(**context):
eco_postgres, token = get_connections()
new_sites, new_flows = [], []
with eco_postgres.get_conn() as conn:
for site in getSites(token):
@@ -140,38 +142,30 @@ def update_sites_and_flows(**context):

@task(trigger_rule='none_failed')
def pull_ecocounter(ds):
api_conn = BaseHook.get_connection('ecocounter_api_key')
token = getToken(
api_conn.host,
api_conn.login,
api_conn.password,
api_conn.extra_dejson['secret_api_hash']
)
eco_postgres = PostgresHook("ecocounter_bot")

eco_postgres, token = get_connections()
start_date = dateutil.parser.parse(str(ds))
end_date = dateutil.parser.parse(str(ds_add(ds, 1)))
LOGGER.info(f'Pulling data from {start_date} to {end_date}.')
with eco_postgres.get_conn() as conn:
for site_id in getKnownSites(conn):
LOGGER.debug(f'Starting on site {site_id}.')
for flow_id in getKnownFlows(conn, site_id):
LOGGER.debug(f'Starting on flow {flow_id} for site {site_id}.')
# empty the count table for this flow
truncateFlowSince(flow_id, conn, start_date, end_date)
# and fill it back up!
LOGGER.debug(f'Fetching data for flow {flow_id}.')
counts = getFlowData(token, flow_id, start_date, end_date)
#convert response into a tuple for inserting
volume=[]
for count in counts:
row=(flow_id, count['date'], count['counts'])
volume.append(row)
if len(volume) == 0:
LOGGER.info(f'{len(volume)} rows fetched for flow {flow_id} of site {site_id}.')
insertFlowCounts(conn, volume)
LOGGER.info(f'Data inserted for site {site_id}.')
truncate_and_insert(conn, token, flow_id, start_date, end_date)

@task(trigger_rule='none_failed')
def pull_recent_outages():
eco_postgres, token = get_connections()
#get list of outages
outage_query = "SELECT flow_id, start_time, end_time FROM ecocounter.identify_outages('60 days'::interval);"
with eco_postgres.get_conn() as conn, conn.cursor() as curr:
curr.execute(outage_query)
recent_outages = curr.fetchall()
#for each outage, try to pull data
with eco_postgres.get_conn() as conn:
for outage in recent_outages:
flow_id, start_date, end_date = outage
truncate_and_insert(conn, token, flow_id, start_date, end_date)

t_done = ExternalTaskMarker(
task_id="done",
external_dag_id="ecocounter_check",
@@ -219,6 +213,7 @@ def data_checks():
]

(
pull_recent_outages(),
check_partitions() >>
update_sites_and_flows() >>
pull_ecocounter() >>
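A note on the wiring above: placing `pull_recent_outages()` on its own line inside the final tuple makes it an independent task with no upstream dependencies, while the `>>` chain below it forms the main pipeline. A minimal sketch of the same pattern (hypothetical task names, not this DAG):

```python
from datetime import datetime
from airflow.decorators import dag, task

@dag(start_date=datetime(2024, 7, 1), schedule=None, catchup=False)
def wiring_demo():

    @task
    def independent():
        ...

    @task
    def first():
        ...

    @task
    def second():
        ...

    # A tuple of expressions: calling independent() creates a task with
    # no dependencies of its own, while first() >> second() forms a chain.
    (
        independent(),
        first() >> second(),
    )

wiring_demo()
```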
84 changes: 84 additions & 0 deletions volumes/ecocounter/functions/create-function-identify-outages.sql
@@ -0,0 +1,84 @@
CREATE OR REPLACE FUNCTION ecocounter.identify_outages(
num_days interval
)
RETURNS TABLE (
flow_id numeric,
start_time timestamp,
end_time timestamp
)
LANGUAGE plpgsql
COST 100
VOLATILE

AS $BODY$

BEGIN

RETURN QUERY
WITH ongoing_outages AS (
SELECT
f.flow_id,
f.site_id,
dates.dt::date,
dates.dt - lag(dates.dt) OVER w = interval '1 day' AS consecutive
FROM ecocounter.flows_unfiltered AS f
CROSS JOIN
generate_series(
now()::date - num_days,
now()::date - interval '2 day', --2 bc last interval will be this + 1 day
interval '1 day'
) AS dates (dt)
LEFT JOIN ecocounter.counts_unfiltered AS c
ON c.flow_id = f.flow_id
AND c.datetime_bin >= dates.dt
AND c.datetime_bin < dates.dt + interval '1 day'
--select counts partitions
AND c.datetime_bin >= now()::date - num_days
AND c.datetime_bin < now()::date - interval '1 day'
WHERE
f.validated
AND dates.dt < COALESCE(f.date_decommissioned, now()::date - interval '1 day')
GROUP BY
f.flow_id,
f.site_id,
f.validated,
f.last_active,
f.date_decommissioned,
dates.dt
HAVING SUM(c.volume) IS NULL
WINDOW w AS (PARTITION BY f.flow_id ORDER BY dates.dt)
ORDER BY
f.flow_id,
dates.dt
),

group_ids AS (
SELECT
oo.flow_id,
oo.dt,
SUM(CASE WHEN oo.consecutive IS TRUE THEN 0 ELSE 1 END) OVER w AS group_id
FROM ongoing_outages AS oo
WINDOW w AS (PARTITION BY oo.flow_id ORDER BY oo.dt)
)

SELECT
gi.flow_id,
MIN(gi.dt)::timestamp AS start_time,
MAX(gi.dt) + interval '1 day' AS end_time
FROM group_ids AS gi
GROUP BY
gi.flow_id,
gi.group_id;

END;
$BODY$;

ALTER FUNCTION ecocounter.identify_outages(interval) OWNER TO ecocounter_admins;
GRANT ALL ON FUNCTION ecocounter.identify_outages(interval) TO ecocounter_admins;

GRANT EXECUTE ON FUNCTION ecocounter.identify_outages(interval) TO bdit_humans;
GRANT EXECUTE ON FUNCTION ecocounter.identify_outages(interval) TO ecocounter_bot;

COMMENT ON FUNCTION ecocounter.identify_outages(interval)
IS 'A function to identify day level outages (null volume) in Ecocounter data and group
them into runs for ease of pulling. Used by Airflow ecocounter_pull.pull_recent_outages task.';
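The grouping here is the classic gaps-and-islands pattern: the windowed `SUM` increments `group_id` whenever a day is not consecutive with the previous outage day, so each unbroken run of missing days shares one id and collapses to a single `(start_time, end_time)` span. A minimal Python sketch of the same idea, on made-up dates (not code from this PR):

```python
from datetime import date, timedelta

# Made-up outage days for a single flow_id (what the ongoing_outages
# CTE produces): two runs, July 1-3 and July 10-11.
outage_days = [
    date(2024, 7, 1), date(2024, 7, 2), date(2024, 7, 3),
    date(2024, 7, 10), date(2024, 7, 11),
]

# Gaps-and-islands: start a new group whenever a day is not exactly
# one day after the previous one (mirrors the SUM(...) OVER w that
# builds group_id in the SQL above).
runs: dict[int, list[date]] = {}
group_id, prev = 0, None
for dt in sorted(outage_days):
    if prev is None or dt - prev != timedelta(days=1):
        group_id += 1
    runs.setdefault(group_id, []).append(dt)
    prev = dt

# Each run becomes (start_time, end_time), end-exclusive, matching
# MIN(dt) and MAX(dt) + interval '1 day' in the final SELECT.
spans = [(days[0], days[-1] + timedelta(days=1)) for days in runs.values()]
print(spans)
# [(date(2024, 7, 1), date(2024, 7, 4)), (date(2024, 7, 10), date(2024, 7, 12))]
```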
31 changes: 18 additions & 13 deletions volumes/ecocounter/pull_data_from_api.py
@@ -1,10 +1,13 @@
import requests
import logging
from configparser import ConfigParser
from psycopg2 import connect
from psycopg2.extras import execute_values
from datetime import datetime, timedelta
from airflow.exceptions import AirflowFailException

LOGGER = logging.getLogger(__name__)

default_start = datetime.now().replace(hour = 0, minute = 0, second = 0, microsecond = 0)-timedelta(days=1)
default_end = datetime.now().replace(hour = 0, minute = 0, second = 0, microsecond = 0)

@@ -148,6 +151,20 @@ def insertFlow(conn: any, flow_id: int, site_id: int, flow_name: str, bin_size:
with conn.cursor() as cur:
cur.execute(insert_query, (flow_id, site_id, flow_name, bin_size))

def truncate_and_insert(conn, token, flow_id, start_date, end_date):
LOGGER.info(f'Attempting to fetch data for flow {flow_id} from {start_date} to {end_date}.')
# empty the count table for this flow
truncateFlowSince(flow_id, conn, start_date, end_date)
# and fill it back up!
counts = getFlowData(token, flow_id, start_date, end_date)
#convert response into a tuple for inserting
volume=[]
for count in counts:
row=(flow_id, count['date'], count['counts'])
volume.append(row)
LOGGER.info(f'{len(volume)} rows fetched for flow {flow_id} from {start_date} to {end_date}.')
insertFlowCounts(conn, volume)

#for testing/pulling data without use of airflow.
def run_api(
start_date: datetime = default_start,
@@ -177,16 +194,4 @@ def run_api(
if not flowIsKnownToUs(flow_id, conn):
print('unknown flow', flow_id)
continue
# we do have this site and flow in the database; let's update its counts
print(f'starting on flow {flow_id}')
# empty the count table for this flow
truncateFlowSince(flow_id, conn, start_date, end_date)
# and fill it back up!
print(f'fetching data for flow {flow_id}')
counts = getFlowData(token, flow_id, start_date, end_date)
print(f'inserting data for flow {flow_id}')
volume=[]
for count in counts:
row=(flow_id, count['date'], count['counts'])
volume.append(row)
insertFlowCounts(conn, volume)
truncate_and_insert(conn, token, flow_id, start_date, end_date)
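For reference, a minimal standalone usage sketch of `truncate_and_insert`, assuming credentials are already in hand (the connection parameters and credential placeholders below are illustrative only; the DAG gets these from Airflow connections):

```python
from datetime import datetime
from psycopg2 import connect

# Placeholder credentials, not the repo's config.
api_host, api_user, api_password, secret_hash = (
    "<host>", "<user>", "<password>", "<secret-hash>"
)
conn = connect(host="<db-host>", dbname="bigdata", user="ecocounter_bot")
token = getToken(api_host, api_user, api_password, secret_hash)

# Re-pull a single flow for one day: the window [start_date, end_date)
# is truncated first, then refilled with whatever the API returns.
truncate_and_insert(
    conn, token,
    flow_id=101,
    start_date=datetime(2024, 7, 1),
    end_date=datetime(2024, 7, 2),
)
conn.commit()
```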
66 changes: 17 additions & 49 deletions volumes/ecocounter/readme.md
@@ -1,17 +1,11 @@
<!-- TOC -->

- [Bicycle loop detectors](#bicycle-loop-detectors)
- [Installation types](#installation-types)
- [Ecocounter data](#ecocounter-data)
- [Flows - what we know](#flows---what-we-know)
- [Discontinuities](#discontinuities)
- [Using the Ecocounter API](#using-the-ecocounter-api)
- [Note](#note)
- [Historical data](#historical-data)
- [`ecocounter_pull` DAG](#ecocounter_pull-dag)
- [`check_partitions` TaskGroup](#check_partitions-taskgroup)
- [`data_checks` TaskGroup](#data_checks-taskgroup)
- [`ecocounter_check` DAG](#ecocounter_check-dag)
- [Discontinuities](#discontinuities)
**Collaborator commented:** somehow formatted as code block

**Author replied:** Fixed! And added back a missing section of readme 👀

- [Using the Ecocounter API](#using-the-ecocounter-api)
- [Note](#note)
- [Historical data](#historical-data)
- [ecocounter_pull DAG](#ecocounter_pull-dag)
- [ecocounter_check DAG](#ecocounter_check-dag)
- [SQL Tables](#sql-tables)
- [Main Tables](#main-tables)
- [`ecocounter.sites_unfiltered`](#ecocountersites_unfiltered)
@@ -27,35 +21,6 @@

<!-- /TOC -->

# Bicycle loop detectors

This dataset comes from a small but growing number of permanent [loop detectors](https://en.wikipedia.org/wiki/Induction_loop) installed within designated bicycle infrastructure such as bike lanes and multi-use paths. This is actually one of our older data collection programs, and the data have been handled in a number of ways over the years and now reside in a couple different places in the `bigdata` database.

Ecocounter is the vendor that manages our current sensor installations. There is a web dashboard at https://www.eco-visio.net that should show all active installations.

## Installation types
There are two types of sensors, which can be easily distinguished. Single-sensor installations, as below, simply count the number of bikes that pass over the sensor. These are installed in one-way infrastructure such as a typical bike lane.

![a single ecocounter sensor installed in a one-way bike lane](./single-sensor.jpg)

Increasingly, however, newer installations use a double sensor that can also detect the direction of travel. In cases like the image below, this allows us to measure contra-flow travel within the bike lane.

![a double sensor installed in a one-way bike lane](./double-sensor.jpg)

Sometimes these paired sensors are themselves installed in pairs, giving four measured flows per site, two per lane.

![a pair of bidirectional sensors recently installed in a multi-use path](double-double-sensor.jpg)

## Ecocounter data

Data from these sensors is stored in the `ecocounter` schema in three **views**:

* `sites`
* `flows`
* `counts`

A **site** is a distinct location, sometimes referring to one and sometimes to two directions of travel on the same path or street. A site is recorded as a point geometry at the centroid of the sensor(s) it represents.

A **flow** (sometimes also referred to as a _channel_) is a direction of travel recorded at a site. A site may have 1, 2, or 4 flows depending on whether one or two sensors are installed and whether they record the two directions of travel separately.

Each flow has `counts` of bikes at regularly spaced intervals. All sensors now use 15-minute bins, though some were previously configured with 30-minute or one-hour bins. The bin size is indicated in the `flows` table, and the `counts` table gives the _start_ time of the bin.
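To make the hierarchy concrete, here is a minimal sketch of the data model (an illustration, not code from the repo; field names follow the SQL tables documented below):

```python
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Site:    # a sensor location (point geometry at the centroid)
    site_id: int

@dataclass
class Flow:    # one direction of travel at a site; 1, 2, or 4 per site
    flow_id: int
    site_id: int
    bin_size: int    # minutes; 15 for all current sensors

@dataclass
class Count:   # one bin of bike counts for a flow
    flow_id: int
    datetime_bin: datetime    # *start* of the bin
    volume: int
```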
@@ -112,25 +77,28 @@ LIMIT 1000;
```

<!-- ecocounter_pull_doc_md -->

## `ecocounter_pull` DAG
The `ecocounter_pull` DAG runs daily at 3am to populate the `ecocounter` schema with new data.

### `check_partitions` TaskGroup
- `check_annual_partition` checks if execution date is January 1st.
- `create_annual_partitions` creates a new annual partition for `ecocounter.counts_unfiltered` if previous task succeeds.
- `pull_recent_outages` task is similar to the `pull_ecocounter` task, except it tries to pull data corresponding to zero-volume outages within the last 60 days. This was implemented after finding that some Ecocounter sensors suddenly backfill missing data due to spotty cellular signal. A maximum of ~2 weeks of backfilling has been observed, so the task was conservatively set to look back 60 days.

- `check_partitions` TaskGroup
  - `check_annual_partition` checks if the execution date is January 1st.
  - `create_annual_partitions` creates a new annual partition for `ecocounter.counts_unfiltered` if the previous task succeeds.

- `update_sites_and_flows` task identifies any sites and "flows" (known as channels in the API) that do not yet exist in our database and adds them to `ecocounter.sites_unfiltered` and `ecocounter.flows_unfiltered`. New rows are flagged `validated = null`, indicating they still need to be manually validated. A notification is sent with any new additions.
- `pull_ecocounter` task pulls data from the Ecocounter API and inserts into the `ecocounter.counts_unfiltered` table.
- `done` is an external task marker to trigger the `ecocounter_check` DAG for additional "yellow card" data checks.

### `data_checks` TaskGroup
This task group runs data quality checks on the pipeline output.
- `wait_for_weather` delays the downstream data check by a few hours until the historical weather is available to add context.
- `check_volume` checks the sum of volume in `ecocounter.counts` (filtered view) and notifies if less than 70% of the 60 day lookback avg.
- `check_distinct_flow_ids` checks the count of distinct flow_ids appearing in `ecocounter.counts` (filtered view) and notifies if less than 70% of the 60 day lookback avg.
- `data_checks` TaskGroup: This task group runs data quality checks on the pipeline output (see the sketch after this section).
  - `wait_for_weather` delays the downstream data checks by a few hours until the historical weather is available to add context.
  - `check_volume` checks the sum of volume in `ecocounter.counts` (filtered view) and notifies if it is less than 70% of the 60-day lookback average.
  - `check_distinct_flow_ids` checks the count of distinct flow_ids appearing in `ecocounter.counts` (filtered view) and notifies if it is less than 70% of the 60-day lookback average.
<!-- ecocounter_pull_doc_md -->
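As a rough illustration of the volume check's logic, a hedged sketch follows. The real check is a SQL data-quality operator in the DAG; the query and the 70%/60-day figures below restate the bullets above, but the SQL itself is an assumption, not the repo's check:

```python
from airflow.providers.postgres.hooks.postgres import PostgresHook

# Hypothetical re-implementation of the check_volume rule: compare the
# run date's total volume against the 60-day lookback average.
CHECK_SQL = """
WITH daily AS (
    SELECT datetime_bin::date AS dt, SUM(volume) AS total
    FROM ecocounter.counts
    WHERE datetime_bin >= %(ds)s::date - interval '60 days'
      AND datetime_bin < %(ds)s::date + interval '1 day'
    GROUP BY datetime_bin::date
)
SELECT
    (SELECT total FROM daily WHERE dt = %(ds)s::date)
    >= 0.7 * (SELECT AVG(total) FROM daily WHERE dt < %(ds)s::date) AS passes;
"""

def check_volume_passes(ds: str) -> bool:
    """Return True when the day's volume clears the 70% threshold."""
    hook = PostgresHook("ecocounter_bot")
    with hook.get_conn() as conn, conn.cursor() as cur:
        cur.execute(CHECK_SQL, {"ds": ds})
        return bool(cur.fetchone()[0])
```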

<!-- ecocounter_check_doc_md -->

## `ecocounter_check` DAG
The `ecocounter_check` DAG runs daily at 4am following completion of `ecocounter_pull` to perform additional "yellow card" data checks on the new data.

1 change: 1 addition & 0 deletions volumes/ecocounter/tables/flows_unfiltered.sql
@@ -11,6 +11,7 @@ CREATE TABLE ecocounter.flows_unfiltered (
validated boolean,
first_active timestamp without time zone,
last_active timestamp without time zone,
date_decommissioned timestamp without time zone,
CONSTRAINT locations_pkey PRIMARY KEY (flow_id),
CONSTRAINT flows_replaced_by_flow_id_fkey FOREIGN KEY (replaced_by_flow_id)
REFERENCES ecocounter.flows_unfiltered (flow_id) MATCH SIMPLE
1 change: 1 addition & 0 deletions volumes/ecocounter/tables/sites_unfiltered.sql
@@ -9,6 +9,7 @@ CREATE TABLE ecocounter.sites_unfiltered (
centreline_id integer,
first_active timestamp without time zone,
last_active timestamp without time zone,
date_decommissioned timestamp without time zone,
CONSTRAINT sites_pkey PRIMARY KEY (site_id),
CONSTRAINT sites_replaced_by_fkey FOREIGN KEY (replaced_by_site_id)
REFERENCES ecocounter.sites_unfiltered (site_id) MATCH SIMPLE
3 changes: 2 additions & 1 deletion volumes/ecocounter/views/create-view-flows.sql
@@ -10,7 +10,8 @@ CREATE OR REPLACE VIEW ecocounter.flows AS (
replaces_flow_id,
includes_contraflow,
first_active,
last_active
last_active,
date_decommissioned
FROM ecocounter.flows_unfiltered
WHERE validated
);
3 changes: 2 additions & 1 deletion volumes/ecocounter/views/create-view-sites.sql
@@ -8,7 +8,8 @@ CREATE OR REPLACE VIEW ecocounter.sites AS (
replaced_by_site_id,
centreline_id,
first_active,
last_active
last_active,
date_decommissioned
FROM ecocounter.sites_unfiltered
WHERE validated
);