diff --git a/.gitignore b/.gitignore index e93c8598..11cacb64 100644 --- a/.gitignore +++ b/.gitignore @@ -59,4 +59,5 @@ Session.vim rustc-ice*.txt rustc-ice-* -.env \ No newline at end of file +.env +testing-gtfs \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ad65dda7..4a213661 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,3 +127,7 @@ path = "src/spruce/main.rs" [[bin]] name = "pg_tests" path = "src/pg_tests/main.rs" + +[[bin]] +name = "test_maple_syrup" +path = "src/maple_syrup/test.rs" \ No newline at end of file diff --git a/migrations/2024-04-06-053500_timetable-compression-v1/down.sql b/migrations/2024-04-06-053500_timetable-compression-v1/down.sql new file mode 100644 index 00000000..91e6c6e4 --- /dev/null +++ b/migrations/2024-04-06-053500_timetable-compression-v1/down.sql @@ -0,0 +1,25 @@ +-- This file should undo anything in `up.sql` +CREATE TABLE gtfs.stoptimes ( + onestop_feed_id text NOT NULL, + attempt_id text NOT NULL, + trip_id text NOT NULL, + stop_sequence int NOT NULL, + arrival_time OID, + departure_time OID, + stop_id text NOT NULL, + stop_headsign text, + stop_headsign_translations jsonb, + pickup_type smallint NOT NULL, + drop_off_type smallint NOT NULL, + shape_dist_traveled float4, + -- true is 1, false is 0 + timepoint bool NOT NULL, + continuous_pickup smallint NOT NULL, + continuous_drop_off smallint NOT NULL, + -- point GEOMETRY(POINT, 4326), + route_id text NOT NULL, + chateau text NOT NULL, + PRIMARY KEY (onestop_feed_id, attempt_id, trip_id, stop_sequence) +); + +CREATE INDEX stoptimes_chateau_idx ON gtfs.stoptimes (chateau); \ No newline at end of file diff --git a/migrations/2024-04-06-053500_timetable-compression-v1/up.sql b/migrations/2024-04-06-053500_timetable-compression-v1/up.sql new file mode 100644 index 00000000..1013fa28 --- /dev/null +++ b/migrations/2024-04-06-053500_timetable-compression-v1/up.sql @@ -0,0 +1,3 @@ +-- Your SQL goes here +DROP TABLE IF EXISTS gtfs.stoptimes CASCADE; + diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 271800cb..31578d3b 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly" \ No newline at end of file +channel = "stable" \ No newline at end of file diff --git a/src/enum_to_int.rs b/src/enum_to_int.rs new file mode 100644 index 00000000..341cf2d5 --- /dev/null +++ b/src/enum_to_int.rs @@ -0,0 +1,78 @@ +use gtfs_structures::BikesAllowedType; +use gtfs_structures::ContinuousPickupDropOff; +use gtfs_structures::LocationType; +use gtfs_structures::RouteType; +use gtfs_structures::TimepointType; + +pub fn location_type_conversion(input: &LocationType) -> i16 { + match input { + LocationType::StopPoint => 0, + LocationType::StopArea => 1, + LocationType::StationEntrance => 2, + LocationType::GenericNode => 3, + LocationType::BoardingArea => 4, + LocationType::Unknown(i) => *i, + } +} + +pub fn route_type_to_int(input: &RouteType) -> i16 { + match input { + RouteType::Tramway => 0, + RouteType::Subway => 1, + RouteType::Rail => 2, + RouteType::Bus => 3, + RouteType::Ferry => 4, + RouteType::CableCar => 5, + RouteType::Gondola => 6, + RouteType::Funicular => 7, + RouteType::Coach => 200, + RouteType::Air => 1100, + RouteType::Taxi => 1500, + RouteType::Other(i) => *i, + } +} + +pub fn availability_to_int(input: >fs_structures::Availability) -> i16 { + match input { + gtfs_structures::Availability::Available => 1, + gtfs_structures::Availability::NotAvailable => 2, + gtfs_structures::Availability::Unknown(unknown) => *unknown, + gtfs_structures::Availability::InformationNotAvailable => 0, + } +} + +pub fn timepoint_to_bool(timepoint: &TimepointType) -> bool { + match timepoint { + TimepointType::Exact => true, + TimepointType::Approximate => false, + } +} + +pub fn pickup_dropoff_to_i16(x: >fs_structures::PickupDropOffType) -> i16 { + match x { + gtfs_structures::PickupDropOffType::Regular => 0, + gtfs_structures::PickupDropOffType::NotAvailable => 1, + gtfs_structures::PickupDropOffType::ArrangeByPhone => 2, + gtfs_structures::PickupDropOffType::CoordinateWithDriver => 3, + gtfs_structures::PickupDropOffType::Unknown(x) => *x, + } +} + +pub fn continuous_pickup_drop_off_to_i16(x: &ContinuousPickupDropOff) -> i16 { + match x { + ContinuousPickupDropOff::Continuous => 0, + ContinuousPickupDropOff::NotAvailable => 1, + ContinuousPickupDropOff::ArrangeByPhone => 2, + ContinuousPickupDropOff::CoordinateWithDriver => 3, + ContinuousPickupDropOff::Unknown(x) => *x, + } +} + +pub fn bikes_allowed_to_int(bikes_allowed: &BikesAllowedType) -> i16 { + match bikes_allowed { + BikesAllowedType::NoBikeInfo => 0, + BikesAllowedType::AtLeastOneBike => 1, + BikesAllowedType::NoBikesAllowed => 2, + BikesAllowedType::Unknown(unknown) => *unknown, + } +} diff --git a/src/lib.rs b/src/lib.rs index de65de91..9d6c9570 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,9 @@ extern crate diesel_derive_newtype; pub mod agency_secret; pub mod aspen; pub mod custom_pg_types; +pub mod enum_to_int; pub mod gtfs_rt_handlers; +pub mod maple_syrup; pub mod models; pub mod postgis_to_diesel; pub mod postgres_tools; diff --git a/src/maple/gtfs_handlers/enum_to_int.rs b/src/maple/gtfs_handlers/enum_to_int.rs deleted file mode 100644 index e5ea1cb2..00000000 --- a/src/maple/gtfs_handlers/enum_to_int.rs +++ /dev/null @@ -1,30 +0,0 @@ -use gtfs_structures::LocationType; -use gtfs_structures::RouteType; - -pub fn location_type_conversion(input: &LocationType) -> i16 { - match input { - LocationType::StopPoint => 0, - LocationType::StopArea => 1, - LocationType::StationEntrance => 2, - LocationType::GenericNode => 3, - LocationType::BoardingArea => 4, - LocationType::Unknown(i) => *i, - } -} - -pub fn route_type_to_int(input: &RouteType) -> i16 { - match input { - RouteType::Tramway => 0, - RouteType::Subway => 1, - RouteType::Rail => 2, - RouteType::Bus => 3, - RouteType::Ferry => 4, - RouteType::CableCar => 5, - RouteType::Gondola => 6, - RouteType::Funicular => 7, - RouteType::Coach => 200, - RouteType::Air => 1100, - RouteType::Taxi => 1500, - RouteType::Other(i) => *i, - } -} diff --git a/src/maple/gtfs_handlers/gtfs_to_int.rs b/src/maple/gtfs_handlers/gtfs_to_int.rs deleted file mode 100644 index 55a35ee7..00000000 --- a/src/maple/gtfs_handlers/gtfs_to_int.rs +++ /dev/null @@ -1,39 +0,0 @@ -use gtfs_structures::LocationType; -use gtfs_structures::RouteType; - -pub fn route_type_to_int(input: &RouteType) -> i16 { - match input { - RouteType::Tramway => 0, - RouteType::Subway => 1, - RouteType::Rail => 2, - RouteType::Bus => 3, - RouteType::Ferry => 4, - RouteType::CableCar => 5, - RouteType::Gondola => 6, - RouteType::Funicular => 7, - RouteType::Coach => 200, - RouteType::Air => 1100, - RouteType::Taxi => 1500, - RouteType::Other(i) => (*i), - } -} - -pub fn location_type_conversion(input: &LocationType) -> i16 { - match input { - LocationType::StopPoint => 0, - LocationType::StopArea => 1, - LocationType::StationEntrance => 2, - LocationType::GenericNode => 3, - LocationType::BoardingArea => 4, - LocationType::Unknown(i) => *i, - } -} - -pub fn availability_to_int(input: >fs_structures::Availability) -> i16 { - match input { - gtfs_structures::Availability::Available => 1, - gtfs_structures::Availability::NotAvailable => 2, - gtfs_structures::Availability::Unknown(unknown) => *unknown, - gtfs_structures::Availability::InformationNotAvailable => 0, - } -} diff --git a/src/maple/gtfs_handlers/mod.rs b/src/maple/gtfs_handlers/mod.rs index 773236bc..e8255844 100644 --- a/src/maple/gtfs_handlers/mod.rs +++ b/src/maple/gtfs_handlers/mod.rs @@ -1,8 +1,6 @@ pub mod colour_correction; pub mod convex_hull; -pub mod enum_to_int; pub mod flatten; -pub mod gtfs_to_int; pub mod hull_from_gtfs; pub mod rename_route_labels; pub mod shape_colour_calculator; diff --git a/src/maple/gtfs_handlers/shape_colour_calculator.rs b/src/maple/gtfs_handlers/shape_colour_calculator.rs index 899a8f25..a4b86092 100644 --- a/src/maple/gtfs_handlers/shape_colour_calculator.rs +++ b/src/maple/gtfs_handlers/shape_colour_calculator.rs @@ -18,7 +18,7 @@ pub fn shape_to_colour(feed_id: &str, gtfs: >fs_structures::Gtfs) -> ShapeToCo //metrolink colours are all bonked because trips don't have shape ids in them if (feed_id == "f-9qh-metrolinktrains") { for (shape_id, shape) in >fs.shapes { - let cleanedline = shape_id.to_owned().replace("in","").replace("out",""); + let cleanedline = shape_id.to_owned().replace("in", "").replace("out", ""); let value = match cleanedline.as_str() { "91" => "91 Line", @@ -39,10 +39,14 @@ pub fn shape_to_colour(feed_id: &str, gtfs: >fs_structures::Gtfs) -> ShapeToCo }) .or_insert(HashSet::from_iter([shape_id.clone()])); - shape_id_to_route_ids_lookup.insert(shape_id.clone(), HashSet::from_iter([value.to_string()])); + shape_id_to_route_ids_lookup + .insert(shape_id.clone(), HashSet::from_iter([value.to_string()])); if let Some(route) = gtfs.routes.get(&value.to_string()) { - println!("Route data found for shape {} and route id {}", shape_id, value); + println!( + "Route data found for shape {} and route id {}", + shape_id, value + ); let color = colour_correction::fix_background_colour_rgb_feed_route( feed_id, route.color, @@ -52,9 +56,12 @@ pub fn shape_to_colour(feed_id: &str, gtfs: >fs_structures::Gtfs) -> ShapeToCo shape_to_color_lookup.insert(shape_id.clone(), color); shape_to_text_color_lookup.insert(shape_id.clone(), route.text_color); } else { - eprintln!("Could not find the route data for shape {} and route id {}", shape_id, value); + eprintln!( + "Could not find the route data for shape {} and route id {}", + shape_id, value + ); } - } + } } for (trip_id, trip) in >fs.trips { diff --git a/src/maple/gtfs_handlers/stops_associated_items.rs b/src/maple/gtfs_handlers/stops_associated_items.rs index 69856829..e08a4db6 100644 --- a/src/maple/gtfs_handlers/stops_associated_items.rs +++ b/src/maple/gtfs_handlers/stops_associated_items.rs @@ -1,4 +1,4 @@ -use crate::gtfs_handlers::enum_to_int::route_type_to_int; +use catenary::enum_to_int::route_type_to_int; use std::collections::{HashMap, HashSet}; pub fn make_hashmap_stops_to_route_types_and_ids( diff --git a/src/maple/gtfs_ingestion_sequence/shapes_into_postgres.rs b/src/maple/gtfs_ingestion_sequence/shapes_into_postgres.rs index 158fbe09..a73d07d7 100644 --- a/src/maple/gtfs_ingestion_sequence/shapes_into_postgres.rs +++ b/src/maple/gtfs_ingestion_sequence/shapes_into_postgres.rs @@ -6,7 +6,7 @@ use std::error::Error; use std::sync::Arc; use crate::gtfs_handlers::colour_correction; -use crate::gtfs_handlers::enum_to_int::route_type_to_int; +use catenary::enum_to_int::route_type_to_int; use crate::gtfs_handlers::rename_route_labels::*; use catenary::postgres_tools::CatenaryConn; use catenary::postgres_tools::CatenaryPostgresPool; diff --git a/src/maple/gtfs_ingestion_sequence/stops_into_postgres.rs b/src/maple/gtfs_ingestion_sequence/stops_into_postgres.rs index 487daffb..24b9546e 100644 --- a/src/maple/gtfs_ingestion_sequence/stops_into_postgres.rs +++ b/src/maple/gtfs_ingestion_sequence/stops_into_postgres.rs @@ -4,6 +4,7 @@ use diesel_async::AsyncConnection; use diesel_async::RunQueryDsl; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use catenary::enum_to_int::*; use titlecase::titlecase; pub async fn stops_into_postgres( @@ -42,7 +43,7 @@ pub async fn stops_into_postgres( code: stop.code.clone(), gtfs_desc: stop.description.clone(), gtfs_desc_translations: None, - location_type: crate::gtfs_handlers::gtfs_to_int::location_type_conversion( + location_type: location_type_conversion( &stop.location_type, ), children_ids: match stop_id_to_children_ids.get(&stop.id) { @@ -68,7 +69,7 @@ pub async fn stops_into_postgres( timezone: stop.timezone.clone(), level_id: stop.level_id.clone(), station_feature: false, - wheelchair_boarding: crate::gtfs_handlers::gtfs_to_int::availability_to_int( + wheelchair_boarding: availability_to_int( &stop.wheelchair_boarding, ), primary_route_type: match stop_ids_to_route_types.get(&stop.id) { diff --git a/src/maple/gtfs_process.rs b/src/maple/gtfs_process.rs index 986d10c9..0ce7271d 100644 --- a/src/maple/gtfs_process.rs +++ b/src/maple/gtfs_process.rs @@ -3,13 +3,13 @@ use crate::gtfs_handlers::colour_correction::fix_foreground_colour_rgb; use crate::gtfs_handlers::colour_correction::fix_foreground_colour_rgb_feed; // Initial version 3 of ingest written by Kyler Chin // Removal of the attribution is not allowed, as covered under the AGPL license -use crate::gtfs_handlers::gtfs_to_int::availability_to_int; use crate::gtfs_handlers::shape_colour_calculator::shape_to_colour; use crate::gtfs_handlers::shape_colour_calculator::ShapeToColourResponse; use crate::gtfs_handlers::stops_associated_items::*; use crate::gtfs_ingestion_sequence::shapes_into_postgres::shapes_into_postgres; use crate::gtfs_ingestion_sequence::stops_into_postgres::stops_into_postgres; use crate::DownloadedFeedsInformation; +use catenary::enum_to_int::*; use catenary::models::Route as RoutePgModel; use catenary::postgres_tools::CatenaryConn; use catenary::postgres_tools::CatenaryPostgresPool; @@ -235,12 +235,7 @@ pub async fn gtfs_process_feed( }), None => None, }, - bikes_allowed: match trip.bikes_allowed { - BikesAllowedType::NoBikeInfo => 0, - BikesAllowedType::AtLeastOneBike => 1, - BikesAllowedType::NoBikesAllowed => 2, - BikesAllowedType::Unknown(unknown) => unknown, - }, + bikes_allowed: bikes_allowed_to_int(&trip.bikes_allowed), block_id: trip.block_id.clone(), shape_id: trip.shape_id.clone(), wheelchair_accessible: availability_to_int(&trip.wheelchair_accessible), @@ -374,7 +369,7 @@ pub async fn gtfs_process_feed( long_name_translations: None, gtfs_desc: route.desc.clone(), gtfs_desc_translations: None, - route_type: crate::gtfs_handlers::gtfs_to_int::route_type_to_int(&route.route_type), + route_type: route_type_to_int(&route.route_type), url: route.url.clone(), url_translations: None, shapes_list: match route_ids_to_shape_ids.get(&route_id.clone()) { diff --git a/src/maple_syrup/README.md b/src/maple_syrup/README.md new file mode 100644 index 00000000..bedcceec --- /dev/null +++ b/src/maple_syrup/README.md @@ -0,0 +1,3 @@ +# GTFS compression algorithm + +The goal is to reduce the current 197 GB of Stop times into a series of transfer patterns and trip patterns. \ No newline at end of file diff --git a/src/maple_syrup/mod.rs b/src/maple_syrup/mod.rs new file mode 100644 index 00000000..63b74a91 --- /dev/null +++ b/src/maple_syrup/mod.rs @@ -0,0 +1,158 @@ +use core::hash; +// GTFS stop time compression algorithm +// Probably not compatible with transfer patterns yet, this is just for schedule lookup for now +use crate::enum_to_int::*; +use fasthash::MetroHasher; +use gtfs_structures::ContinuousPickupDropOff; +use gtfs_structures::DirectionType; +use gtfs_structures::TimepointType; +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; + +#[derive(Hash, Clone, Debug)] +pub struct ItineraryCover { + pub stop_sequences: Vec, + //map 0 to false and 1 to true + pub direction_id: Option, + pub route_id: String, + pub trip_headsign: Option, +} + +#[derive(Hash, Debug, Clone, PartialEq, Eq)] +struct StopDifference { + pub stop_id: String, + pub arrival_time_since_start: Option, + pub departure_time_since_start: Option, + pub continuous_pickup: i16, + pub continuous_drop_off: i16, + pub stop_headsign: Option, + pub drop_off_type: i16, + pub pickup_type: i16, + //true is exact, false is approximate + pub timepoint: bool, +} + +#[derive(Clone, Debug)] +struct TripUnderItinerary { + pub trip_id: String, + pub start_time: u32, + pub service_id: String, + pub wheelchair_accessible: i16, + pub block_id: Option, + pub bikes_allowed: i16, + pub frequencies: Vec, +} + +fn hash(t: &T) -> u64 { + let mut s: MetroHasher = Default::default(); + t.hash(&mut s); + s.finish() +} + +pub struct ResponseFromReduce { + pub itineraries: HashMap, + pub trips_to_itineraries: HashMap, + pub itineraries_to_trips: HashMap>, +} + +pub fn reduce(gtfs: >fs_structures::Gtfs) -> ResponseFromReduce { + let mut itineraries: HashMap = HashMap::new(); + let mut trips_to_itineraries: HashMap = HashMap::new(); + let mut itineraries_to_trips: HashMap> = HashMap::new(); + + for (trip_id, trip) in >fs.trips { + + let mut stop_diffs: Vec = Vec::new(); + + if trip.stop_times.len() < 2 { + println!("Trip {} doesn't contain enough times", trip); + continue; + } + + //according to the gtfs spec + //Arrival times are "Required for the first and last stop in a trip (defined by stop_times.stop_sequence)" + if trip.stop_times[0].arrival_time.is_none() || trip.stop_times[trip.stop_times.len() - 1].departure_time.is_none() { + println!("Invalid trip {} with no start or end time", trip_id); + continue; + } + + let mut start_time: u32 = trip.stop_times[0].arrival_time.unwrap(); + + //this trip "starts at 09:00" local time or something + for stop_time in trip.stop_times.iter() { + + let arrival_time_since_start:Option = match stop_time.arrival_time { + Some(arrival_time) => Some(arrival_time as i32 - start_time as i32), + None => None, + }; + + let departure_time_since_start:Option = match stop_time.departure_time { + Some(departure_time) => Some(departure_time as i32 - start_time as i32), + None => None, + }; + + let stop_diff = StopDifference { + stop_id: stop_time.stop.id.clone(), + arrival_time_since_start: arrival_time_since_start, + departure_time_since_start: departure_time_since_start, + continuous_pickup: continuous_pickup_drop_off_to_i16(&stop_time.continuous_pickup), + continuous_drop_off: continuous_pickup_drop_off_to_i16( + &stop_time.continuous_drop_off, + ), + stop_headsign: stop_time.stop_headsign.clone(), + drop_off_type: pickup_dropoff_to_i16(&stop_time.drop_off_type), + pickup_type: pickup_dropoff_to_i16(&stop_time.pickup_type), + timepoint: timepoint_to_bool(&stop_time.timepoint), + }; + + stop_diffs.push(stop_diff); + } + + let itinerary_cover = ItineraryCover { + stop_sequences: stop_diffs, + direction_id: match trip.direction_id { + Some(direction) => Some(match direction { + DirectionType::Outbound => false, + DirectionType::Inbound => true, + }), + None => None, + }, + route_id: trip.route_id.clone(), + trip_headsign: trip.trip_headsign.clone(), + }; + + //itinerary id generated + let hash_of_itinerary = hash(&itinerary_cover); + + itineraries.insert(hash_of_itinerary, itinerary_cover); + trips_to_itineraries.insert(trip_id.clone(), hash_of_itinerary); + + let trip_under_itinerary = TripUnderItinerary { + trip_id: trip_id.clone(), + start_time: start_time, + service_id: trip.service_id.clone(), + wheelchair_accessible: availability_to_int(&trip.wheelchair_accessible), + block_id: trip.block_id.clone(), + bikes_allowed: bikes_allowed_to_int(&trip.bikes_allowed), + frequencies: trip.frequencies.clone(), + }; + + itineraries_to_trips + .entry(hash_of_itinerary) + .and_modify(|existing_trips| { + existing_trips.push(trip_under_itinerary.clone()); + }) + .or_insert(vec![trip_under_itinerary]); + } + + ResponseFromReduce { + itineraries, + trips_to_itineraries, + itineraries_to_trips, + } +} + +#[cfg(test)] +mod tests { + use super::*; +} diff --git a/src/maple_syrup/test.rs b/src/maple_syrup/test.rs new file mode 100644 index 00000000..681c907c --- /dev/null +++ b/src/maple_syrup/test.rs @@ -0,0 +1,29 @@ +fn main() -> Result<(), Box> { + let arguments = arguments::parse(std::env::args()); + + let path = arguments + .expect("expected path to gtfs file like --path FILE.zip") + .get::("path").expect("expected path to gtfs file like --path FILE.zip"); + + println!("Loading {}", path); + let gtfs = gtfs_structures::Gtfs::new(&path.as_str()).expect("failed to load gtfs file"); + + let start = std::time::Instant::now(); + let response = catenary::maple_syrup::reduce(>fs); + let duration = start.elapsed(); + + let route_count = gtfs.routes.len(); + let trip_count = gtfs.trips.len(); + + let stop_times_count = gtfs.trips.iter().map(|(_, trip)| trip.stop_times.len()).sum::(); + + let iten_count = response.itineraries.len(); + + println!("Reduced schedule in {:?}", duration); + println!("{} has {} routes, {} trips and {} stop times, reduced to {} itineraries", path, route_count, trip_count, stop_times_count, iten_count); + + println!("Compression ratio: {:.2}", (trip_count) as f64 / iten_count as f64); +// println!("Weissman score: ") + + Ok(()) +} \ No newline at end of file