From 805b2a5489398707245ef0a03faa8ef32715de1c Mon Sep 17 00:00:00 2001 From: Mayank Khetan Date: Sat, 6 Apr 2024 02:21:20 +0530 Subject: [PATCH] [WIP] Add incremental mode support in Simple Stats Importer tool. --- run_test.sh | 32 ++ simple/sample/input_incremental_1/config.json | 68 +++ .../sample/input_incremental_1/countries.csv | 15 + simple/sample/input_incremental_1/geoids.csv | 3 + .../input_incremental_1/latlng_events.csv | 52 +++ simple/sample/input_incremental_2/config.json | 54 +++ simple/sample/input_incremental_2/latlng.csv | 3 + .../input_incremental_2/powerplants.csv | 15 + simple/sample/input_incremental_2/s2cells.csv | 3 + .../input_incremental_2/wikidataids.csv | 3 + simple/sample/output/tables/imports.csv | 2 +- .../output_incremental/nl/sentences.csv | 6 + .../process/debug_resolve_countries.csv | 15 + .../process/debug_resolve_geoids.csv | 3 + .../process/debug_resolve_latlng.csv | 3 + .../process/debug_resolve_latlng_events.csv | 51 +++ .../process/debug_resolve_powerplants.csv | 15 + .../process/debug_resolve_s2cells.csv | 3 + .../process/debug_resolve_wikidataids.csv | 3 + .../output_incremental/process/report.json | 27 ++ .../output_incremental/tables/imports.csv | 3 + .../tables/observations.csv | 100 ++++ .../output_incremental/tables/triples.csv | 429 ++++++++++++++++++ simple/stats/db.py | 109 ++++- simple/stats/main.py | 9 +- simple/stats/nl.py | 8 +- simple/stats/reporter.py | 1 + simple/stats/runner.py | 9 +- 28 files changed, 1024 insertions(+), 20 deletions(-) create mode 100644 simple/sample/input_incremental_1/config.json create mode 100644 simple/sample/input_incremental_1/countries.csv create mode 100644 simple/sample/input_incremental_1/geoids.csv create mode 100644 simple/sample/input_incremental_1/latlng_events.csv create mode 100644 simple/sample/input_incremental_2/config.json create mode 100644 simple/sample/input_incremental_2/latlng.csv create mode 100644 simple/sample/input_incremental_2/powerplants.csv create mode 100644 simple/sample/input_incremental_2/s2cells.csv create mode 100644 simple/sample/input_incremental_2/wikidataids.csv create mode 100644 simple/sample/output_incremental/nl/sentences.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_countries.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_geoids.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_latlng.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_latlng_events.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_powerplants.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_s2cells.csv create mode 100644 simple/sample/output_incremental/process/debug_resolve_wikidataids.csv create mode 100644 simple/sample/output_incremental/process/report.json create mode 100644 simple/sample/output_incremental/tables/imports.csv create mode 100644 simple/sample/output_incremental/tables/observations.csv create mode 100644 simple/sample/output_incremental/tables/triples.csv diff --git a/run_test.sh b/run_test.sh index f22d118f..40b98da6 100755 --- a/run_test.sh +++ b/run_test.sh @@ -114,6 +114,33 @@ function run_sample { deactivate } +function run_sample_incremental { + # Do not use Cloud SQL. + export USE_CLOUDSQL=false + + python3 -m venv .env + source .env/bin/activate + + cd simple + pip3 install -r requirements.txt + + echo "Deleting existing datacommons.db file." + rm -f sample/output_incremental/datacommons.db + + echo "Running sample." + python3 -m stats.main --input_dir=sample/input_incremental_1 --output_dir=sample/output_incremental --freeze_time + echo "Running sample again." + python3 -m stats.main --input_dir=sample/input_incremental_2 --output_dir=sample/output_incremental --freeze_time --incremental + + echo "Writing tables to CSVs." + mkdir -p sample/output_incremental/tables + sqlite3 -header -csv sample/output_incremental/datacommons.db "select * from observations;" > sample/output_incremental/tables/observations.csv + sqlite3 -header -csv sample/output_incremental/datacommons.db "select * from triples;" > sample/output_incremental/tables/triples.csv + sqlite3 -header -csv sample/output_incremental/datacommons.db "select * from imports;" > sample/output_incremental/tables/imports.csv + + deactivate +} + function run_main_dc_sample { python3 -m venv .env source .env/bin/activate @@ -199,6 +226,11 @@ while [[ "$#" -gt 0 ]]; do run_sample shift 1 ;; + -i) + echo -e "### Running sample incremental" + run_sample_incremental + shift 1 + ;; *) help exit 1 diff --git a/simple/sample/input_incremental_1/config.json b/simple/sample/input_incremental_1/config.json new file mode 100644 index 00000000..fa0a3e97 --- /dev/null +++ b/simple/sample/input_incremental_1/config.json @@ -0,0 +1,68 @@ +{ + "inputFiles": { + "countries.csv": { + "importType": "observations", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "geoids.csv": { + "importType": "observations", + "entityType": "", + "ignoreColumns": [ + "ignore1", + "ignore2" + ], + "provenance": "Provenance1 Name" + }, + "latlng_events.csv": { + "importType": "events", + "eventType": "CrimeEvent", + "entityType": "CensusZipCodeTabulationArea", + "provenance": "Provenance1 Name", + "idColumn": "CASE", + "computedVariables": [ + "Crime Count" + ] + } + }, + "variables": { + "var1": { + "name": "Good var1 name", + "description": "Good var1 description", + "nlSentences": [ + "Natural language sentence 1", + "Natural language sentence 2" + ], + "group": "Parent Group/Child Group 1", + "properties": { + "populationType": "Person", + "measuredProperty": "age", + "statType": "medianValue", + "gender": "Female" + } + }, + "var2": { + "name": "Good var2 name", + "group": "Parent Group/Child Group 2" + }, + "Variable 1": { + "group": "Parent Group" + }, + "Crime Count": { + "description": "Number of crimes", + "aggregation": { + "period": "month", + "method": "count" + } + } + }, + "sources": { + "Source1 Name": { + "url": "http://source1.com", + "provenances": { + "Provenance1 Name": "http://source1.com/provenance1", + "Provenance2 Name": "http://source1.com/provenance2" + } + } + } +} \ No newline at end of file diff --git a/simple/sample/input_incremental_1/countries.csv b/simple/sample/input_incremental_1/countries.csv new file mode 100644 index 00000000..e62275b3 --- /dev/null +++ b/simple/sample/input_incremental_1/countries.csv @@ -0,0 +1,15 @@ +place,year,var1,var2 +Afghanistan,2023,0.19,6 +Yemen,2023,0.21,56 +Angola,2023,0.29,6 +Zambia,2023,0.31,34 +Zimbabwe,2023,0.37,76 +Albania,2023,0.50,34 +dcid: wikidataId/Q22062741,2023,0.50,97 +Algeria,2023,0.52,92 +West Bank and Gaza,2023,0.53,64 +Andorra,2023,0.76,9 +American Samoa,2023,#N/A,34 +Anguilla,2023,#N/A,42 +Wallis and Futuna Islands,2023,#N/A,75 +Western Sahara,2023,#N/A,65 diff --git a/simple/sample/input_incremental_1/geoids.csv b/simple/sample/input_incremental_1/geoids.csv new file mode 100644 index 00000000..15bf5311 --- /dev/null +++ b/simple/sample/input_incremental_1/geoids.csv @@ -0,0 +1,3 @@ +geoId,year,ignore1,Variable 1,Variable 2,ignore2 +01,2021,foo,555, 666 ,bar +122,2022,#N/A,321 , "123,456",baz diff --git a/simple/sample/input_incremental_1/latlng_events.csv b/simple/sample/input_incremental_1/latlng_events.csv new file mode 100644 index 00000000..4c74544e --- /dev/null +++ b/simple/sample/input_incremental_1/latlng_events.csv @@ -0,0 +1,52 @@ +lat#lng,DATE OF OCCURRENCE,CASE,PRIMARY DESCRIPTION +41.927407329#-87.70729439,11/08/2023 8:50:00 PM,JG497095,THEFT +41.896671699#-87.628635323,11/08/2023 3:14:00 PM,JG496991,ASSAULT +41.808525157#-87.672792896,11/08/2023 10:55:00 PM,JG497145,ASSAULT +41.979505088#-87.693158103,11/08/2023 4:39:00 PM,JG496701,OTHER OFFENSE +41.771890947#-87.638705659,10/28/2023 7:30:00 PM,JG484195,THEFT +41.985611859#-87.713834343,10/28/2023 5:00:00 PM,JG483131,CRIMINAL DAMAGE +41.733053891#-87.568330657,11/08/2023 8:25:00 AM,JG498494,ASSAULT +41.949586612#-87.664085689,11/08/2023 2:38:00 PM,JG496575,THEFT +41.704388397#-87.626879123,09/17/2023 3:00:00 AM,JG427641,THEFT +41.881944424#-87.634195294,08/02/2023 9:25:00 AM,JG365961,ASSAULT +41.755481563#-87.649019949,11/08/2023 9:00:00 AM,JG496115,THEFT +41.970433391#-87.763029002,11/08/2023 7:45:00 PM,JG496955,ROBBERY +41.802269632#-87.605372566,11/08/2023 3:00:00 PM,JG501047,BURGLARY +41.721303358#-87.655873595,11/08/2023 1:00:00 PM,JG496779,ASSAULT +41.884497529#-87.625838595,11/08/2023 11:00:00 AM,JG496296,THEFT +41.778436411#-87.589657198,11/05/2023 6:00:00 PM,JG504330,OFFENSE INVOLVING CHILDREN +41.838219696#-87.704850674,11/08/2023 1:49:00 PM,JG496568,ASSAULT +41.70319162#-87.651369057,11/08/2023 11:30:00 AM,JG496295,ROBBERY +41.883969722#-87.644191276,10/28/2023 9:00:00 AM,JG488191,MOTOR VEHICLE THEFT +41.884276844#-87.622098929,10/28/2023 12:10:00 PM,JG482122,ROBBERY +41.87493626#-87.748170814,10/28/2023 1:30:00 AM,JG481621,CRIMINAL DAMAGE +41.95417672#-87.677232056,11/08/2023 6:30:00 PM,JG499040,CRIMINAL TRESPASS +41.948044095#-87.664039332,11/08/2023 9:01:00 PM,JG497052,BURGLARY +41.875625633#-87.629450396,12/14/2022 7:00:00 PM,JF511492,THEFT +41.976489992#-87.788483018,11/08/2023 10:40:00 PM,JG498785,THEFT +41.828080528#-87.686233684,11/08/2023 12:02:00 PM,JG496372,BURGLARY +41.993043969#-87.660360363,10/28/2023 9:15:00 AM,JG481891,THEFT +41.858444489#-87.716414102,10/28/2023 2:29:00 PM,JG482300,OTHER OFFENSE +41.879874073#-87.769750673,10/28/2023 2:00:00 AM,JG482034,THEFT +41.771296232#-87.729149311,08/31/2023 7:00:00 PM,JG406115,BATTERY +41.875679322#-87.62657476,09/08/2023 8:00:00 AM,JG416492,DECEPTIVE PRACTICE +41.82539977#-87.637026874,08/31/2023 10:52:00 AM,JG405111,BATTERY +41.863196881#-87.614817819,12/07/2022 1:39:00 PM,JF501686,CRIMINAL TRESPASS +41.93743245#-87.649180491,10/28/2023 7:41:00 AM,JG484000,THEFT +41.680799541#-87.669942159,10/28/2023 10:40:00 PM,JG482660,ASSAULT +41.891874434#-87.647617474,10/15/2023 2:30:00 AM,JG464444,ROBBERY +41.888993854#-87.626934833,12/08/2022 3:38:00 PM,JF254640,HOMICIDE +41.902821551#-87.775389625,10/28/2023 4:00:00 PM,JG483104,THEFT +41.724654303#-87.622283278,08/31/2023 9:13:00 PM,JG412467,MOTOR VEHICLE THEFT +41.773780824#-87.645848665,09/29/2023 12:00:00 AM,JG453780,THEFT +41.841289747#-87.628142362,09/30/2023 11:45:00 PM,JG445684,BATTERY +41.776150283#-87.615522623,11/08/2023 3:00:00 AM,JG495860,CRIMINAL DAMAGE +41.836069707#-87.613033345,11/08/2023 3:00:00 PM,JG497647,MOTOR VEHICLE THEFT +41.74974473#-87.652507329,09/30/2023 9:30:00 PM,JG445669,ASSAULT +41.946653043#-87.700875462,09/30/2023 10:01:00 AM,JG445052,THEFT +41.890400093#-87.628021143,10/28/2023 3:34:00 AM,JG481737,BATTERY +41.793842185#-87.620286919,11/13/2023 9:30:00 AM,JG503521,BURGLARY +41.720900408#-87.554599376,10/15/2023 3:00:00 AM,JG468840,CRIMINAL DAMAGE +41.742267488#-87.702192623,10/15/2023 8:00:00 PM,JG465660,THEFT +41.766298978#-87.570076538,10/15/2023 11:01:00 AM,JG464615,MOTOR VEHICLE THEFT +41.927407329#-87.70729439,11/08/2023 8:50:00 PM,JG497095,dfbdshfbj \ No newline at end of file diff --git a/simple/sample/input_incremental_2/config.json b/simple/sample/input_incremental_2/config.json new file mode 100644 index 00000000..b3542c00 --- /dev/null +++ b/simple/sample/input_incremental_2/config.json @@ -0,0 +1,54 @@ +{ + "inputFiles": { + "latlng.csv": { + "importType": "observations", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "powerplants.csv": { + "importType": "observations", + "entityType": "PowerPlant", + "provenance": "Provenance2 Name" + }, + "s2cells.csv": { + "importType": "observations", + "entityType": "S2CellLevel10", + "provenance": "Provenance2 Name" + }, + "wikidataids.csv": { + "importType": "observations", + "entityType": "Country", + "provenance": "Provenance1 Name" + } + }, + "variables": { + "var1": { + "name": "Good var1 name", + "description": "Good var1 description", + "nlSentences": [ + "Natural language sentence 1", + "Natural language sentence 2" + ], + "group": "Parent Group/Child Group 1", + "properties": { + "populationType": "Person", + "measuredProperty": "age", + "statType": "medianValue", + "gender": "Female" + } + }, + "var2": { + "name": "Good var2 name", + "group": "Parent Group/Child Group 2" + } + }, + "sources": { + "Source1 Name": { + "url": "http://source1.com", + "provenances": { + "Provenance1 Name": "http://source1.com/provenance1", + "Provenance2 Name": "http://source1.com/provenance2" + } + } + } +} \ No newline at end of file diff --git a/simple/sample/input_incremental_2/latlng.csv b/simple/sample/input_incremental_2/latlng.csv new file mode 100644 index 00000000..13c1246b --- /dev/null +++ b/simple/sample/input_incremental_2/latlng.csv @@ -0,0 +1,3 @@ +lat#lng,year,var1,var2 +38.7#-119.4,2021,555,666 +19.076#72.877,2022,321,123 diff --git a/simple/sample/input_incremental_2/powerplants.csv b/simple/sample/input_incremental_2/powerplants.csv new file mode 100644 index 00000000..674a3e0c --- /dev/null +++ b/simple/sample/input_incremental_2/powerplants.csv @@ -0,0 +1,15 @@ +powerplant,year,var1,var2 +Suzlon Project,2023,0.19,6 +Crete Energy Venture,2023,0.21,56 +Watchtower Educational Center,2023,0.29,6 +Union Power,2023,0.31,34 +Pearl Station,2023,0.37,76 +Austin Gas Recovery,2023,0.50,34 +FOO BAR,2023,0.50,97 +Gordon,2023,0.52,92 +BAZ BAR,2023,0.53,64 +White River Lock and Dam 2,2023,0.76,9 +Bristol Plant,2023,#N/A,34 +Edison Sault,2023,#N/A,42 +Navajo Dam,2023,#N/A,75 +CNN Center,2023,#N/A,65 diff --git a/simple/sample/input_incremental_2/s2cells.csv b/simple/sample/input_incremental_2/s2cells.csv new file mode 100644 index 00000000..13c1246b --- /dev/null +++ b/simple/sample/input_incremental_2/s2cells.csv @@ -0,0 +1,3 @@ +lat#lng,year,var1,var2 +38.7#-119.4,2021,555,666 +19.076#72.877,2022,321,123 diff --git a/simple/sample/input_incremental_2/wikidataids.csv b/simple/sample/input_incremental_2/wikidataids.csv new file mode 100644 index 00000000..a9870983 --- /dev/null +++ b/simple/sample/input_incremental_2/wikidataids.csv @@ -0,0 +1,3 @@ +wikidataid,year,var1,var2 +Q30,2021,555,666 +Q668,2022,321,123 diff --git a/simple/sample/output/tables/imports.csv b/simple/sample/output/tables/imports.csv index 662a8017..e2add92a 100644 --- a/simple/sample/output/tables/imports.csv +++ b/simple/sample/output/tables/imports.csv @@ -1,2 +1,2 @@ imported_at,status,metadata -"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 5, ""numObs"": 99}" +"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 5, ""numObs"": 99, ""incremental"": false}" diff --git a/simple/sample/output_incremental/nl/sentences.csv b/simple/sample/output_incremental/nl/sentences.csv new file mode 100644 index 00000000..1fa662dc --- /dev/null +++ b/simple/sample/output_incremental/nl/sentences.csv @@ -0,0 +1,6 @@ +dcid,sentence +var1,Good var1 name;Good var1 description;Natural language sentence 1;Natural language sentence 2 +var2,Good var2 name +Variable_1,Variable 1 +Variable_2,Variable 2 +Crime_Count,Crime Count;Number of crimes diff --git a/simple/sample/output_incremental/process/debug_resolve_countries.csv b/simple/sample/output_incremental/process/debug_resolve_countries.csv new file mode 100644 index 00000000..20e90425 --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_countries.csv @@ -0,0 +1,15 @@ +input,dcid,link +West Bank and Gaza,*UNRESOLVED*, +dcid: wikidataId/Q22062741,wikidataId/Q22062741,https://datacommons.org/browser/wikidataId/Q22062741 +Afghanistan,country/AFG,https://datacommons.org/browser/country/AFG +Albania,country/ALB,https://datacommons.org/browser/country/ALB +Algeria,country/DZA,https://datacommons.org/browser/country/DZA +American Samoa,country/ASM,https://datacommons.org/browser/country/ASM +Andorra,country/AND,https://datacommons.org/browser/country/AND +Angola,country/AGO,https://datacommons.org/browser/country/AGO +Anguilla,country/AIA,https://datacommons.org/browser/country/AIA +Wallis and Futuna Islands,country/WLF,https://datacommons.org/browser/country/WLF +Western Sahara,country/ESH,https://datacommons.org/browser/country/ESH +Yemen,country/YEM,https://datacommons.org/browser/country/YEM +Zambia,country/ZMB,https://datacommons.org/browser/country/ZMB +Zimbabwe,country/ZWE,https://datacommons.org/browser/country/ZWE diff --git a/simple/sample/output_incremental/process/debug_resolve_geoids.csv b/simple/sample/output_incremental/process/debug_resolve_geoids.csv new file mode 100644 index 00000000..8bc3366d --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_geoids.csv @@ -0,0 +1,3 @@ +input,dcid,link +01,geoId/01,https://datacommons.org/browser/geoId/01 +122,geoId/122,https://datacommons.org/browser/geoId/122 diff --git a/simple/sample/output_incremental/process/debug_resolve_latlng.csv b/simple/sample/output_incremental/process/debug_resolve_latlng.csv new file mode 100644 index 00000000..898498bd --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_latlng.csv @@ -0,0 +1,3 @@ +input,dcid,link +38.7#-119.4,country/USA,https://datacommons.org/browser/country/USA +19.076#72.877,country/IND,https://datacommons.org/browser/country/IND diff --git a/simple/sample/output_incremental/process/debug_resolve_latlng_events.csv b/simple/sample/output_incremental/process/debug_resolve_latlng_events.csv new file mode 100644 index 00000000..f072f419 --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_latlng_events.csv @@ -0,0 +1,51 @@ +input,dcid,link +41.927407329#-87.70729439,zip/60647,https://datacommons.org/browser/zip/60647 +41.896671699#-87.628635323,zip/60654,https://datacommons.org/browser/zip/60654 +41.808525157#-87.672792896,zip/60609,https://datacommons.org/browser/zip/60609 +41.979505088#-87.693158103,zip/60625,https://datacommons.org/browser/zip/60625 +41.771890947#-87.638705659,zip/60621,https://datacommons.org/browser/zip/60621 +41.985611859#-87.713834343,zip/60659,https://datacommons.org/browser/zip/60659 +41.733053891#-87.568330657,zip/60617,https://datacommons.org/browser/zip/60617 +41.949586612#-87.664085689,zip/60613,https://datacommons.org/browser/zip/60613 +41.704388397#-87.626879123,zip/60628,https://datacommons.org/browser/zip/60628 +41.881944424#-87.634195294,zip/60606,https://datacommons.org/browser/zip/60606 +41.755481563#-87.649019949,zip/60620,https://datacommons.org/browser/zip/60620 +41.970433391#-87.763029002,zip/60630,https://datacommons.org/browser/zip/60630 +41.802269632#-87.605372566,zip/60615,https://datacommons.org/browser/zip/60615 +41.721303358#-87.655873595,zip/60643,https://datacommons.org/browser/zip/60643 +41.884497529#-87.625838595,zip/60601,https://datacommons.org/browser/zip/60601 +41.778436411#-87.589657198,zip/60637,https://datacommons.org/browser/zip/60637 +41.838219696#-87.704850674,zip/60623,https://datacommons.org/browser/zip/60623 +41.70319162#-87.651369057,zip/60643,https://datacommons.org/browser/zip/60643 +41.883969722#-87.644191276,zip/60661,https://datacommons.org/browser/zip/60661 +41.884276844#-87.622098929,zip/60601,https://datacommons.org/browser/zip/60601 +41.87493626#-87.748170814,zip/60644,https://datacommons.org/browser/zip/60644 +41.95417672#-87.677232056,zip/60613,https://datacommons.org/browser/zip/60613 +41.948044095#-87.664039332,zip/60613,https://datacommons.org/browser/zip/60613 +41.875625633#-87.629450396,zip/60605,https://datacommons.org/browser/zip/60605 +41.976489992#-87.788483018,zip/60630,https://datacommons.org/browser/zip/60630 +41.828080528#-87.686233684,zip/60609,https://datacommons.org/browser/zip/60609 +41.993043969#-87.660360363,zip/60660,https://datacommons.org/browser/zip/60660 +41.858444489#-87.716414102,zip/60623,https://datacommons.org/browser/zip/60623 +41.879874073#-87.769750673,zip/60644,https://datacommons.org/browser/zip/60644 +41.771296232#-87.729149311,zip/60629,https://datacommons.org/browser/zip/60629 +41.875679322#-87.62657476,zip/60605,https://datacommons.org/browser/zip/60605 +41.82539977#-87.637026874,zip/60609,https://datacommons.org/browser/zip/60609 +41.863196881#-87.614817819,zip/60605,https://datacommons.org/browser/zip/60605 +41.93743245#-87.649180491,zip/60657,https://datacommons.org/browser/zip/60657 +41.680799541#-87.669942159,zip/60643,https://datacommons.org/browser/zip/60643 +41.891874434#-87.647617474,zip/60642,https://datacommons.org/browser/zip/60642 +41.888993854#-87.626934833,zip/60611,https://datacommons.org/browser/zip/60611 +41.902821551#-87.775389625,zip/60651,https://datacommons.org/browser/zip/60651 +41.724654303#-87.622283278,zip/60619,https://datacommons.org/browser/zip/60619 +41.773780824#-87.645848665,zip/60621,https://datacommons.org/browser/zip/60621 +41.841289747#-87.628142362,zip/60616,https://datacommons.org/browser/zip/60616 +41.776150283#-87.615522623,zip/60637,https://datacommons.org/browser/zip/60637 +41.836069707#-87.613033345,zip/60616,https://datacommons.org/browser/zip/60616 +41.74974473#-87.652507329,zip/60620,https://datacommons.org/browser/zip/60620 +41.946653043#-87.700875462,zip/60618,https://datacommons.org/browser/zip/60618 +41.890400093#-87.628021143,zip/60611,https://datacommons.org/browser/zip/60611 +41.793842185#-87.620286919,zip/60637,https://datacommons.org/browser/zip/60637 +41.720900408#-87.554599376,zip/60617,https://datacommons.org/browser/zip/60617 +41.742267488#-87.702192623,zip/60652,https://datacommons.org/browser/zip/60652 +41.766298978#-87.570076538,zip/60649,https://datacommons.org/browser/zip/60649 diff --git a/simple/sample/output_incremental/process/debug_resolve_powerplants.csv b/simple/sample/output_incremental/process/debug_resolve_powerplants.csv new file mode 100644 index 00000000..a5b03a0e --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_powerplants.csv @@ -0,0 +1,15 @@ +input,dcid,link +BAZ BAR,*UNRESOLVED*, +FOO BAR,*UNRESOLVED*, +Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93 +Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p +Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4 +Union Power,dc/2ysvc67fk1162,https://datacommons.org/browser/dc/2ysvc67fk1162 +Pearl Station,dc/00w9rbw8yn7x7,https://datacommons.org/browser/dc/00w9rbw8yn7x7 +Austin Gas Recovery,dc/00zjgb4rjchx3,https://datacommons.org/browser/dc/00zjgb4rjchx3 +Gordon,dc/011s19rm0mzh1,https://datacommons.org/browser/dc/011s19rm0mzh1 +White River Lock and Dam 2,dc/017y3py1dzkmg,https://datacommons.org/browser/dc/017y3py1dzkmg +Bristol Plant,dc/4359q0h458f01,https://datacommons.org/browser/dc/4359q0h458f01 +Edison Sault,dc/3kds7zgl4wz26,https://datacommons.org/browser/dc/3kds7zgl4wz26 +Navajo Dam,dc/02b53twnh3fx,https://datacommons.org/browser/dc/02b53twnh3fx +CNN Center,dc/dk2p9l3l8x1b6,https://datacommons.org/browser/dc/dk2p9l3l8x1b6 diff --git a/simple/sample/output_incremental/process/debug_resolve_s2cells.csv b/simple/sample/output_incremental/process/debug_resolve_s2cells.csv new file mode 100644 index 00000000..e34c1404 --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_s2cells.csv @@ -0,0 +1,3 @@ +input,dcid,link +38.7#-119.4,s2CellId/0x80982b0000000000,https://datacommons.org/browser/s2CellId/0x80982b0000000000 +19.076#72.877,s2CellId/0x3be7c90000000000,https://datacommons.org/browser/s2CellId/0x3be7c90000000000 diff --git a/simple/sample/output_incremental/process/debug_resolve_wikidataids.csv b/simple/sample/output_incremental/process/debug_resolve_wikidataids.csv new file mode 100644 index 00000000..012d5a81 --- /dev/null +++ b/simple/sample/output_incremental/process/debug_resolve_wikidataids.csv @@ -0,0 +1,3 @@ +input,dcid,link +Q668,country/IND,https://datacommons.org/browser/country/IND +Q30,country/USA,https://datacommons.org/browser/country/USA diff --git a/simple/sample/output_incremental/process/report.json b/simple/sample/output_incremental/process/report.json new file mode 100644 index 00000000..c30a3e25 --- /dev/null +++ b/simple/sample/output_incremental/process/report.json @@ -0,0 +1,27 @@ +{ + "status": "SUCCESS", + "startTime": "2023-01-01 00:00:00", + "lastUpdate": "2023-01-01 00:00:00", + "importFiles": { + "latlng.csv": { + "status": "SUCCESS", + "startTime": "2023-01-01 00:00:00", + "lastUpdate": "2023-01-01 00:00:00" + }, + "powerplants.csv": { + "status": "SUCCESS", + "startTime": "2023-01-01 00:00:00", + "lastUpdate": "2023-01-01 00:00:00" + }, + "s2cells.csv": { + "status": "SUCCESS", + "startTime": "2023-01-01 00:00:00", + "lastUpdate": "2023-01-01 00:00:00" + }, + "wikidataids.csv": { + "status": "SUCCESS", + "startTime": "2023-01-01 00:00:00", + "lastUpdate": "2023-01-01 00:00:00" + } + } +} \ No newline at end of file diff --git a/simple/sample/output_incremental/tables/imports.csv b/simple/sample/output_incremental/tables/imports.csv new file mode 100644 index 00000000..f04f1006 --- /dev/null +++ b/simple/sample/output_incremental/tables/imports.csv @@ -0,0 +1,3 @@ +imported_at,status,metadata +"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 5, ""numObs"": 67, ""incremental"": false}" +"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 2, ""numObs"": 32, ""incremental"": true}" diff --git a/simple/sample/output_incremental/tables/observations.csv b/simple/sample/output_incremental/tables/observations.csv new file mode 100644 index 00000000..379d1303 --- /dev/null +++ b/simple/sample/output_incremental/tables/observations.csv @@ -0,0 +1,100 @@ +entity,variable,date,value,provenance +country/AFG,var1,2023,0.19,c/p/1 +country/YEM,var1,2023,0.21,c/p/1 +country/AGO,var1,2023,0.29,c/p/1 +country/ZMB,var1,2023,0.31,c/p/1 +country/ZWE,var1,2023,0.37,c/p/1 +country/ALB,var1,2023,0.5,c/p/1 +wikidataId/Q22062741,var1,2023,0.5,c/p/1 +country/DZA,var1,2023,0.52,c/p/1 +country/AND,var1,2023,0.76,c/p/1 +country/AFG,var2,2023,6,c/p/1 +country/YEM,var2,2023,56,c/p/1 +country/AGO,var2,2023,6,c/p/1 +country/ZMB,var2,2023,34,c/p/1 +country/ZWE,var2,2023,76,c/p/1 +country/ALB,var2,2023,34,c/p/1 +wikidataId/Q22062741,var2,2023,97,c/p/1 +country/DZA,var2,2023,92,c/p/1 +country/AND,var2,2023,9,c/p/1 +country/ASM,var2,2023,34,c/p/1 +country/AIA,var2,2023,42,c/p/1 +country/WLF,var2,2023,75,c/p/1 +country/ESH,var2,2023,65,c/p/1 +geoId/01,Variable_1,2021,555,c/p/1 +geoId/122,Variable_1,2022,321,c/p/1 +geoId/01,Variable_2,2021,666,c/p/1 +geoId/122,Variable_2,2022,123456,c/p/1 +zip/60647,Crime_Count,2023-11,2,c/p/1 +zip/60654,Crime_Count,2023-11,1,c/p/1 +zip/60609,Crime_Count,2023-11,2,c/p/1 +zip/60625,Crime_Count,2023-11,1,c/p/1 +zip/60621,Crime_Count,2023-10,1,c/p/1 +zip/60659,Crime_Count,2023-10,1,c/p/1 +zip/60617,Crime_Count,2023-11,1,c/p/1 +zip/60613,Crime_Count,2023-11,3,c/p/1 +zip/60628,Crime_Count,2023-09,1,c/p/1 +zip/60606,Crime_Count,2023-08,1,c/p/1 +zip/60620,Crime_Count,2023-11,1,c/p/1 +zip/60630,Crime_Count,2023-11,2,c/p/1 +zip/60615,Crime_Count,2023-11,1,c/p/1 +zip/60643,Crime_Count,2023-11,2,c/p/1 +zip/60601,Crime_Count,2023-11,1,c/p/1 +zip/60637,Crime_Count,2023-11,3,c/p/1 +zip/60623,Crime_Count,2023-11,1,c/p/1 +zip/60661,Crime_Count,2023-10,1,c/p/1 +zip/60601,Crime_Count,2023-10,1,c/p/1 +zip/60644,Crime_Count,2023-10,2,c/p/1 +zip/60605,Crime_Count,2022-12,2,c/p/1 +zip/60660,Crime_Count,2023-10,1,c/p/1 +zip/60623,Crime_Count,2023-10,1,c/p/1 +zip/60629,Crime_Count,2023-08,1,c/p/1 +zip/60605,Crime_Count,2023-09,1,c/p/1 +zip/60609,Crime_Count,2023-08,1,c/p/1 +zip/60657,Crime_Count,2023-10,1,c/p/1 +zip/60643,Crime_Count,2023-10,1,c/p/1 +zip/60642,Crime_Count,2023-10,1,c/p/1 +zip/60611,Crime_Count,2022-12,1,c/p/1 +zip/60651,Crime_Count,2023-10,1,c/p/1 +zip/60619,Crime_Count,2023-08,1,c/p/1 +zip/60621,Crime_Count,2023-09,1,c/p/1 +zip/60616,Crime_Count,2023-09,1,c/p/1 +zip/60616,Crime_Count,2023-11,1,c/p/1 +zip/60620,Crime_Count,2023-09,1,c/p/1 +zip/60618,Crime_Count,2023-09,1,c/p/1 +zip/60611,Crime_Count,2023-10,1,c/p/1 +zip/60617,Crime_Count,2023-10,1,c/p/1 +zip/60652,Crime_Count,2023-10,1,c/p/1 +zip/60649,Crime_Count,2023-10,1,c/p/1 +country/USA,var1,2021,555,c/p/1 +country/IND,var1,2022,321,c/p/1 +country/USA,var2,2021,666,c/p/1 +country/IND,var2,2022,123,c/p/1 +dc/000qxlm93vn93,var1,2023,0.19,c/p/2 +dc/5c7tz3lbln3p,var1,2023,0.21,c/p/2 +dc/8zmh7ctlkbsc4,var1,2023,0.29,c/p/2 +dc/2ysvc67fk1162,var1,2023,0.31,c/p/2 +dc/00w9rbw8yn7x7,var1,2023,0.37,c/p/2 +dc/00zjgb4rjchx3,var1,2023,0.5,c/p/2 +dc/011s19rm0mzh1,var1,2023,0.52,c/p/2 +dc/017y3py1dzkmg,var1,2023,0.76,c/p/2 +dc/000qxlm93vn93,var2,2023,6,c/p/2 +dc/5c7tz3lbln3p,var2,2023,56,c/p/2 +dc/8zmh7ctlkbsc4,var2,2023,6,c/p/2 +dc/2ysvc67fk1162,var2,2023,34,c/p/2 +dc/00w9rbw8yn7x7,var2,2023,76,c/p/2 +dc/00zjgb4rjchx3,var2,2023,34,c/p/2 +dc/011s19rm0mzh1,var2,2023,92,c/p/2 +dc/017y3py1dzkmg,var2,2023,9,c/p/2 +dc/4359q0h458f01,var2,2023,34,c/p/2 +dc/3kds7zgl4wz26,var2,2023,42,c/p/2 +dc/02b53twnh3fx,var2,2023,75,c/p/2 +dc/dk2p9l3l8x1b6,var2,2023,65,c/p/2 +s2CellId/0x80982b0000000000,var1,2021,555,c/p/2 +s2CellId/0x3be7c90000000000,var1,2022,321,c/p/2 +s2CellId/0x80982b0000000000,var2,2021,666,c/p/2 +s2CellId/0x3be7c90000000000,var2,2022,123,c/p/2 +country/USA,var1,2021,555,c/p/1 +country/IND,var1,2022,321,c/p/1 +country/USA,var2,2021,666,c/p/1 +country/IND,var2,2022,123,c/p/1 diff --git a/simple/sample/output_incremental/tables/triples.csv b/simple/sample/output_incremental/tables/triples.csv new file mode 100644 index 00000000..c0ef2dc4 --- /dev/null +++ b/simple/sample/output_incremental/tables/triples.csv @@ -0,0 +1,429 @@ +subject_id,predicate,object_id,object_value +JG497095,typeOf,CrimeEvent,"" +JG497095,location,zip/60647,"" +JG497095,observationDate,"","11/08/2023 8:50:00 PM" +JG497095,includedIn,c/p/1,"" +JG497095,CASE,"",JG497095 +JG497095,PRIMARY_DESCRIPTION,"",THEFT +JG496991,typeOf,CrimeEvent,"" +JG496991,location,zip/60654,"" +JG496991,observationDate,"","11/08/2023 3:14:00 PM" +JG496991,includedIn,c/p/1,"" +JG496991,CASE,"",JG496991 +JG496991,PRIMARY_DESCRIPTION,"",ASSAULT +JG497145,typeOf,CrimeEvent,"" +JG497145,location,zip/60609,"" +JG497145,observationDate,"","11/08/2023 10:55:00 PM" +JG497145,includedIn,c/p/1,"" +JG497145,CASE,"",JG497145 +JG497145,PRIMARY_DESCRIPTION,"",ASSAULT +JG496701,typeOf,CrimeEvent,"" +JG496701,location,zip/60625,"" +JG496701,observationDate,"","11/08/2023 4:39:00 PM" +JG496701,includedIn,c/p/1,"" +JG496701,CASE,"",JG496701 +JG496701,PRIMARY_DESCRIPTION,"","OTHER OFFENSE" +JG484195,typeOf,CrimeEvent,"" +JG484195,location,zip/60621,"" +JG484195,observationDate,"","10/28/2023 7:30:00 PM" +JG484195,includedIn,c/p/1,"" +JG484195,CASE,"",JG484195 +JG484195,PRIMARY_DESCRIPTION,"",THEFT +JG483131,typeOf,CrimeEvent,"" +JG483131,location,zip/60659,"" +JG483131,observationDate,"","10/28/2023 5:00:00 PM" +JG483131,includedIn,c/p/1,"" +JG483131,CASE,"",JG483131 +JG483131,PRIMARY_DESCRIPTION,"","CRIMINAL DAMAGE" +JG498494,typeOf,CrimeEvent,"" +JG498494,location,zip/60617,"" +JG498494,observationDate,"","11/08/2023 8:25:00 AM" +JG498494,includedIn,c/p/1,"" +JG498494,CASE,"",JG498494 +JG498494,PRIMARY_DESCRIPTION,"",ASSAULT +JG496575,typeOf,CrimeEvent,"" +JG496575,location,zip/60613,"" +JG496575,observationDate,"","11/08/2023 2:38:00 PM" +JG496575,includedIn,c/p/1,"" +JG496575,CASE,"",JG496575 +JG496575,PRIMARY_DESCRIPTION,"",THEFT +JG427641,typeOf,CrimeEvent,"" +JG427641,location,zip/60628,"" +JG427641,observationDate,"","09/17/2023 3:00:00 AM" +JG427641,includedIn,c/p/1,"" +JG427641,CASE,"",JG427641 +JG427641,PRIMARY_DESCRIPTION,"",THEFT +JG365961,typeOf,CrimeEvent,"" +JG365961,location,zip/60606,"" +JG365961,observationDate,"","08/02/2023 9:25:00 AM" +JG365961,includedIn,c/p/1,"" +JG365961,CASE,"",JG365961 +JG365961,PRIMARY_DESCRIPTION,"",ASSAULT +JG496115,typeOf,CrimeEvent,"" +JG496115,location,zip/60620,"" +JG496115,observationDate,"","11/08/2023 9:00:00 AM" +JG496115,includedIn,c/p/1,"" +JG496115,CASE,"",JG496115 +JG496115,PRIMARY_DESCRIPTION,"",THEFT +JG496955,typeOf,CrimeEvent,"" +JG496955,location,zip/60630,"" +JG496955,observationDate,"","11/08/2023 7:45:00 PM" +JG496955,includedIn,c/p/1,"" +JG496955,CASE,"",JG496955 +JG496955,PRIMARY_DESCRIPTION,"",ROBBERY +JG501047,typeOf,CrimeEvent,"" +JG501047,location,zip/60615,"" +JG501047,observationDate,"","11/08/2023 3:00:00 PM" +JG501047,includedIn,c/p/1,"" +JG501047,CASE,"",JG501047 +JG501047,PRIMARY_DESCRIPTION,"",BURGLARY +JG496779,typeOf,CrimeEvent,"" +JG496779,location,zip/60643,"" +JG496779,observationDate,"","11/08/2023 1:00:00 PM" +JG496779,includedIn,c/p/1,"" +JG496779,CASE,"",JG496779 +JG496779,PRIMARY_DESCRIPTION,"",ASSAULT +JG496296,typeOf,CrimeEvent,"" +JG496296,location,zip/60601,"" +JG496296,observationDate,"","11/08/2023 11:00:00 AM" +JG496296,includedIn,c/p/1,"" +JG496296,CASE,"",JG496296 +JG496296,PRIMARY_DESCRIPTION,"",THEFT +JG504330,typeOf,CrimeEvent,"" +JG504330,location,zip/60637,"" +JG504330,observationDate,"","11/05/2023 6:00:00 PM" +JG504330,includedIn,c/p/1,"" +JG504330,CASE,"",JG504330 +JG504330,PRIMARY_DESCRIPTION,"","OFFENSE INVOLVING CHILDREN" +JG496568,typeOf,CrimeEvent,"" +JG496568,location,zip/60623,"" +JG496568,observationDate,"","11/08/2023 1:49:00 PM" +JG496568,includedIn,c/p/1,"" +JG496568,CASE,"",JG496568 +JG496568,PRIMARY_DESCRIPTION,"",ASSAULT +JG496295,typeOf,CrimeEvent,"" +JG496295,location,zip/60643,"" +JG496295,observationDate,"","11/08/2023 11:30:00 AM" +JG496295,includedIn,c/p/1,"" +JG496295,CASE,"",JG496295 +JG496295,PRIMARY_DESCRIPTION,"",ROBBERY +JG488191,typeOf,CrimeEvent,"" +JG488191,location,zip/60661,"" +JG488191,observationDate,"","10/28/2023 9:00:00 AM" +JG488191,includedIn,c/p/1,"" +JG488191,CASE,"",JG488191 +JG488191,PRIMARY_DESCRIPTION,"","MOTOR VEHICLE THEFT" +JG482122,typeOf,CrimeEvent,"" +JG482122,location,zip/60601,"" +JG482122,observationDate,"","10/28/2023 12:10:00 PM" +JG482122,includedIn,c/p/1,"" +JG482122,CASE,"",JG482122 +JG482122,PRIMARY_DESCRIPTION,"",ROBBERY +JG481621,typeOf,CrimeEvent,"" +JG481621,location,zip/60644,"" +JG481621,observationDate,"","10/28/2023 1:30:00 AM" +JG481621,includedIn,c/p/1,"" +JG481621,CASE,"",JG481621 +JG481621,PRIMARY_DESCRIPTION,"","CRIMINAL DAMAGE" +JG499040,typeOf,CrimeEvent,"" +JG499040,location,zip/60613,"" +JG499040,observationDate,"","11/08/2023 6:30:00 PM" +JG499040,includedIn,c/p/1,"" +JG499040,CASE,"",JG499040 +JG499040,PRIMARY_DESCRIPTION,"","CRIMINAL TRESPASS" +JG497052,typeOf,CrimeEvent,"" +JG497052,location,zip/60613,"" +JG497052,observationDate,"","11/08/2023 9:01:00 PM" +JG497052,includedIn,c/p/1,"" +JG497052,CASE,"",JG497052 +JG497052,PRIMARY_DESCRIPTION,"",BURGLARY +JF511492,typeOf,CrimeEvent,"" +JF511492,location,zip/60605,"" +JF511492,observationDate,"","12/14/2022 7:00:00 PM" +JF511492,includedIn,c/p/1,"" +JF511492,CASE,"",JF511492 +JF511492,PRIMARY_DESCRIPTION,"",THEFT +JG498785,typeOf,CrimeEvent,"" +JG498785,location,zip/60630,"" +JG498785,observationDate,"","11/08/2023 10:40:00 PM" +JG498785,includedIn,c/p/1,"" +JG498785,CASE,"",JG498785 +JG498785,PRIMARY_DESCRIPTION,"",THEFT +JG496372,typeOf,CrimeEvent,"" +JG496372,location,zip/60609,"" +JG496372,observationDate,"","11/08/2023 12:02:00 PM" +JG496372,includedIn,c/p/1,"" +JG496372,CASE,"",JG496372 +JG496372,PRIMARY_DESCRIPTION,"",BURGLARY +JG481891,typeOf,CrimeEvent,"" +JG481891,location,zip/60660,"" +JG481891,observationDate,"","10/28/2023 9:15:00 AM" +JG481891,includedIn,c/p/1,"" +JG481891,CASE,"",JG481891 +JG481891,PRIMARY_DESCRIPTION,"",THEFT +JG482300,typeOf,CrimeEvent,"" +JG482300,location,zip/60623,"" +JG482300,observationDate,"","10/28/2023 2:29:00 PM" +JG482300,includedIn,c/p/1,"" +JG482300,CASE,"",JG482300 +JG482300,PRIMARY_DESCRIPTION,"","OTHER OFFENSE" +JG482034,typeOf,CrimeEvent,"" +JG482034,location,zip/60644,"" +JG482034,observationDate,"","10/28/2023 2:00:00 AM" +JG482034,includedIn,c/p/1,"" +JG482034,CASE,"",JG482034 +JG482034,PRIMARY_DESCRIPTION,"",THEFT +JG406115,typeOf,CrimeEvent,"" +JG406115,location,zip/60629,"" +JG406115,observationDate,"","08/31/2023 7:00:00 PM" +JG406115,includedIn,c/p/1,"" +JG406115,CASE,"",JG406115 +JG406115,PRIMARY_DESCRIPTION,"",BATTERY +JG416492,typeOf,CrimeEvent,"" +JG416492,location,zip/60605,"" +JG416492,observationDate,"","09/08/2023 8:00:00 AM" +JG416492,includedIn,c/p/1,"" +JG416492,CASE,"",JG416492 +JG416492,PRIMARY_DESCRIPTION,"","DECEPTIVE PRACTICE" +JG405111,typeOf,CrimeEvent,"" +JG405111,location,zip/60609,"" +JG405111,observationDate,"","08/31/2023 10:52:00 AM" +JG405111,includedIn,c/p/1,"" +JG405111,CASE,"",JG405111 +JG405111,PRIMARY_DESCRIPTION,"",BATTERY +JF501686,typeOf,CrimeEvent,"" +JF501686,location,zip/60605,"" +JF501686,observationDate,"","12/07/2022 1:39:00 PM" +JF501686,includedIn,c/p/1,"" +JF501686,CASE,"",JF501686 +JF501686,PRIMARY_DESCRIPTION,"","CRIMINAL TRESPASS" +JG484000,typeOf,CrimeEvent,"" +JG484000,location,zip/60657,"" +JG484000,observationDate,"","10/28/2023 7:41:00 AM" +JG484000,includedIn,c/p/1,"" +JG484000,CASE,"",JG484000 +JG484000,PRIMARY_DESCRIPTION,"",THEFT +JG482660,typeOf,CrimeEvent,"" +JG482660,location,zip/60643,"" +JG482660,observationDate,"","10/28/2023 10:40:00 PM" +JG482660,includedIn,c/p/1,"" +JG482660,CASE,"",JG482660 +JG482660,PRIMARY_DESCRIPTION,"",ASSAULT +JG464444,typeOf,CrimeEvent,"" +JG464444,location,zip/60642,"" +JG464444,observationDate,"","10/15/2023 2:30:00 AM" +JG464444,includedIn,c/p/1,"" +JG464444,CASE,"",JG464444 +JG464444,PRIMARY_DESCRIPTION,"",ROBBERY +JF254640,typeOf,CrimeEvent,"" +JF254640,location,zip/60611,"" +JF254640,observationDate,"","12/08/2022 3:38:00 PM" +JF254640,includedIn,c/p/1,"" +JF254640,CASE,"",JF254640 +JF254640,PRIMARY_DESCRIPTION,"",HOMICIDE +JG483104,typeOf,CrimeEvent,"" +JG483104,location,zip/60651,"" +JG483104,observationDate,"","10/28/2023 4:00:00 PM" +JG483104,includedIn,c/p/1,"" +JG483104,CASE,"",JG483104 +JG483104,PRIMARY_DESCRIPTION,"",THEFT +JG412467,typeOf,CrimeEvent,"" +JG412467,location,zip/60619,"" +JG412467,observationDate,"","08/31/2023 9:13:00 PM" +JG412467,includedIn,c/p/1,"" +JG412467,CASE,"",JG412467 +JG412467,PRIMARY_DESCRIPTION,"","MOTOR VEHICLE THEFT" +JG453780,typeOf,CrimeEvent,"" +JG453780,location,zip/60621,"" +JG453780,observationDate,"","09/29/2023 12:00:00 AM" +JG453780,includedIn,c/p/1,"" +JG453780,CASE,"",JG453780 +JG453780,PRIMARY_DESCRIPTION,"",THEFT +JG445684,typeOf,CrimeEvent,"" +JG445684,location,zip/60616,"" +JG445684,observationDate,"","09/30/2023 11:45:00 PM" +JG445684,includedIn,c/p/1,"" +JG445684,CASE,"",JG445684 +JG445684,PRIMARY_DESCRIPTION,"",BATTERY +JG495860,typeOf,CrimeEvent,"" +JG495860,location,zip/60637,"" +JG495860,observationDate,"","11/08/2023 3:00:00 AM" +JG495860,includedIn,c/p/1,"" +JG495860,CASE,"",JG495860 +JG495860,PRIMARY_DESCRIPTION,"","CRIMINAL DAMAGE" +JG497647,typeOf,CrimeEvent,"" +JG497647,location,zip/60616,"" +JG497647,observationDate,"","11/08/2023 3:00:00 PM" +JG497647,includedIn,c/p/1,"" +JG497647,CASE,"",JG497647 +JG497647,PRIMARY_DESCRIPTION,"","MOTOR VEHICLE THEFT" +JG445669,typeOf,CrimeEvent,"" +JG445669,location,zip/60620,"" +JG445669,observationDate,"","09/30/2023 9:30:00 PM" +JG445669,includedIn,c/p/1,"" +JG445669,CASE,"",JG445669 +JG445669,PRIMARY_DESCRIPTION,"",ASSAULT +JG445052,typeOf,CrimeEvent,"" +JG445052,location,zip/60618,"" +JG445052,observationDate,"","09/30/2023 10:01:00 AM" +JG445052,includedIn,c/p/1,"" +JG445052,CASE,"",JG445052 +JG445052,PRIMARY_DESCRIPTION,"",THEFT +JG481737,typeOf,CrimeEvent,"" +JG481737,location,zip/60611,"" +JG481737,observationDate,"","10/28/2023 3:34:00 AM" +JG481737,includedIn,c/p/1,"" +JG481737,CASE,"",JG481737 +JG481737,PRIMARY_DESCRIPTION,"",BATTERY +JG503521,typeOf,CrimeEvent,"" +JG503521,location,zip/60637,"" +JG503521,observationDate,"","11/13/2023 9:30:00 AM" +JG503521,includedIn,c/p/1,"" +JG503521,CASE,"",JG503521 +JG503521,PRIMARY_DESCRIPTION,"",BURGLARY +JG468840,typeOf,CrimeEvent,"" +JG468840,location,zip/60617,"" +JG468840,observationDate,"","10/15/2023 3:00:00 AM" +JG468840,includedIn,c/p/1,"" +JG468840,CASE,"",JG468840 +JG468840,PRIMARY_DESCRIPTION,"","CRIMINAL DAMAGE" +JG465660,typeOf,CrimeEvent,"" +JG465660,location,zip/60652,"" +JG465660,observationDate,"","10/15/2023 8:00:00 PM" +JG465660,includedIn,c/p/1,"" +JG465660,CASE,"",JG465660 +JG465660,PRIMARY_DESCRIPTION,"",THEFT +JG464615,typeOf,CrimeEvent,"" +JG464615,location,zip/60649,"" +JG464615,observationDate,"","10/15/2023 11:01:00 AM" +JG464615,includedIn,c/p/1,"" +JG464615,CASE,"",JG464615 +JG464615,PRIMARY_DESCRIPTION,"","MOTOR VEHICLE THEFT" +JG497095,typeOf,CrimeEvent,"" +JG497095,location,zip/60647,"" +JG497095,observationDate,"","11/08/2023 8:50:00 PM" +JG497095,includedIn,c/p/1,"" +JG497095,CASE,"",JG497095 +JG497095,PRIMARY_DESCRIPTION,"",dfbdshfbj +custom/g/Root,typeOf,StatVarGroup,"" +custom/g/Root,name,"","Custom Variables" +custom/g/Root,specializationOf,dc/g/Root,"" +Variable_1,typeOf,StatisticalVariable,"" +Variable_1,name,"","Variable 1" +Variable_1,memberOf,custom/g/group_1,"" +Variable_1,includedIn,c/p/1,"" +Variable_1,includedIn,c/s/1,"" +Variable_1,populationType,Thing,"" +Variable_1,statType,measuredValue,"" +Variable_1,measuredProperty,Variable_1,"" +Variable_2,typeOf,StatisticalVariable,"" +Variable_2,name,"","Variable 2" +Variable_2,memberOf,custom/g/Root,"" +Variable_2,includedIn,c/p/1,"" +Variable_2,includedIn,c/s/1,"" +Variable_2,populationType,Thing,"" +Variable_2,statType,measuredValue,"" +Variable_2,measuredProperty,Variable_2,"" +Crime_Count,typeOf,StatisticalVariable,"" +Crime_Count,name,"","Crime Count" +Crime_Count,description,"","Number of crimes" +Crime_Count,memberOf,custom/g/Root,"" +Crime_Count,includedIn,c/p/1,"" +Crime_Count,includedIn,c/s/1,"" +Crime_Count,populationType,Thing,"" +Crime_Count,statType,measuredValue,"" +Crime_Count,measuredProperty,Crime_Count,"" +CrimeEvent,typeOf,Class,"" +CrimeEvent,subClassOf,Event,"" +CrimeEvent,name,"",CrimeEvent +CrimeEvent,includedIn,c/p/1,"" +CrimeEvent,includedIn,c/s/1,"" +CASE,typeOf,Property,"" +CASE,name,"",CASE +PRIMARY_DESCRIPTION,typeOf,Property,"" +PRIMARY_DESCRIPTION,name,"","PRIMARY DESCRIPTION" +country/AFG,typeOf,Country,"" +country/YEM,typeOf,Country,"" +country/AGO,typeOf,Country,"" +country/ZMB,typeOf,Country,"" +country/ZWE,typeOf,Country,"" +country/ALB,typeOf,Country,"" +wikidataId/Q22062741,typeOf,Country,"" +country/DZA,typeOf,Country,"" +country/AND,typeOf,Country,"" +country/ASM,typeOf,Country,"" +country/AIA,typeOf,Country,"" +country/WLF,typeOf,Country,"" +country/ESH,typeOf,Country,"" +c/s/default,typeOf,Source,"" +c/s/default,name,"","Custom Data Commons" +c/s/1,typeOf,Source,"" +c/s/1,name,"","Source1 Name" +c/s/1,url,"",http://source1.com +c/s/1,domain,"",source1.com +c/p/default,typeOf,Provenance,"" +c/p/default,name,"","Custom Import" +c/p/default,source,c/s/default,"" +c/p/default,url,"",custom-import +c/p/1,typeOf,Provenance,"" +c/p/1,name,"","Provenance1 Name" +c/p/1,source,c/s/1,"" +c/p/1,url,"",http://source1.com/provenance1 +c/p/2,typeOf,Provenance,"" +c/p/2,name,"","Provenance2 Name" +c/p/2,source,c/s/1,"" +c/p/2,url,"",http://source1.com/provenance2 +custom/g/group_1,typeOf,StatVarGroup,"" +custom/g/group_1,name,"","Parent Group" +custom/g/group_1,specializationOf,dc/g/Root,"" +custom/g/group_1,includedIn,c/p/1,"" +custom/g/group_1,includedIn,c/p/2,"" +custom/g/group_1,includedIn,c/s/1,"" +custom/g/group_2,typeOf,StatVarGroup,"" +custom/g/group_2,name,"","Child Group 1" +custom/g/group_2,specializationOf,custom/g/group_1,"" +custom/g/group_2,includedIn,c/p/1,"" +custom/g/group_2,includedIn,c/p/2,"" +custom/g/group_2,includedIn,c/s/1,"" +custom/g/group_3,typeOf,StatVarGroup,"" +custom/g/group_3,name,"","Child Group 2" +custom/g/group_3,specializationOf,custom/g/group_1,"" +custom/g/group_3,includedIn,c/p/1,"" +custom/g/group_3,includedIn,c/p/2,"" +custom/g/group_3,includedIn,c/s/1,"" +var1,typeOf,StatisticalVariable,"" +var1,name,"","Good var1 name" +var1,description,"","Good var1 description" +var1,memberOf,custom/g/group_2,"" +var1,includedIn,c/p/1,"" +var1,includedIn,c/p/2,"" +var1,includedIn,c/s/1,"" +var1,populationType,Person,"" +var1,measuredProperty,age,"" +var1,statType,medianValue,"" +var1,gender,Female,"" +var2,typeOf,StatisticalVariable,"" +var2,name,"","Good var2 name" +var2,memberOf,custom/g/group_3,"" +var2,includedIn,c/p/1,"" +var2,includedIn,c/p/2,"" +var2,includedIn,c/s/1,"" +var2,populationType,Thing,"" +var2,statType,measuredValue,"" +var2,measuredProperty,var2,"" +country/USA,typeOf,Country,"" +country/IND,typeOf,Country,"" +dc/000qxlm93vn93,typeOf,PowerPlant,"" +dc/5c7tz3lbln3p,typeOf,PowerPlant,"" +dc/8zmh7ctlkbsc4,typeOf,PowerPlant,"" +dc/2ysvc67fk1162,typeOf,PowerPlant,"" +dc/00w9rbw8yn7x7,typeOf,PowerPlant,"" +dc/00zjgb4rjchx3,typeOf,PowerPlant,"" +dc/011s19rm0mzh1,typeOf,PowerPlant,"" +dc/017y3py1dzkmg,typeOf,PowerPlant,"" +dc/4359q0h458f01,typeOf,PowerPlant,"" +dc/3kds7zgl4wz26,typeOf,PowerPlant,"" +dc/02b53twnh3fx,typeOf,PowerPlant,"" +dc/dk2p9l3l8x1b6,typeOf,PowerPlant,"" +s2CellId/0x80982b0000000000,typeOf,S2CellLevel10,"" +s2CellId/0x3be7c90000000000,typeOf,S2CellLevel10,"" diff --git a/simple/stats/db.py b/simple/stats/db.py index 6e8bd2b7..4290a02f 100644 --- a/simple/stats/db.py +++ b/simple/stats/db.py @@ -69,6 +69,31 @@ _DELETE_TRIPLES_STATEMENT = "delete from triples" _INSERT_TRIPLES_STATEMENT = "insert into triples values(?, ?, ?, ?)" +_CREATE_TEMP_TRIPLES_TABLE = """ +create table if not exists temp_triples ( + subject_id varchar(255), + predicate varchar(255), + object_id varchar(255), + object_value TEXT +); +""" + +_INSERT_TEMP_TRIPLES_STATEMENT = "insert into temp_triples values(?, ?, ?, ?)" + +_DELETE_MATCHING_TRIPLES_STATEMENT = """ + DELETE FROM triples + WHERE (subject_id, predicate) IN ( + SELECT subject_id, predicate FROM temp_triples + ) +""" + +_INSERT_FROM_TEMP_TRIPLES_STATEMENT = """ + INSERT INTO triples + SELECT * FROM temp_triples +""" + +_DROP_TEMP_TRIPLES_TABLE = "DROP TABLE IF EXISTS temp_triples" + _CREATE_OBSERVATIONS_TABLE = """ create table if not exists observations ( entity varchar(255), @@ -82,6 +107,32 @@ _DELETE_OBSERVATIONS_STATEMENT = "delete from observations" _INSERT_OBSERVATIONS_STATEMENT = "insert into observations values(?, ?, ?, ?, ?)" +_CREATE_TEMP_OBSERVATIONS_TABLE = """ +create table if not exists temp_observations ( + entity varchar(255), + variable varchar(255), + date varchar(255), + value varchar(255), + provenance varchar(255) +); +""" + +_INSERT_TEMP_OBSERVATIONS_STATEMENT = "insert into temp_observations values(?, ?, ?, ?, ?)" + +_DELETE_MATCHING_OBSERVATIONS_STATEMENT = """ + DELETE FROM observations + WHERE (entity, variable, date) IN ( + SELECT entity, variable, date FROM temp_observations + ) +""" + +_INSERT_FROM_TEMP_OBSERVATIONS_STATEMENT = """ + INSERT INTO observations + SELECT * FROM temp_observations +""" + +_DROP_TEMP_OBSERVATIONS_TABLE = "DROP TABLE IF EXISTS temp_observations" + _CREATE_IMPORTS_TABLE = """ create table if not exists imports ( imported_at datetime, @@ -101,6 +152,14 @@ _DELETE_OBSERVATIONS_STATEMENT ] +_INIT_STATEMENTS_INCREMENTAL = [ + _CREATE_TRIPLES_TABLE, + _CREATE_OBSERVATIONS_TABLE, + _CREATE_IMPORTS_TABLE, + _CREATE_TEMP_TRIPLES_TABLE, + _CREATE_TEMP_OBSERVATIONS_TABLE, +] + OBSERVATIONS_TMCF = """Node: E:Table->E0 typeOf: dcs:StatVarObservation variableMeasured: C:Table->variable @@ -146,9 +205,10 @@ class MainDcDb(Db): Triples will be output as schema MCF. """ - def __init__(self, db_params: dict) -> None: + def __init__(self, db_params: dict, incremental: bool) -> None: assert db_params assert MAIN_DC_OUTPUT_DIR in db_params + assert not incremental, "Incremental mode not supported for main DC." self.output_dir_fh = create_file_handler(db_params[MAIN_DC_OUTPUT_DIR], is_dir=True) @@ -193,15 +253,26 @@ def _add_triple(self, triple: Triple): class SqlDb(Db): """Class to insert triples and observations into a SQL DB.""" - def __init__(self, config: dict) -> None: + def __init__(self, config: dict, incremental: bool) -> None: self.engine = create_db_engine(config) self.num_observations = 0 self.variables: set[str] = set() + self.incremental = incremental + if self.incremental: + for statement in _INIT_STATEMENTS_INCREMENTAL: + self.engine.execute(statement) + else: + for statement in _INIT_STATEMENTS: + self.engine.execute(statement) def insert_triples(self, triples: list[Triple]): logging.info("Writing %s triples to [%s]", len(triples), self.engine) - self.engine.executemany(_INSERT_TRIPLES_STATEMENT, - [to_triple_tuple(triple) for triple in triples]) + if self.incremental: + self.engine.executemany(_INSERT_TEMP_TRIPLES_STATEMENT, + [to_triple_tuple(triple) for triple in triples]) + else: + self.engine.executemany(_INSERT_TRIPLES_STATEMENT, + [to_triple_tuple(triple) for triple in triples]) def insert_observations(self, observations: list[Observation], input_file_name: str): @@ -212,8 +283,10 @@ def insert_observations(self, observations: list[Observation], for observation in observations: tuples.append(to_observation_tuple(observation)) self.variables.add(observation.variable) - - self.engine.executemany(_INSERT_OBSERVATIONS_STATEMENT, tuples) + if self.incremental: + self.engine.executemany(_INSERT_TEMP_OBSERVATIONS_STATEMENT, tuples) + else: + self.engine.executemany(_INSERT_OBSERVATIONS_STATEMENT, tuples) def insert_import_info(self, status: ImportStatus): metadata = self._import_metadata() @@ -224,14 +297,26 @@ def insert_import_info(self, status: ImportStatus): (str(datetime.now()), status.name, json.dumps(metadata))) def commit_and_close(self): + if self.incremental: + self._update_observations_and_triples() self.engine.commit_and_close() def _import_metadata(self) -> dict: return { "numVars": len(self.variables), "numObs": self.num_observations, + "incremental": self.incremental, } + def _update_observations_and_triples(self): + self.engine.execute(_DELETE_MATCHING_OBSERVATIONS_STATEMENT) + self.engine.execute(_INSERT_FROM_TEMP_OBSERVATIONS_STATEMENT) + self.engine.execute(_DROP_TEMP_OBSERVATIONS_TABLE) + + self.engine.execute(_DELETE_MATCHING_TRIPLES_STATEMENT) + self.engine.execute(_INSERT_FROM_TEMP_TRIPLES_STATEMENT) + self.engine.execute(_DROP_TEMP_TRIPLES_TABLE) + def to_triple_tuple(triple: Triple): return (_strip_namespace(triple.subject_id), triple.predicate, @@ -277,8 +362,6 @@ def __init__(self, db_params: dict) -> None: logging.info("Connected to SQLite: %s", self.local_db_file_path) self.cursor = self.connection.cursor() - for statement in _INIT_STATEMENTS: - self.cursor.execute(statement) def __str__(self) -> str: return f"{TYPE_SQLITE}: {self.db_file_path}" @@ -313,7 +396,7 @@ def commit_and_close(self): _CLOUD_MY_SQL_PARAMS = [CLOUD_MY_SQL_INSTANCE] + _CLOUD_MY_SQL_CONNECT_PARAMS -class CloudSqlDbEngine: +class CloudSqlDbEngine(DbEngine): def __init__(self, db_params: dict[str, str]) -> None: for param in _CLOUD_MY_SQL_PARAMS: @@ -328,8 +411,6 @@ def __init__(self, db_params: dict[str, str]) -> None: db_params[CLOUD_MY_SQL_INSTANCE], db_params[CLOUD_MY_SQL_DB]) self.description = f"{TYPE_CLOUD_SQL}: {db_params[CLOUD_MY_SQL_INSTANCE]} ({db_params[CLOUD_MY_SQL_DB]})" self.cursor: Cursor = self.connection.cursor() - for statement in _INIT_STATEMENTS: - self.cursor.execute(statement) def __str__(self) -> str: return self.description @@ -372,11 +453,11 @@ def create_db_engine(config: dict) -> DbEngine: assert False -def create_db(config: dict) -> Db: +def create_db(config: dict, incremental: bool) -> Db: db_type = config[FIELD_DB_TYPE] if db_type and db_type == TYPE_MAIN_DC: - return MainDcDb(config) - return SqlDb(config) + return MainDcDb(config, incremental) + return SqlDb(config, incremental) def create_sqlite_config(sqlite_db_file_path: str) -> dict: diff --git a/simple/stats/main.py b/simple/stats/main.py index 54132d06..e7d27fbb 100644 --- a/simple/stats/main.py +++ b/simple/stats/main.py @@ -45,6 +45,12 @@ constants.DEFAULT_FROZEN_TIME, "If freeze_time is True, the time that the run is frozen at.", ) +flags.DEFINE_bool( + "incremental", + False, + "If True, update existing records in the database and only add new records. " + "If False, delete all existing records and insert new records.", +) # If running with time frozen, the packages to be ignored. # i.e. packages where time should not be frozen if it leads to errant behavior. @@ -64,7 +70,8 @@ def _run(): Runner(config_file=FLAGS.config_file, input_dir=FLAGS.input_dir, output_dir=FLAGS.output_dir, - mode=FLAGS.mode).run() + mode=FLAGS.mode, + incremental=FLAGS.incremental).run() def main(_): diff --git a/simple/stats/nl.py b/simple/stats/nl.py index 8243877a..3ef994e7 100644 --- a/simple/stats/nl.py +++ b/simple/stats/nl.py @@ -23,7 +23,8 @@ _SENTENCE_SEPARATOR = ";" -def generate_sv_sentences(svs: list[StatVar], sentences_fh: FileHandler): +def generate_sv_sentences(svs: list[StatVar], sentences_fh: FileHandler, + incremental: bool): """Generates sentences based on the name, description and NL sentences of the specified SVs. The SV dcids and sentences are written to a CSV using the specified FileHandler @@ -35,6 +36,11 @@ def generate_sv_sentences(svs: list[StatVar], sentences_fh: FileHandler): dataframe = pd.DataFrame(rows) logging.info("Writing %s SV sentences to: %s", dataframe.size, sentences_fh) + if incremental: + data = sentences_fh.read_string_io() + existing_df = pd.read_csv(data) + dataframe = pd.concat([existing_df, dataframe], ignore_index=True) + dataframe.drop_duplicates(inplace=True) sentences_fh.write_string(dataframe.to_csv(index=False)) diff --git a/simple/stats/reporter.py b/simple/stats/reporter.py index 1b90546c..cde696ae 100644 --- a/simple/stats/reporter.py +++ b/simple/stats/reporter.py @@ -37,6 +37,7 @@ def _is_done_status(status: Status) -> bool: return status == Status.SUCCESS or status == Status.FAILURE +# TODO: Fix ImportReporter for incremental mode. class ImportReporter: """Generates a report on every reported change. diff --git a/simple/stats/runner.py b/simple/stats/runner.py index cb8f32c3..abb8b07f 100644 --- a/simple/stats/runner.py +++ b/simple/stats/runner.py @@ -52,11 +52,13 @@ def __init__(self, config_file: str, input_dir: str, output_dir: str, - mode: RunMode = RunMode.CUSTOM_DC) -> None: + mode: RunMode = RunMode.CUSTOM_DC, + incremental: bool = False) -> None: assert config_file or input_dir, "One of config_file or input_dir must be specified" assert output_dir, "output_dir must be specified" self.mode = mode + self.incremental = incremental self.input_handlers: list[FileHandler] = [] # Config file driven. @@ -119,7 +121,7 @@ def _get_db_config() -> dict: return create_sqlite_config( self.output_dir_fh.make_file(constants.DB_FILE_NAME).path) - self.db = create_db(_get_db_config()) + self.db = create_db(_get_db_config(), self.incremental) self.nodes = Nodes(self.config) def run(self): @@ -135,7 +137,8 @@ def run(self): # Generate SV sentences. nl.generate_sv_sentences( list(self.nodes.variables.values()), - self.nl_dir_fh.make_file(constants.SENTENCES_FILE_NAME)) + self.nl_dir_fh.make_file(constants.SENTENCES_FILE_NAME), + self.incremental) # Write import info to DB. self.db.insert_import_info(status=ImportStatus.SUCCESS)