Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add incremental mode support in Simple Stats Importer tool. #296

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,33 @@ function run_sample {
deactivate
}

#######################################
# Runs the simple stats importer twice against the incremental sample data:
# first a regular import of sample/input_incremental_1, then an import of
# sample/input_incremental_2 with --incremental into the same output folder.
# Afterwards the observations, triples and imports tables of the resulting
# SQLite database are dumped to CSV files under
# sample/output_incremental/tables.
#
# Globals:   USE_CLOUDSQL (exported, set to "false")
# Arguments: none
# Outputs:   progress messages on stdout; errors on stderr
# Returns:   non-zero if the "simple" directory cannot be entered or the
#            starting directory cannot be restored
#######################################
function run_sample_incremental {
  # Do not use Cloud SQL.
  export USE_CLOUDSQL=false

  # Remember where we started so the caller's working directory is restored;
  # the option loop may dispatch further modes after this one.
  local start_dir="${PWD}"
  local output_dir="sample/output_incremental"
  local db_file="${output_dir}/datacommons.db"
  local table

  python3 -m venv .env
  source .env/bin/activate

  # Everything below uses paths relative to simple/, so bail out rather than
  # installing, deleting and importing in the wrong directory.
  if ! cd simple; then
    echo "ERROR: could not change into the 'simple' directory." >&2
    deactivate
    return 1
  fi
  pip3 install -r requirements.txt

  echo "Deleting existing datacommons.db file."
  rm -f "${db_file}"

  echo "Running sample."
  python3 -m stats.main --input_dir=sample/input_incremental_1 --output_dir="${output_dir}" --freeze_time
  echo "Running sample again."
  python3 -m stats.main --input_dir=sample/input_incremental_2 --output_dir="${output_dir}" --freeze_time --incremental

  echo "Writing tables to CSVs."
  mkdir -p "${output_dir}/tables"
  for table in observations triples imports; do
    sqlite3 -header -csv "${db_file}" "select * from ${table};" > "${output_dir}/tables/${table}.csv"
  done

  deactivate
  cd "${start_dir}" || return 1
}

function run_main_dc_sample {
python3 -m venv .env
source .env/bin/activate
Expand Down Expand Up @@ -199,6 +226,11 @@ while [[ "$#" -gt 0 ]]; do
run_sample
shift 1
;;
-i)
echo -e "### Running sample incremental"
run_sample_incremental
shift 1
;;
*)
help
exit 1
Expand Down
68 changes: 68 additions & 0 deletions simple/sample/input_incremental_1/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"inputFiles": {
"countries.csv": {
"importType": "observations",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"geoids.csv": {
"importType": "observations",
"entityType": "",
"ignoreColumns": [
"ignore1",
"ignore2"
],
"provenance": "Provenance1 Name"
},
"latlng_events.csv": {
"importType": "events",
"eventType": "CrimeEvent",
"entityType": "CensusZipCodeTabulationArea",
"provenance": "Provenance1 Name",
"idColumn": "CASE",
"computedVariables": [
"Crime Count"
]
}
},
"variables": {
"var1": {
"name": "Good var1 name",
"description": "Good var1 description",
"nlSentences": [
"Natural language sentence 1",
"Natural language sentence 2"
],
"group": "Parent Group/Child Group 1",
"properties": {
"populationType": "Person",
"measuredProperty": "age",
"statType": "medianValue",
"gender": "Female"
}
},
"var2": {
"name": "Good var2 name",
"group": "Parent Group/Child Group 2"
},
"Variable 1": {
"group": "Parent Group"
},
"Crime Count": {
"description": "Number of crimes",
"aggregation": {
"period": "month",
"method": "count"
}
}
},
"sources": {
"Source1 Name": {
"url": "http://source1.com",
"provenances": {
"Provenance1 Name": "http://source1.com/provenance1",
"Provenance2 Name": "http://source1.com/provenance2"
}
}
}
}
15 changes: 15 additions & 0 deletions simple/sample/input_incremental_1/countries.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
place,year,var1,var2
Afghanistan,2023,0.19,6
Yemen,2023,0.21,56
Angola,2023,0.29,6
Zambia,2023,0.31,34
Zimbabwe,2023,0.37,76
Albania,2023,0.50,34
dcid: wikidataId/Q22062741,2023,0.50,97
Algeria,2023,0.52,92
West Bank and Gaza,2023,0.53,64
Andorra,2023,0.76,9
American Samoa,2023,#N/A,34
Anguilla,2023,#N/A,42
Wallis and Futuna Islands,2023,#N/A,75
Western Sahara,2023,#N/A,65
3 changes: 3 additions & 0 deletions simple/sample/input_incremental_1/geoids.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
geoId,year,ignore1,Variable 1,Variable 2,ignore2
01,2021,foo,555, 666 ,bar
122,2022,#N/A,321 , "123,456",baz
52 changes: 52 additions & 0 deletions simple/sample/input_incremental_1/latlng_events.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
lat#lng,DATE OF OCCURRENCE,CASE,PRIMARY DESCRIPTION
41.927407329#-87.70729439,11/08/2023 8:50:00 PM,JG497095,THEFT
41.896671699#-87.628635323,11/08/2023 3:14:00 PM,JG496991,ASSAULT
41.808525157#-87.672792896,11/08/2023 10:55:00 PM,JG497145,ASSAULT
41.979505088#-87.693158103,11/08/2023 4:39:00 PM,JG496701,OTHER OFFENSE
41.771890947#-87.638705659,10/28/2023 7:30:00 PM,JG484195,THEFT
41.985611859#-87.713834343,10/28/2023 5:00:00 PM,JG483131,CRIMINAL DAMAGE
41.733053891#-87.568330657,11/08/2023 8:25:00 AM,JG498494,ASSAULT
41.949586612#-87.664085689,11/08/2023 2:38:00 PM,JG496575,THEFT
41.704388397#-87.626879123,09/17/2023 3:00:00 AM,JG427641,THEFT
41.881944424#-87.634195294,08/02/2023 9:25:00 AM,JG365961,ASSAULT
41.755481563#-87.649019949,11/08/2023 9:00:00 AM,JG496115,THEFT
41.970433391#-87.763029002,11/08/2023 7:45:00 PM,JG496955,ROBBERY
41.802269632#-87.605372566,11/08/2023 3:00:00 PM,JG501047,BURGLARY
41.721303358#-87.655873595,11/08/2023 1:00:00 PM,JG496779,ASSAULT
41.884497529#-87.625838595,11/08/2023 11:00:00 AM,JG496296,THEFT
41.778436411#-87.589657198,11/05/2023 6:00:00 PM,JG504330,OFFENSE INVOLVING CHILDREN
41.838219696#-87.704850674,11/08/2023 1:49:00 PM,JG496568,ASSAULT
41.70319162#-87.651369057,11/08/2023 11:30:00 AM,JG496295,ROBBERY
41.883969722#-87.644191276,10/28/2023 9:00:00 AM,JG488191,MOTOR VEHICLE THEFT
41.884276844#-87.622098929,10/28/2023 12:10:00 PM,JG482122,ROBBERY
41.87493626#-87.748170814,10/28/2023 1:30:00 AM,JG481621,CRIMINAL DAMAGE
41.95417672#-87.677232056,11/08/2023 6:30:00 PM,JG499040,CRIMINAL TRESPASS
41.948044095#-87.664039332,11/08/2023 9:01:00 PM,JG497052,BURGLARY
41.875625633#-87.629450396,12/14/2022 7:00:00 PM,JF511492,THEFT
41.976489992#-87.788483018,11/08/2023 10:40:00 PM,JG498785,THEFT
41.828080528#-87.686233684,11/08/2023 12:02:00 PM,JG496372,BURGLARY
41.993043969#-87.660360363,10/28/2023 9:15:00 AM,JG481891,THEFT
41.858444489#-87.716414102,10/28/2023 2:29:00 PM,JG482300,OTHER OFFENSE
41.879874073#-87.769750673,10/28/2023 2:00:00 AM,JG482034,THEFT
41.771296232#-87.729149311,08/31/2023 7:00:00 PM,JG406115,BATTERY
41.875679322#-87.62657476,09/08/2023 8:00:00 AM,JG416492,DECEPTIVE PRACTICE
41.82539977#-87.637026874,08/31/2023 10:52:00 AM,JG405111,BATTERY
41.863196881#-87.614817819,12/07/2022 1:39:00 PM,JF501686,CRIMINAL TRESPASS
41.93743245#-87.649180491,10/28/2023 7:41:00 AM,JG484000,THEFT
41.680799541#-87.669942159,10/28/2023 10:40:00 PM,JG482660,ASSAULT
41.891874434#-87.647617474,10/15/2023 2:30:00 AM,JG464444,ROBBERY
41.888993854#-87.626934833,12/08/2022 3:38:00 PM,JF254640,HOMICIDE
41.902821551#-87.775389625,10/28/2023 4:00:00 PM,JG483104,THEFT
41.724654303#-87.622283278,08/31/2023 9:13:00 PM,JG412467,MOTOR VEHICLE THEFT
41.773780824#-87.645848665,09/29/2023 12:00:00 AM,JG453780,THEFT
41.841289747#-87.628142362,09/30/2023 11:45:00 PM,JG445684,BATTERY
41.776150283#-87.615522623,11/08/2023 3:00:00 AM,JG495860,CRIMINAL DAMAGE
41.836069707#-87.613033345,11/08/2023 3:00:00 PM,JG497647,MOTOR VEHICLE THEFT
41.74974473#-87.652507329,09/30/2023 9:30:00 PM,JG445669,ASSAULT
41.946653043#-87.700875462,09/30/2023 10:01:00 AM,JG445052,THEFT
41.890400093#-87.628021143,10/28/2023 3:34:00 AM,JG481737,BATTERY
41.793842185#-87.620286919,11/13/2023 9:30:00 AM,JG503521,BURGLARY
41.720900408#-87.554599376,10/15/2023 3:00:00 AM,JG468840,CRIMINAL DAMAGE
41.742267488#-87.702192623,10/15/2023 8:00:00 PM,JG465660,THEFT
41.766298978#-87.570076538,10/15/2023 11:01:00 AM,JG464615,MOTOR VEHICLE THEFT
41.927407329#-87.70729439,11/08/2023 8:50:00 PM,JG497095,dfbdshfbj
54 changes: 54 additions & 0 deletions simple/sample/input_incremental_2/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"inputFiles": {
"latlng.csv": {
"importType": "observations",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"powerplants.csv": {
"importType": "observations",
"entityType": "PowerPlant",
"provenance": "Provenance2 Name"
},
"s2cells.csv": {
"importType": "observations",
"entityType": "S2CellLevel10",
"provenance": "Provenance2 Name"
},
"wikidataids.csv": {
"importType": "observations",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
},
"variables": {
"var1": {
"name": "Good var1 name",
"description": "Good var1 description",
"nlSentences": [
"Natural language sentence 1",
"Natural language sentence 2"
],
"group": "Parent Group/Child Group 1",
"properties": {
"populationType": "Person",
"measuredProperty": "age",
"statType": "medianValue",
"gender": "Female"
}
},
"var2": {
"name": "Good var2 name",
"group": "Parent Group/Child Group 2"
}
},
"sources": {
"Source1 Name": {
"url": "http://source1.com",
"provenances": {
"Provenance1 Name": "http://source1.com/provenance1",
"Provenance2 Name": "http://source1.com/provenance2"
}
}
}
}
3 changes: 3 additions & 0 deletions simple/sample/input_incremental_2/latlng.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
lat#lng,year,var1,var2
38.7#-119.4,2021,555,666
19.076#72.877,2022,321,123
15 changes: 15 additions & 0 deletions simple/sample/input_incremental_2/powerplants.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
powerplant,year,var1,var2
Suzlon Project,2023,0.19,6
Crete Energy Venture,2023,0.21,56
Watchtower Educational Center,2023,0.29,6
Union Power,2023,0.31,34
Pearl Station,2023,0.37,76
Austin Gas Recovery,2023,0.50,34
FOO BAR,2023,0.50,97
Gordon,2023,0.52,92
BAZ BAR,2023,0.53,64
White River Lock and Dam 2,2023,0.76,9
Bristol Plant,2023,#N/A,34
Edison Sault,2023,#N/A,42
Navajo Dam,2023,#N/A,75
CNN Center,2023,#N/A,65
3 changes: 3 additions & 0 deletions simple/sample/input_incremental_2/s2cells.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
lat#lng,year,var1,var2
38.7#-119.4,2021,555,666
19.076#72.877,2022,321,123
3 changes: 3 additions & 0 deletions simple/sample/input_incremental_2/wikidataids.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
wikidataid,year,var1,var2
Q30,2021,555,666
Q668,2022,321,123
2 changes: 1 addition & 1 deletion simple/sample/output/tables/imports.csv
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
imported_at,status,metadata
"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 5, ""numObs"": 99}"
"2023-01-01 00:00:00",SUCCESS,"{""numVars"": 5, ""numObs"": 99, ""incremental"": false}"
6 changes: 6 additions & 0 deletions simple/sample/output_incremental/nl/sentences.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dcid,sentence
var1,Good var1 name;Good var1 description;Natural language sentence 1;Natural language sentence 2
var2,Good var2 name
Variable_1,Variable 1
Variable_2,Variable 2
Crime_Count,Crime Count;Number of crimes
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
input,dcid,link
West Bank and Gaza,*UNRESOLVED*,
dcid: wikidataId/Q22062741,wikidataId/Q22062741,https://datacommons.org/browser/wikidataId/Q22062741
Afghanistan,country/AFG,https://datacommons.org/browser/country/AFG
Albania,country/ALB,https://datacommons.org/browser/country/ALB
Algeria,country/DZA,https://datacommons.org/browser/country/DZA
American Samoa,country/ASM,https://datacommons.org/browser/country/ASM
Andorra,country/AND,https://datacommons.org/browser/country/AND
Angola,country/AGO,https://datacommons.org/browser/country/AGO
Anguilla,country/AIA,https://datacommons.org/browser/country/AIA
Wallis and Futuna Islands,country/WLF,https://datacommons.org/browser/country/WLF
Western Sahara,country/ESH,https://datacommons.org/browser/country/ESH
Yemen,country/YEM,https://datacommons.org/browser/country/YEM
Zambia,country/ZMB,https://datacommons.org/browser/country/ZMB
Zimbabwe,country/ZWE,https://datacommons.org/browser/country/ZWE
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input,dcid,link
01,geoId/01,https://datacommons.org/browser/geoId/01
122,geoId/122,https://datacommons.org/browser/geoId/122
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input,dcid,link
38.7#-119.4,country/USA,https://datacommons.org/browser/country/USA
19.076#72.877,country/IND,https://datacommons.org/browser/country/IND
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
input,dcid,link
41.927407329#-87.70729439,zip/60647,https://datacommons.org/browser/zip/60647
41.896671699#-87.628635323,zip/60654,https://datacommons.org/browser/zip/60654
41.808525157#-87.672792896,zip/60609,https://datacommons.org/browser/zip/60609
41.979505088#-87.693158103,zip/60625,https://datacommons.org/browser/zip/60625
41.771890947#-87.638705659,zip/60621,https://datacommons.org/browser/zip/60621
41.985611859#-87.713834343,zip/60659,https://datacommons.org/browser/zip/60659
41.733053891#-87.568330657,zip/60617,https://datacommons.org/browser/zip/60617
41.949586612#-87.664085689,zip/60613,https://datacommons.org/browser/zip/60613
41.704388397#-87.626879123,zip/60628,https://datacommons.org/browser/zip/60628
41.881944424#-87.634195294,zip/60606,https://datacommons.org/browser/zip/60606
41.755481563#-87.649019949,zip/60620,https://datacommons.org/browser/zip/60620
41.970433391#-87.763029002,zip/60630,https://datacommons.org/browser/zip/60630
41.802269632#-87.605372566,zip/60615,https://datacommons.org/browser/zip/60615
41.721303358#-87.655873595,zip/60643,https://datacommons.org/browser/zip/60643
41.884497529#-87.625838595,zip/60601,https://datacommons.org/browser/zip/60601
41.778436411#-87.589657198,zip/60637,https://datacommons.org/browser/zip/60637
41.838219696#-87.704850674,zip/60623,https://datacommons.org/browser/zip/60623
41.70319162#-87.651369057,zip/60643,https://datacommons.org/browser/zip/60643
41.883969722#-87.644191276,zip/60661,https://datacommons.org/browser/zip/60661
41.884276844#-87.622098929,zip/60601,https://datacommons.org/browser/zip/60601
41.87493626#-87.748170814,zip/60644,https://datacommons.org/browser/zip/60644
41.95417672#-87.677232056,zip/60613,https://datacommons.org/browser/zip/60613
41.948044095#-87.664039332,zip/60613,https://datacommons.org/browser/zip/60613
41.875625633#-87.629450396,zip/60605,https://datacommons.org/browser/zip/60605
41.976489992#-87.788483018,zip/60630,https://datacommons.org/browser/zip/60630
41.828080528#-87.686233684,zip/60609,https://datacommons.org/browser/zip/60609
41.993043969#-87.660360363,zip/60660,https://datacommons.org/browser/zip/60660
41.858444489#-87.716414102,zip/60623,https://datacommons.org/browser/zip/60623
41.879874073#-87.769750673,zip/60644,https://datacommons.org/browser/zip/60644
41.771296232#-87.729149311,zip/60629,https://datacommons.org/browser/zip/60629
41.875679322#-87.62657476,zip/60605,https://datacommons.org/browser/zip/60605
41.82539977#-87.637026874,zip/60609,https://datacommons.org/browser/zip/60609
41.863196881#-87.614817819,zip/60605,https://datacommons.org/browser/zip/60605
41.93743245#-87.649180491,zip/60657,https://datacommons.org/browser/zip/60657
41.680799541#-87.669942159,zip/60643,https://datacommons.org/browser/zip/60643
41.891874434#-87.647617474,zip/60642,https://datacommons.org/browser/zip/60642
41.888993854#-87.626934833,zip/60611,https://datacommons.org/browser/zip/60611
41.902821551#-87.775389625,zip/60651,https://datacommons.org/browser/zip/60651
41.724654303#-87.622283278,zip/60619,https://datacommons.org/browser/zip/60619
41.773780824#-87.645848665,zip/60621,https://datacommons.org/browser/zip/60621
41.841289747#-87.628142362,zip/60616,https://datacommons.org/browser/zip/60616
41.776150283#-87.615522623,zip/60637,https://datacommons.org/browser/zip/60637
41.836069707#-87.613033345,zip/60616,https://datacommons.org/browser/zip/60616
41.74974473#-87.652507329,zip/60620,https://datacommons.org/browser/zip/60620
41.946653043#-87.700875462,zip/60618,https://datacommons.org/browser/zip/60618
41.890400093#-87.628021143,zip/60611,https://datacommons.org/browser/zip/60611
41.793842185#-87.620286919,zip/60637,https://datacommons.org/browser/zip/60637
41.720900408#-87.554599376,zip/60617,https://datacommons.org/browser/zip/60617
41.742267488#-87.702192623,zip/60652,https://datacommons.org/browser/zip/60652
41.766298978#-87.570076538,zip/60649,https://datacommons.org/browser/zip/60649
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
input,dcid,link
BAZ BAR,*UNRESOLVED*,
FOO BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p
Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4
Union Power,dc/2ysvc67fk1162,https://datacommons.org/browser/dc/2ysvc67fk1162
Pearl Station,dc/00w9rbw8yn7x7,https://datacommons.org/browser/dc/00w9rbw8yn7x7
Austin Gas Recovery,dc/00zjgb4rjchx3,https://datacommons.org/browser/dc/00zjgb4rjchx3
Gordon,dc/011s19rm0mzh1,https://datacommons.org/browser/dc/011s19rm0mzh1
White River Lock and Dam 2,dc/017y3py1dzkmg,https://datacommons.org/browser/dc/017y3py1dzkmg
Bristol Plant,dc/4359q0h458f01,https://datacommons.org/browser/dc/4359q0h458f01
Edison Sault,dc/3kds7zgl4wz26,https://datacommons.org/browser/dc/3kds7zgl4wz26
Navajo Dam,dc/02b53twnh3fx,https://datacommons.org/browser/dc/02b53twnh3fx
CNN Center,dc/dk2p9l3l8x1b6,https://datacommons.org/browser/dc/dk2p9l3l8x1b6
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input,dcid,link
38.7#-119.4,s2CellId/0x80982b0000000000,https://datacommons.org/browser/s2CellId/0x80982b0000000000
19.076#72.877,s2CellId/0x3be7c90000000000,https://datacommons.org/browser/s2CellId/0x3be7c90000000000
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input,dcid,link
Q668,country/IND,https://datacommons.org/browser/country/IND
Q30,country/USA,https://datacommons.org/browser/country/USA
Loading