From 63990004af5f6ff335eb4f65695518be4772f648 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Wed, 26 Oct 2022 15:54:31 -0500 Subject: [PATCH 1/9] save wip --- data/processors/blank_columns.py | 24 +++++++ docket.mk | 57 ++++++++++++++--- fixtures/import_docket.csv | 52 +++++++-------- requirements.txt | 1 + .../commands/download_country_data.py | 18 +++--- .../commands/import_country_data.py | 3 +- sfm_pc/views.py | 63 ++++++++++++++++--- templates/download.html | 8 +-- 8 files changed, 167 insertions(+), 59 deletions(-) create mode 100644 data/processors/blank_columns.py diff --git a/data/processors/blank_columns.py b/data/processors/blank_columns.py new file mode 100644 index 00000000..e751dc88 --- /dev/null +++ b/data/processors/blank_columns.py @@ -0,0 +1,24 @@ +import csv +import sys +import argparse + +# init arg parser +parser = argparse.ArgumentParser() +parser.add_argument('--entity', type=str, required=True) +args = parser.parse_args() + +# init the incoming data as a dict reader +lines = iter(line.decode('utf-8').strip() for line in sys.stdin.buffer.readlines()) +header = next(lines).split(',') +reader = csv.DictReader(lines, fieldnames=header) + +# write data to stdout +stdout_csv = csv.writer(sys.stdout) +stdout_csv.writerow(header) + +for row in reader: + row.update({ + f'{args.entity}:comments:admin': '', + f'{args.entity}:owner:admin': '' + }) + stdout_csv.writerow(row.values()) diff --git a/docket.mk b/docket.mk index 1ab2ac88..0ba72e7b 100644 --- a/docket.mk +++ b/docket.mk @@ -1,12 +1,12 @@ -.PHONY: sfm_pc/management/commands/country_data +.PHONY: sfm_pc/management/commands/country_data/countries -%_import : %.csv sfm_pc/management/commands/country_data +%_import : %.csv sfm_pc/management/commands/country_data/countries perl -pe "s/,/ /g" $< | \ xargs -L1 bash -c ' \ echo "Loading data for country code $$3" && (\ python -u manage.py import_country_data \ --country_code $$3 \ - --country_path $(word 2, $^)/countries/$$3 \ + --country_path 
$(word 2, $^)/$$4 \ --sources_path $(word 2, $^)/sources.csv || \ exit 255 \ )' @@ -16,20 +16,57 @@ DATA_ARCHIVE_BUCKET := $(shell cat configs/s3_config.json | jq -r '.data_archive data_archive : wwic_download.zip aws s3 cp $< s3://$(DATA_ARCHIVE_BUCKET)/ -wwic_download.zip : sfm_pc/management/commands/country_data - # move into the target directory, zip to the root dir - cd $< && zip -r ../../../../$@ . +wwic_download.zip : filtered_data data/wwic_download/sources.csv + cd data && zip -r ../$@ . -sfm_pc/management/commands/country_data : import_docket.csv +# COUNTRY_CODES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f4) +COUNTRY_NAMES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f5) +ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson + +# .PHONY : filtered_data +# filtered_data: $(foreach country,$(COUNTRY_CODES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) data/countries +# echo "filtered csvs for entities" + +# .PHONY : filtered_data +# filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)/%,$(ENTITIES))) +# echo "filtered csvs for entities" + +test_% : $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) + echo $< + +data/wwic_download/%_units.csv : sfm_pc/management/commands/country_data/countries/%/units.csv + $(call filter_entity_data,unit) + +data/wwic_download/%_persons.csv : sfm_pc/management/commands/country_data/countries/%/persons.csv + $(call filter_entity_data,person) + +data/wwic_download/%_incidents.csv : sfm_pc/management/commands/country_data/countries/%/incidents.csv + $(call filter_entity_data,incident) + +data/wwic_download/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv + $(call filter_entity_data,location) + +data/wwic_download/sources.csv : sfm_pc/management/commands/country_data/countries/sources.csv + $(call filter_entity_data,source) + 
+data/wwic_download/%_locations.geojson : sfm_pc/management/commands/country_data/countries/%/locations.geojson + cp $< $@ + +define filter_entity_data + $(shell csvgrep --columns $(1):status:admin --match 3 $< | \ + python data/processors/blank_columns.py --entity $(1) > $@) +endef + +sfm_pc/management/commands/country_data/countries : import_docket.csv perl -pe "s/,/ /g" $< | \ xargs -L1 bash -c ' \ - echo "Importing data for country code $$3" && (\ + echo "Importing data for country $$4" && (\ python -u manage.py download_country_data \ --sources_doc_id $$0 \ --location_doc_id $$1 \ --entity_doc_id $$2 \ - --country_code $$3 \ - --parent_directory $@ || \ + --country_name $$4 \ + --target_directory $@ || \ exit 255 \ )' diff --git a/fixtures/import_docket.csv b/fixtures/import_docket.csv index 021d511f..9a5f94ca 100644 --- a/fixtures/import_docket.csv +++ b/fixtures/import_docket.csv @@ -1,26 +1,26 @@ -source_document_id,location_document_id,entitity_document_id,sfm:iso -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1ztOfGaQT3WDrq-BOjT0x5VErzgrWQ0Ku,19Hk6OD5AYjWQCUaTWPzkGm9sLqp4e5v_n667M0CgaQQ,ae -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1HpIjYaH_iMCRQD1jP159VGz-2NL4nB5p,1n9NZXDwr9gH6qT1k8-zNz34aq2MncG1RAiGHRfUHfAc,bd -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1j8KgLnpjlnLy6bN4ozkwnBpkHUI6i3si,1IN8uZeR7WJbAmgPkY_QzzIir_cuba_Irzd_FtceUGyQ,bf -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1mjTLMZ1la3zyiVQxLZ56sW497Sp8Lh5m,1tJXt33b5yNJajfTh2j6oXyWAduNaBRTy-XZON5MwspE,bh -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1qZoQciglG1DOeEa3hh5iUvF7q4_bKOQl,1d0YaTdc1Esj0MVncM6cyhch6oT2VkWm7z_K4xEZELDs,eg -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1eZHw6k7xM7Z_ZNMnF0Wc5cjJuVyuOop3,1KCiLecZiMCzD7JXY4r_cYgcjr54aG3CnMOOsrrGfMoQ,jo -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1vnWgeTkq5TUyF7555F4renMJnl4WiFNy,1k9XBHbLM3mGDVpokgmV5H8T0anOSsuWVRX_5PeQ_1ik,kw 
-1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,13XnZOF7U0uwL3EP_QpdTVd1FUh3A3cwi,1CGvLuzRIBmuzKGknc3mUWISVLx9OwlpN22z6nFw3cLY,lr -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1fnTq_ONVxzVBbCyQy_-s6ngmadA6st70,1swzBv27-BOlMEcW3bW0MbS0nK30ra0eZjKXv8hQAEpQ,ma -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1syUOihIFKzk6QsLXd7XNUZIwzZZfAqEH,1sBW4eudBid8kOt-48XW6QQxNnR1NdXfoESudFyv0vMo,ml -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1dU24WM8MAnqApFcBoYCiKPzPevebal6_,1RNAVrz9d4E8mP_k901aKBuKqcD2C-UxEk7-feQjyRgA,mm -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,17Zqej6mrCT3BBBKcKj7949qHyRCa-9SJ,1yv5Gt4TYFplbNHLg7ZJsb7ZVPfCE2xD1GINm2e52Wgs,mr -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1nVWV5_1kGDwyWJ3PPqExKfchs3sAlEuh,1GKJIanAuch2j64-HDqInvfG46dWhULswPQNYbCC97y0,mx -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1qlHquI9EDz2lteBcjz_MheNLspg3mp_q,1KoVVqaPcuNz3Y7GLns8n3BnewwuNSxLK-qbY1dfMhG8,ne -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1boFMPurqyxbfYBvfisRiROmzr8TuPI8j,1UgD7O9e5HSpj60tT4p73i0k2mMdoI8jtEZLwTUdF148,ng -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1nMXXtFwJ3TqeynpKSW11uYAzihSMV8So,1cI_ahU1yEVQdHyhtrxUQ2LccaHNu9uOwwk_diclZgHc,np -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,19o8a1zwxUEYFxvZkqs2AwCyIm0oe_CPF,1BZWTt_ukmo9HxVvf9atN-9g_W7j57K3Q_mms0xAmN4U,ph -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1x4LjK_UWdxZm5EyNCupj7ikv7E-WMrkd,1uZtsbPTX9NVBrOCHLtdCum-2c2N9w4ALRGeo5miKCWE,qa -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1wSzKP9bsXB9w4U8frj4Y5kHrfV3C6Vi8,1PpAqUl5ijUAM_cHgxTIalMwsDnauEf2xojGSLQhGb1U,rw -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1FLls5iHQD71Omy4VEzXYQ4HacMubzg8v,1iVA0il4EkqB6HGuPhgjG_ZFZ9-51Em8LKG_BfrusaRo,sa -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1tNS4yJYlh265zDy9rQnjAZSqSmBZxrRh,1pxX5spQL1oe7fcJHX-2jg-Q2bGe5T9HrOVUN528HEaI,sd -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1l3NE8P8Xi-1qGwqZcdVdvV3Hn1h4Bwjv,1YC3W5Q2EGwq0dPOuZn7rUu9nMKr5j9MrThBv2T8TD6M,sl 
-1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1yPQVFwbQ4edUfBhgAbL2o9DAOljZigTF,1KiMws3gla6Jc82CiGqgOXYyEg0Rvl8AefWznF6V6IKc,td -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1cyrCvMKVRHJtpQtcbTpoboJc9iNZ-oHy,1FZ3WqAlNpBNFv7zWOTFqUC45FXE_zqLvYQzMP3RCiL0,ug -1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1Ii31JX8y2InKt-FnHK-6kaqVK41XBOzY,1jBEXeS4Zz01afYLkm_NlL05dkW665K2UVCGYPQCsahY,ye \ No newline at end of file +source_document_id,location_document_id,entitity_document_id,sfm:iso,sfm:country_name +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1ztOfGaQT3WDrq-BOjT0x5VErzgrWQ0Ku,19Hk6OD5AYjWQCUaTWPzkGm9sLqp4e5v_n667M0CgaQQ,ae,united-arab-emirates +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1HpIjYaH_iMCRQD1jP159VGz-2NL4nB5p,1n9NZXDwr9gH6qT1k8-zNz34aq2MncG1RAiGHRfUHfAc,bd,bangladesh +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1j8KgLnpjlnLy6bN4ozkwnBpkHUI6i3si,1IN8uZeR7WJbAmgPkY_QzzIir_cuba_Irzd_FtceUGyQ,bf,burkina-faso +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1mjTLMZ1la3zyiVQxLZ56sW497Sp8Lh5m,1tJXt33b5yNJajfTh2j6oXyWAduNaBRTy-XZON5MwspE,bh,bahrain +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1qZoQciglG1DOeEa3hh5iUvF7q4_bKOQl,1d0YaTdc1Esj0MVncM6cyhch6oT2VkWm7z_K4xEZELDs,eg,egypt +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1eZHw6k7xM7Z_ZNMnF0Wc5cjJuVyuOop3,1KCiLecZiMCzD7JXY4r_cYgcjr54aG3CnMOOsrrGfMoQ,jo,jordan +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1vnWgeTkq5TUyF7555F4renMJnl4WiFNy,1k9XBHbLM3mGDVpokgmV5H8T0anOSsuWVRX_5PeQ_1ik,kw,kuwait +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,13XnZOF7U0uwL3EP_QpdTVd1FUh3A3cwi,1CGvLuzRIBmuzKGknc3mUWISVLx9OwlpN22z6nFw3cLY,lr,liberia +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1fnTq_ONVxzVBbCyQy_-s6ngmadA6st70,1swzBv27-BOlMEcW3bW0MbS0nK30ra0eZjKXv8hQAEpQ,ma,morocco +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1syUOihIFKzk6QsLXd7XNUZIwzZZfAqEH,1sBW4eudBid8kOt-48XW6QQxNnR1NdXfoESudFyv0vMo,ml,mali 
+1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1dU24WM8MAnqApFcBoYCiKPzPevebal6_,1Kt18on6vuUIWsuEr_LxJfpPw-WngE843sJjpeotupdo,mm,myanmar +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,17Zqej6mrCT3BBBKcKj7949qHyRCa-9SJ,1yv5Gt4TYFplbNHLg7ZJsb7ZVPfCE2xD1GINm2e52Wgs,mr,mauritania +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1nVWV5_1kGDwyWJ3PPqExKfchs3sAlEuh,1GKJIanAuch2j64-HDqInvfG46dWhULswPQNYbCC97y0,mx,mexico +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1qlHquI9EDz2lteBcjz_MheNLspg3mp_q,1KoVVqaPcuNz3Y7GLns8n3BnewwuNSxLK-qbY1dfMhG8,ne,niger +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1boFMPurqyxbfYBvfisRiROmzr8TuPI8j,1UgD7O9e5HSpj60tT4p73i0k2mMdoI8jtEZLwTUdF148,ng,nigeria +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1nMXXtFwJ3TqeynpKSW11uYAzihSMV8So,1cI_ahU1yEVQdHyhtrxUQ2LccaHNu9uOwwk_diclZgHc,np,nepal +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,19o8a1zwxUEYFxvZkqs2AwCyIm0oe_CPF,1BZWTt_ukmo9HxVvf9atN-9g_W7j57K3Q_mms0xAmN4U,ph,philippines +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1x4LjK_UWdxZm5EyNCupj7ikv7E-WMrkd,1uZtsbPTX9NVBrOCHLtdCum-2c2N9w4ALRGeo5miKCWE,qa,qatar +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1wSzKP9bsXB9w4U8frj4Y5kHrfV3C6Vi8,1PpAqUl5ijUAM_cHgxTIalMwsDnauEf2xojGSLQhGb1U,rw,rwanda +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1FLls5iHQD71Omy4VEzXYQ4HacMubzg8v,1zBVkLW_vR1kuJ5dEaXdh9hIYG4ExU7Be6pT9MpldUek,sa,saudi-arabia +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1tNS4yJYlh265zDy9rQnjAZSqSmBZxrRh,1pxX5spQL1oe7fcJHX-2jg-Q2bGe5T9HrOVUN528HEaI,sd,sudan +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1l3NE8P8Xi-1qGwqZcdVdvV3Hn1h4Bwjv,1YC3W5Q2EGwq0dPOuZn7rUu9nMKr5j9MrThBv2T8TD6M,sl,sierra-leone +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1yPQVFwbQ4edUfBhgAbL2o9DAOljZigTF,1KiMws3gla6Jc82CiGqgOXYyEg0Rvl8AefWznF6V6IKc,td,chad +1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1cyrCvMKVRHJtpQtcbTpoboJc9iNZ-oHy,1FZ3WqAlNpBNFv7zWOTFqUC45FXE_zqLvYQzMP3RCiL0,ug,uganda 
+1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1y_KMNV-wHDAcZgYnJyx-K7O9zZOuzrr7,1nhiu45GKNEc1kTwoZuG6KNKDMKVvFcn5Rg78WTRAKmU,ye,yemen \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4ee565bc..1fcb01b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ django-rosetta==0.9.8 django-queryset-csv==1.1.0 boto3==1.24.21 awscli==1.25.21 +csvkit==1.0.4 git+https://github.com/security-force-monitor/complex_fields.git diff --git a/sfm_pc/management/commands/download_country_data.py b/sfm_pc/management/commands/download_country_data.py index 06e37cd4..6fbc5497 100644 --- a/sfm_pc/management/commands/download_country_data.py +++ b/sfm_pc/management/commands/download_country_data.py @@ -35,24 +35,24 @@ def add_arguments(self, parser): ) parser.add_argument( - '--country_code', - dest='country_code', - help='Country code for the import' + '--country_name', + dest='country_name', + help='Slugified country name' ) parser.add_argument( - '--parent_directory', - dest='parent_directory' + '--target_directory', + dest='target_directory' ) def handle(self, *args, **kwargs): entity_doc_id = kwargs['entity_doc_id'] location_doc_id = kwargs['location_doc_id'] sources_doc_id = kwargs['sources_doc_id'] - country_code = kwargs['country_code'].rstrip() - parent_directory = kwargs['parent_directory'] + country_name = kwargs['country_name'].rstrip() + target_directory = kwargs['target_directory'] - country_subdirectory = f'{parent_directory}/countries/{country_code}' + country_subdirectory = f'{target_directory}/{country_name}' sheets_service = self._build_google_service( scopes=['https://www.googleapis.com/auth/spreadsheets.readonly'], @@ -64,7 +64,7 @@ def handle(self, *args, **kwargs): self._create_csv_files( sheets_service=sheets_service, doc_id=sources_doc_id, - output_directory=parent_directory, + output_directory=target_directory, key_func=lambda key: key == 'sources' ) diff --git a/sfm_pc/management/commands/import_country_data.py 
b/sfm_pc/management/commands/import_country_data.py index 0c0e9777..b482b23b 100644 --- a/sfm_pc/management/commands/import_country_data.py +++ b/sfm_pc/management/commands/import_country_data.py @@ -202,7 +202,8 @@ def handle(self, *args, **options): all_sheets = self.get_sheets_from_folder(country_path, options['sources_path']) - self.country_code = options.get('country_code', 'unnamed').rstrip() + self.country_code = options.get('country_code', 'unnamed') + self.country_name = options.get('country_name', 'unnamed').rstrip() self.create_sources(all_sheets['source']) diff --git a/sfm_pc/views.py b/sfm_pc/views.py index e5aaf58e..0def8f94 100644 --- a/sfm_pc/views.py +++ b/sfm_pc/views.py @@ -6,6 +6,7 @@ import csv import logging import os +import requests from django.conf import settings from django.views.generic.base import TemplateView @@ -247,17 +248,59 @@ class DownloadData(TemplateView): def get_context_data(self): context = super().get_context_data() - - download_url = self.get_presigned_url() - - if download_url: - context['download_url'] = download_url - + + download_url, head_object = self._get_s3_object_metadata() + + if download_url and head_object: + # (bytes / 1024) = kilobytes && (kilobytes / 1024) = megabytes + file_size_mb = (head_object['ContentLength'] / 1024) / 1024 + context.update({ + 'download_url': download_url, + 'file_size': file_size_mb + }) + return context - - def get_presigned_url(self): + # + # s3_client = boto3.client('s3') + # + # params = { + # 'Bucket': DATA_ARCHIVE_BUCKET, + # 'Key': 'wwic_download.zip' + # } + # + # download_url = self.get_presigned_url(s3_client, params) + # + # if download_url: + # context['download_url'] = download_url + # + # object_head = self + # + # response = requests.head(download_url) + # from pprint import pprint + # pprint(response.__dict__) + # + # return context + + def _get_s3_object_metadata(self): s3_client = boto3.client('s3') + + params = { + 'Bucket': DATA_ARCHIVE_BUCKET, + 'Key': 
'wwic_download.zip' + } + + download_url = self._get_presigned_url(s3_client, params) + if download_url: + # Need to do a HEAD request to get the object size + head_object = s3_client.head_object(**params) + + return download_url, head_object + + return None, None + + + def _get_presigned_url(self, s3_client, params): try: response = s3_client.generate_presigned_url( 'get_object', @@ -272,6 +315,10 @@ def get_presigned_url(self): except ClientError as e: logging.error(e) return None + + def _get_object_head(self, s3_client, params): + response = s3_client.head_object(**params) + return response class Echo: diff --git a/templates/download.html b/templates/download.html index f6cc95c9..bda540f0 100644 --- a/templates/download.html +++ b/templates/download.html @@ -2,17 +2,15 @@ {% load i18n %} {% block content %}
-

{% trans "Use Our Data." %}

-

{% trans "Hold Perpetrators Accountable." %}

+

{% trans "Download data from WhoWasInCommand.com" %}

-

{% trans "Download the data that powers WhoWasInCommand to answer your own questions about the structure, behaviour and people in charge of security forces like the police and army." %}

+

{% trans "WhoWasInCommand.com is a free, public database of police, military and other security and defence forces. Click the "Download" button below to get a copy of all the data in the WhoWasInCommand.com database." %}

{% if download_url %} {% else %}

{% trans "An error occurred fetching the data. This shouldn't happen. Please reload the page or contact us if it keeps happening..." %}

From 80f14adced0cf3537ba99a17cf8ecec611520bed Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Wed, 26 Oct 2022 16:41:06 -0500 Subject: [PATCH 2/9] cleanup code --- docket.mk | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/docket.mk b/docket.mk index 0ba72e7b..afc09f9e 100644 --- a/docket.mk +++ b/docket.mk @@ -19,20 +19,12 @@ data_archive : wwic_download.zip wwic_download.zip : filtered_data data/wwic_download/sources.csv cd data && zip -r ../$@ . -# COUNTRY_CODES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f4) COUNTRY_NAMES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f5) ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson -# .PHONY : filtered_data -# filtered_data: $(foreach country,$(COUNTRY_CODES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) data/countries -# echo "filtered csvs for entities" - -# .PHONY : filtered_data -# filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)/%,$(ENTITIES))) -# echo "filtered csvs for entities" - -test_% : $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) - echo $< +.PHONY : filtered_data +filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) + echo "filtered csvs for entities" data/wwic_download/%_units.csv : sfm_pc/management/commands/country_data/countries/%/units.csv $(call filter_entity_data,unit) From a21ef37d73a99770efe812541c395e127cc596ee Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Mon, 31 Oct 2022 12:41:48 -0500 Subject: [PATCH 3/9] cleanup and reorganize docket.mk + finish adding copy to download page + add download link to navbar --- .gitignore | 4 ++++ README.md | 2 +- data/wwic_download/README.md | 3 +++ docker-compose.yml | 2 ++ docket.mk | 41 +++++++++++++++++++----------------- sfm_pc/views.py | 26 +++++------------------ templates/base.html | 1 + 
templates/download.html | 32 ++++++++++++++++++++++------ 8 files changed, 64 insertions(+), 47 deletions(-) create mode 100644 data/wwic_download/README.md diff --git a/.gitignore b/.gitignore index c43f3e98..20d5c6fd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ *.swo *.swp .DS_Store +.env # Import artifacts *_import @@ -12,6 +13,9 @@ sfm_pc/management/commands/data/ country_data/ wwic_download.zip +data/wwic_download/*.csv +data/wwic_download/*.geojson +data/wwic_download/*.pdf /staticfiles bin/ diff --git a/README.md b/README.md index 6e00c486..5d3a78d9 100644 --- a/README.md +++ b/README.md @@ -322,7 +322,7 @@ docker-compose run --rm app ./manage.py update_composition_index --recreate ``` #### data_archive config -If you need to work with the `data_archive` make recipe, `cp .env. s3.example .env` and add your AWS access tokens. These tokens must be for an IAM user attached to the correct policy for accessing the `wwic-data-archive-staging` S3 bucket. This bucket is configured to work in local development and on the staging server. +If you need to work with the `data_archive` make recipe in your local development environment, `cp .env.s3.example .env` and add your AWS access tokens. These tokens must be for an IAM user attached to the correct policy for accessing the `wwic-data-archive-staging` S3 bucket. This bucket is configured to work in local development and on the staging server. To create an archive locally, run `docker-compose --env-file .env.s3 run --rm app make data_archive` so you can upload the zip archive to S3. Once this is done, the "download" link at the `localhost:8000/en/download/` should work. 
diff --git a/data/wwic_download/README.md b/data/wwic_download/README.md new file mode 100644 index 00000000..9b56e8d8 --- /dev/null +++ b/data/wwic_download/README.md @@ -0,0 +1,3 @@ +# Who Was in Command data archive + +tk \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 2642d316..ed835e58 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -27,6 +27,8 @@ services: - PG_HOST=${PG_HOST} - PG_USER=${PG_USER} - PG_PASSWORD=${PG_PASSWORD} + - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID} + - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY} command: python manage.py runserver 0.0.0.0:8000 migration: diff --git a/docket.mk b/docket.mk index afc09f9e..a0c3f069 100644 --- a/docket.mk +++ b/docket.mk @@ -1,22 +1,11 @@ .PHONY: sfm_pc/management/commands/country_data/countries -%_import : %.csv sfm_pc/management/commands/country_data/countries - perl -pe "s/,/ /g" $< | \ - xargs -L1 bash -c ' \ - echo "Loading data for country code $$3" && (\ - python -u manage.py import_country_data \ - --country_code $$3 \ - --country_path $(word 2, $^)/$$4 \ - --sources_path $(word 2, $^)/sources.csv || \ - exit 255 \ - )' - DATA_ARCHIVE_BUCKET := $(shell cat configs/s3_config.json | jq -r '.data_archive_bucket') data_archive : wwic_download.zip aws s3 cp $< s3://$(DATA_ARCHIVE_BUCKET)/ -wwic_download.zip : filtered_data data/wwic_download/sources.csv +wwic_download.zip : filtered_data data/wwic_download/sources.csv data/wwic_download/sfm_research_handbook.pdf cd data && zip -r ../$@ . 
COUNTRY_NAMES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f5) @@ -26,6 +15,11 @@ ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) echo "filtered csvs for entities" +define filter_entity_data + $(shell csvgrep --columns $(1):status:admin --match 3 $< | \ + python data/processors/blank_columns.py --entity $(1) > $@) +endef + data/wwic_download/%_units.csv : sfm_pc/management/commands/country_data/countries/%/units.csv $(call filter_entity_data,unit) @@ -35,19 +29,28 @@ data/wwic_download/%_persons.csv : sfm_pc/management/commands/country_data/count data/wwic_download/%_incidents.csv : sfm_pc/management/commands/country_data/countries/%/incidents.csv $(call filter_entity_data,incident) -data/wwic_download/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv - $(call filter_entity_data,location) - data/wwic_download/sources.csv : sfm_pc/management/commands/country_data/countries/sources.csv $(call filter_entity_data,source) +data/wwic_download/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv + python data/processors/blank_columns.py --entity location > $@ + data/wwic_download/%_locations.geojson : sfm_pc/management/commands/country_data/countries/%/locations.geojson cp $< $@ -define filter_entity_data - $(shell csvgrep --columns $(1):status:admin --match 3 $< | \ - python data/processors/blank_columns.py --entity $(1) > $@) -endef +data/wwic_download/sfm_research_handbook.pdf : + curl -o $@ https://help.securityforcemonitor.org/_/downloads/en/latest/pdf/ + +%_import : %.csv sfm_pc/management/commands/country_data/countries + perl -pe "s/,/ /g" $< | \ + xargs -L1 bash -c ' \ + echo "Loading data for country code $$3" && (\ + python -u manage.py import_country_data \ + --country_code $$3 \ + --country_path $(word 2, $^)/$$4 \ + --sources_path $(word 2, 
$^)/sources.csv || \ + exit 255 \ + )' sfm_pc/management/commands/country_data/countries : import_docket.csv perl -pe "s/,/ /g" $< | \ diff --git a/sfm_pc/views.py b/sfm_pc/views.py index 0def8f94..9c431182 100644 --- a/sfm_pc/views.py +++ b/sfm_pc/views.py @@ -250,36 +250,20 @@ def get_context_data(self): context = super().get_context_data() download_url, head_object = self._get_s3_object_metadata() + print('download_url', download_url) + print('head_object', head_object) if download_url and head_object: # (bytes / 1024) = kilobytes && (kilobytes / 1024) = megabytes file_size_mb = (head_object['ContentLength'] / 1024) / 1024 context.update({ 'download_url': download_url, - 'file_size': file_size_mb + 'file_size': int(file_size_mb) }) + + print('context', context) return context - # - # s3_client = boto3.client('s3') - # - # params = { - # 'Bucket': DATA_ARCHIVE_BUCKET, - # 'Key': 'wwic_download.zip' - # } - # - # download_url = self.get_presigned_url(s3_client, params) - # - # if download_url: - # context['download_url'] = download_url - # - # object_head = self - # - # response = requests.head(download_url) - # from pprint import pprint - # pprint(response.__dict__) - # - # return context def _get_s3_object_metadata(self): s3_client = boto3.client('s3') diff --git a/templates/base.html b/templates/base.html index a432f7f9..8bee7b54 100644 --- a/templates/base.html +++ b/templates/base.html @@ -92,6 +92,7 @@ {% endif %}
  • {% trans "About" %}
  • +
  • {% trans "Download" %}
  • {% if user.is_authenticated %}
  • {% trans "Logout" %}
  • {% endif %} diff --git a/templates/download.html b/templates/download.html index bda540f0..1b92489c 100644 --- a/templates/download.html +++ b/templates/download.html @@ -1,16 +1,16 @@ {% extends "base.html" %} {% load i18n %} {% block content %} -
    -

    {% trans "Download data from WhoWasInCommand.com" %}

    -
    -

    {% trans "WhoWasInCommand.com is a free, public database of police, military and other security and defence forces. Click the "Download" button below to get a copy of all the data in the WhoWasInCommand.com database." %}

    +
    +

    {% trans "Download data from WhoWasInCommand.com" %}

    +
    +

    {% trans 'WhoWasInCommand.com is a free, public database of police, military and other security and defence forces. Click the "Download" button below to get a copy of all the data in the WhoWasInCommand.com database.' %}

    -
    +
    {% if download_url %} {% else %}

    {% trans "An error occurred fetching the data. This shouldn't happen. Please reload the page or contact us if it keeps happening..." %}

    @@ -20,5 +20,25 @@

    {% trans "Download data from WhoWasInCommand.com" %}

    + +
    +

    What's in the download file?

    +

    The download file includes spreadsheets containing data for each country in the WhoWasInCommand.com database. Together, these files include:

    +
      +
    • the organizational structure and command chain of each branch of the security forces we have researched.
    • +
    • details on command personnel and their postings to different units over time.
    • +
    • the geographical footprint of different forces, including infrastructure and areas of operation.
    • +
    • a file of geospatial information used to represent this data as a map.
    • +
    • all the sources used to evidence each piece of data.
    • +
    + +

    The download file also contains:

    +
      +
    • the Security Force Monitor Research Handbook, which explains what each piece of data means and how it was created. You can also read the Research Handbook online.
    • +
    • copyright and licensing information explaining what you can do with this data, and what your obligations are should you use the data.
    • +
    + +

    The data published on WhoWasInCommand are created by Security Force Monitor, an investigative team based at the Human Rights Institute of Columbia Law School. We provide this data to assist journalists, human rights workers, litigators and others seeking accountability for human rights abuses perpetrated by security and defense forces.

    +
    {% endblock %} From efac70f201ef1f25ec829cf9b47fd21c1c3e2d46 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Mon, 31 Oct 2022 14:31:02 -0500 Subject: [PATCH 4/9] remove old download code --- organization/views.py | 5 ----- person/views.py | 5 ----- sfm_pc/forms.py | 18 ------------------ templates/organization/view.html | 1 - .../partials/organization_search_results.html | 1 - templates/partials/person_search_results.html | 1 - templates/partials/source_search_results.html | 1 - .../partials/violation_search_results.html | 1 - templates/person/view.html | 1 - templates/violation/view.html | 1 - violation/views.py | 5 ----- 11 files changed, 40 deletions(-) diff --git a/organization/views.py b/organization/views.py index d53a1c71..fb18f5a4 100644 --- a/organization/views.py +++ b/organization/views.py @@ -91,11 +91,6 @@ def get_context_data(self, **kwargs): # Determine if the user is logged in authenticated = self.request.user.is_authenticated - # Generate link to download a CSV of this record - params = '?download_etype=Organization&entity_id={0}'.format(str(context['organization'].uuid)) - - context['download_url'] = reverse('download') + params - # Commanders of this unit context['person_members'] = [] diff --git a/person/views.py b/person/views.py index c770d7ee..bdc4a8a2 100644 --- a/person/views.py +++ b/person/views.py @@ -47,11 +47,6 @@ def get_context_data(self, **kwargs): authenticated = self.request.user.is_authenticated - # Generate link to download a CSV of this record - params = '?download_etype=Person&entity_id={0}'.format(str(context['person'].uuid)) - - context['download_url'] = reverse('download') + params - if authenticated: affiliations = context['person'].memberships else: diff --git a/sfm_pc/forms.py b/sfm_pc/forms.py index 6a5cb928..dcc4f3d0 100644 --- a/sfm_pc/forms.py +++ b/sfm_pc/forms.py @@ -548,24 +548,6 @@ def division_choices(): return [(r.value, country_name(r.value)) for r in division_ids] -def download_types(): - return [ - 
('basic', _("Basic")), - ('parentage', _("Parentage")), - ('memberships', _("Memberships")), - ('areas', _("Areas of operation")), - ('sites', _("Sites")), - ('personnel', _("Personnel")), - ('sources', _("Sources")), - ] - - -class DownloadForm(forms.Form): - download_type = forms.ChoiceField(label=gettext_lazy("Choose a download type"), choices=download_types) - division_id = forms.ChoiceField(label=gettext_lazy("Country"), choices=division_choices) - confidences = forms.BooleanField(label=gettext_lazy("Include confidence scores"), required=False) - - class ChangeLogForm(forms.Form): from_date = forms.DateTimeField(label=_("Start date"), required=False) to_date = forms.DateTimeField(label=_("End date"), required=False) diff --git a/templates/organization/view.html b/templates/organization/view.html index f348bc42..65a6280e 100644 --- a/templates/organization/view.html +++ b/templates/organization/view.html @@ -25,7 +25,6 @@

    {% help href='unitrec.html#unit-record-title-area' %} {% cite organization.name.get_value %} - {% trans "Download as CSV" %} {% trans "Print this page" %} diff --git a/templates/partials/organization_search_results.html b/templates/partials/organization_search_results.html index a5a4cfe9..6abbc5f9 100644 --- a/templates/partials/organization_search_results.html +++ b/templates/partials/organization_search_results.html @@ -4,7 +4,6 @@

    {% trans "Units" %} {{ hit_count|intcomma }} {% trans "results" %} - {% trans "Download results" %}

    {% with merge='False' object_list=objects sortable='True' %} diff --git a/templates/partials/person_search_results.html b/templates/partials/person_search_results.html index 26f677b2..ebe884ab 100644 --- a/templates/partials/person_search_results.html +++ b/templates/partials/person_search_results.html @@ -4,7 +4,6 @@

    {% trans "Personnel" %} {{ hit_count|intcomma }} {% trans "results" %} - {% trans "Download results" %}

    {% with merge='False' object_list=objects sortable='True' %} diff --git a/templates/partials/source_search_results.html b/templates/partials/source_search_results.html index 0f1cd996..2c30324d 100644 --- a/templates/partials/source_search_results.html +++ b/templates/partials/source_search_results.html @@ -4,7 +4,6 @@

    {% trans "Sources" %} {{ hit_count|intcomma }} {% trans "results" %} - {% trans "Download results" %}

    {% with object_list=objects sortable='True' %} diff --git a/templates/partials/violation_search_results.html b/templates/partials/violation_search_results.html index 46f228b2..7b081a09 100644 --- a/templates/partials/violation_search_results.html +++ b/templates/partials/violation_search_results.html @@ -4,7 +4,6 @@

    {% trans "Incidents" %} {{ hit_count|intcomma }} {% trans "results" %} - {% trans "Download results" %}

    {% with object_list=objects sortable='True' %} diff --git a/templates/person/view.html b/templates/person/view.html index e138e22b..1db13961 100644 --- a/templates/person/view.html +++ b/templates/person/view.html @@ -26,7 +26,6 @@

    {% help href='personsrec.html#person-record-title-area' %} {% cite person.name.get_value %} - {% trans "Download as CSV" %} {% trans "Print this page" %} diff --git a/templates/violation/view.html b/templates/violation/view.html index bf61ba1b..ff65c83e 100644 --- a/templates/violation/view.html +++ b/templates/violation/view.html @@ -26,7 +26,6 @@

    {# All violation info uses the same source, so we can cite at the end #} {% cite violation.description.get_value %} - {% trans "Download as CSV" %} {% trans "Print this page" %} diff --git a/violation/views.py b/violation/views.py index 5da86ccd..5ece2767 100644 --- a/violation/views.py +++ b/violation/views.py @@ -53,11 +53,6 @@ def get_context_data(self, **kwargs): authenticated = self.request.user.is_authenticated - # Generate link to download a CSV of this record - params = '?download_etype=Violation&entity_id={0}'.format(str(context['violation'].uuid)) - - context['download_url'] = reverse('download') + params - context['location'] = None if context['violation'].location.get_value(): From d40c1b4ee5aa901814b298982f7597143bd291b1 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Mon, 31 Oct 2022 14:31:28 -0500 Subject: [PATCH 5/9] cleanup code --- docket.mk | 5 +++-- sfm_pc/management/commands/import_country_data.py | 3 +-- sfm_pc/views.py | 6 +----- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/docket.mk b/docket.mk index a0c3f069..2bbec711 100644 --- a/docket.mk +++ b/docket.mk @@ -1,12 +1,13 @@ -.PHONY: sfm_pc/management/commands/country_data/countries +.PHONY: sfm_pc/management/commands/country_data DATA_ARCHIVE_BUCKET := $(shell cat configs/s3_config.json | jq -r '.data_archive_bucket') data_archive : wwic_download.zip aws s3 cp $< s3://$(DATA_ARCHIVE_BUCKET)/ +.PHONY: wwic_download.zip wwic_download.zip : filtered_data data/wwic_download/sources.csv data/wwic_download/sfm_research_handbook.pdf - cd data && zip -r ../$@ . + cd data/wwic_download && zip -r ../../$@ . 
COUNTRY_NAMES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f5) ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson diff --git a/sfm_pc/management/commands/import_country_data.py b/sfm_pc/management/commands/import_country_data.py index b482b23b..0c0e9777 100644 --- a/sfm_pc/management/commands/import_country_data.py +++ b/sfm_pc/management/commands/import_country_data.py @@ -202,8 +202,7 @@ def handle(self, *args, **options): all_sheets = self.get_sheets_from_folder(country_path, options['sources_path']) - self.country_code = options.get('country_code', 'unnamed') - self.country_name = options.get('country_name', 'unnamed').rstrip() + self.country_code = options.get('country_code', 'unnamed').rstrip() self.create_sources(all_sheets['source']) diff --git a/sfm_pc/views.py b/sfm_pc/views.py index 9c431182..4fca0dd8 100644 --- a/sfm_pc/views.py +++ b/sfm_pc/views.py @@ -250,8 +250,6 @@ def get_context_data(self): context = super().get_context_data() download_url, head_object = self._get_s3_object_metadata() - print('download_url', download_url) - print('head_object', head_object) if download_url and head_object: # (bytes / 1024) = kilobytes && (kilobytes / 1024) = megabytes @@ -260,14 +258,12 @@ def get_context_data(self): 'download_url': download_url, 'file_size': int(file_size_mb) }) - - print('context', context) return context def _get_s3_object_metadata(self): s3_client = boto3.client('s3') - + params = { 'Bucket': DATA_ARCHIVE_BUCKET, 'Key': 'wwic_download.zip' From d82a749726f6fff276eaf95ee1807917fcc7e084 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Mon, 31 Oct 2022 14:35:35 -0500 Subject: [PATCH 6/9] cleanup code --- sfm_pc/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sfm_pc/views.py b/sfm_pc/views.py index 4fca0dd8..ab4db0d1 100644 --- a/sfm_pc/views.py +++ b/sfm_pc/views.py @@ -252,7 +252,7 @@ def get_context_data(self): download_url, head_object = self._get_s3_object_metadata() if 
download_url and head_object: - # (bytes / 1024) = kilobytes && (kilobytes / 1024) = megabytes + # megabytes = (bytes / 1024) / 1024 file_size_mb = (head_object['ContentLength'] / 1024) / 1024 context.update({ 'download_url': download_url, From 1b6498496b9deceb1359df401b183314ce14ae78 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Tue, 1 Nov 2022 16:49:30 -0500 Subject: [PATCH 7/9] better org --- data/wwic_download/{ => metadata}/README.md | 0 docket.mk | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename data/wwic_download/{ => metadata}/README.md (100%) diff --git a/data/wwic_download/README.md b/data/wwic_download/metadata/README.md similarity index 100% rename from data/wwic_download/README.md rename to data/wwic_download/metadata/README.md diff --git a/docket.mk b/docket.mk index 2bbec711..a42563a2 100644 --- a/docket.mk +++ b/docket.mk @@ -34,12 +34,12 @@ data/wwic_download/sources.csv : sfm_pc/management/commands/country_data/countri $(call filter_entity_data,source) data/wwic_download/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv - python data/processors/blank_columns.py --entity location > $@ + cp $< $@ data/wwic_download/%_locations.geojson : sfm_pc/management/commands/country_data/countries/%/locations.geojson cp $< $@ -data/wwic_download/sfm_research_handbook.pdf : +data/wwic_download/metadata/sfm_research_handbook.pdf : curl -o $@ https://help.securityforcemonitor.org/_/downloads/en/latest/pdf/ %_import : %.csv sfm_pc/management/commands/country_data/countries From ae657010d04697bd8a2ff754fb97022b4037a2f2 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Wed, 2 Nov 2022 10:21:43 -0500 Subject: [PATCH 8/9] better dict reader with stdout --- data/processors/blank_columns.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/data/processors/blank_columns.py b/data/processors/blank_columns.py index e751dc88..b36e0c79 100644 --- a/data/processors/blank_columns.py +++ 
b/data/processors/blank_columns.py @@ -8,17 +8,15 @@ args = parser.parse_args() # init the incoming data as a dict reader -lines = iter(line.decode('utf-8').strip() for line in sys.stdin.buffer.readlines()) -header = next(lines).split(',') -reader = csv.DictReader(lines, fieldnames=header) +reader = csv.DictReader(sys.stdin) # write data to stdout -stdout_csv = csv.writer(sys.stdout) -stdout_csv.writerow(header) +stdout_csv = csv.DictWriter(sys.stdout, fieldnames=reader.fieldnames) +stdout_csv.writeheader() for row in reader: row.update({ f'{args.entity}:comments:admin': '', f'{args.entity}:owner:admin': '' }) - stdout_csv.writerow(row.values()) + stdout_csv.writerow(row) From 5a944f65afe6a1a8ac464987ae808a8d46d322d4 Mon Sep 17 00:00:00 2001 From: Sam McAlilly Date: Thu, 3 Nov 2022 14:10:04 -0500 Subject: [PATCH 9/9] reorganize file structure + fix sources bug in blank_columns.py --- .gitignore | 6 ++-- data/processors/blank_columns.py | 20 ++++++++++--- docket.mk | 21 +++++++------- fixtures/import_docket.csv | 50 ++++++++++++++++---------------- 4 files changed, 55 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 20d5c6fd..2397c28d 100644 --- a/.gitignore +++ b/.gitignore @@ -13,9 +13,9 @@ sfm_pc/management/commands/data/ country_data/ wwic_download.zip -data/wwic_download/*.csv -data/wwic_download/*.geojson -data/wwic_download/*.pdf +data/wwic_download/countries/*.csv +data/wwic_download/countries/*.geojson +data/wwic_download/metadata/*.pdf /staticfiles bin/ diff --git a/data/processors/blank_columns.py b/data/processors/blank_columns.py index b36e0c79..60c00686 100644 --- a/data/processors/blank_columns.py +++ b/data/processors/blank_columns.py @@ -15,8 +15,20 @@ stdout_csv.writeheader() for row in reader: - row.update({ - f'{args.entity}:comments:admin': '', - f'{args.entity}:owner:admin': '' - }) + comment_key = f'{args.entity}:comments:admin' + comments = row.get(comment_key) + + if comments: + row.update({ + comment_key: '' + }) + 
+ owner_key = f'{args.entity}:owner:admin' + owner = row.get(owner_key) + + if owner: + row.update({ + owner_key: '' + }) + stdout_csv.writerow(row) diff --git a/docket.mk b/docket.mk index a42563a2..0c81dcb4 100644 --- a/docket.mk +++ b/docket.mk @@ -1,4 +1,4 @@ -.PHONY: sfm_pc/management/commands/country_data +.PHONY: sfm_pc/management/commands/country_data data/wwic_download/countries DATA_ARCHIVE_BUCKET := $(shell cat configs/s3_config.json | jq -r '.data_archive_bucket') @@ -6,14 +6,14 @@ data_archive : wwic_download.zip aws s3 cp $< s3://$(DATA_ARCHIVE_BUCKET)/ .PHONY: wwic_download.zip -wwic_download.zip : filtered_data data/wwic_download/sources.csv data/wwic_download/sfm_research_handbook.pdf +wwic_download.zip : filtered_data data/wwic_download/metadata/sfm_research_handbook.pdf cd data/wwic_download && zip -r ../../$@ . COUNTRY_NAMES=$(shell perl -pe "s/,/ /g" import_docket.csv | cut -d' ' -f5) -ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson +ENTITIES=units.csv persons.csv incidents.csv locations.csv locations.geojson sources.csv .PHONY : filtered_data -filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/$(country)_%,$(ENTITIES))) +filtered_data: $(foreach country,$(COUNTRY_NAMES),$(patsubst %,data/wwic_download/countries/$(country)_%,$(ENTITIES))) echo "filtered csvs for entities" define filter_entity_data @@ -21,24 +21,25 @@ define filter_entity_data python data/processors/blank_columns.py --entity $(1) > $@) endef -data/wwic_download/%_units.csv : sfm_pc/management/commands/country_data/countries/%/units.csv +data/wwic_download/countries/%_units.csv : sfm_pc/management/commands/country_data/countries/%/units.csv $(call filter_entity_data,unit) -data/wwic_download/%_persons.csv : sfm_pc/management/commands/country_data/countries/%/persons.csv +data/wwic_download/countries/%_persons.csv : sfm_pc/management/commands/country_data/countries/%/persons.csv $(call filter_entity_data,person) 
-data/wwic_download/%_incidents.csv : sfm_pc/management/commands/country_data/countries/%/incidents.csv +data/wwic_download/countries/%_incidents.csv : sfm_pc/management/commands/country_data/countries/%/incidents.csv $(call filter_entity_data,incident) -data/wwic_download/sources.csv : sfm_pc/management/commands/country_data/countries/sources.csv +data/wwic_download/countries/%_sources.csv : sfm_pc/management/commands/country_data/countries/%/sources.csv $(call filter_entity_data,source) -data/wwic_download/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv +data/wwic_download/countries/%_locations.csv : sfm_pc/management/commands/country_data/countries/%/locations.csv cp $< $@ -data/wwic_download/%_locations.geojson : sfm_pc/management/commands/country_data/countries/%/locations.geojson +data/wwic_download/countries/%_locations.geojson : sfm_pc/management/commands/country_data/countries/%/locations.geojson cp $< $@ +.PHONY : data/wwic_download/metadata/sfm_research_handbook.pdf data/wwic_download/metadata/sfm_research_handbook.pdf : curl -o $@ https://help.securityforcemonitor.org/_/downloads/en/latest/pdf/ diff --git a/fixtures/import_docket.csv b/fixtures/import_docket.csv index 9a5f94ca..0d2f9437 100644 --- a/fixtures/import_docket.csv +++ b/fixtures/import_docket.csv @@ -1,26 +1,26 @@ source_document_id,location_document_id,entitity_document_id,sfm:iso,sfm:country_name -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1ztOfGaQT3WDrq-BOjT0x5VErzgrWQ0Ku,19Hk6OD5AYjWQCUaTWPzkGm9sLqp4e5v_n667M0CgaQQ,ae,united-arab-emirates -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1HpIjYaH_iMCRQD1jP159VGz-2NL4nB5p,1n9NZXDwr9gH6qT1k8-zNz34aq2MncG1RAiGHRfUHfAc,bd,bangladesh -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1j8KgLnpjlnLy6bN4ozkwnBpkHUI6i3si,1IN8uZeR7WJbAmgPkY_QzzIir_cuba_Irzd_FtceUGyQ,bf,burkina-faso -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1mjTLMZ1la3zyiVQxLZ56sW497Sp8Lh5m,1tJXt33b5yNJajfTh2j6oXyWAduNaBRTy-XZON5MwspE,bh,bahrain 
-1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1qZoQciglG1DOeEa3hh5iUvF7q4_bKOQl,1d0YaTdc1Esj0MVncM6cyhch6oT2VkWm7z_K4xEZELDs,eg,egypt -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1eZHw6k7xM7Z_ZNMnF0Wc5cjJuVyuOop3,1KCiLecZiMCzD7JXY4r_cYgcjr54aG3CnMOOsrrGfMoQ,jo,jordan -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1vnWgeTkq5TUyF7555F4renMJnl4WiFNy,1k9XBHbLM3mGDVpokgmV5H8T0anOSsuWVRX_5PeQ_1ik,kw,kuwait -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,13XnZOF7U0uwL3EP_QpdTVd1FUh3A3cwi,1CGvLuzRIBmuzKGknc3mUWISVLx9OwlpN22z6nFw3cLY,lr,liberia -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1fnTq_ONVxzVBbCyQy_-s6ngmadA6st70,1swzBv27-BOlMEcW3bW0MbS0nK30ra0eZjKXv8hQAEpQ,ma,morocco -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1syUOihIFKzk6QsLXd7XNUZIwzZZfAqEH,1sBW4eudBid8kOt-48XW6QQxNnR1NdXfoESudFyv0vMo,ml,mali -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1dU24WM8MAnqApFcBoYCiKPzPevebal6_,1Kt18on6vuUIWsuEr_LxJfpPw-WngE843sJjpeotupdo,mm,myanmar -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,17Zqej6mrCT3BBBKcKj7949qHyRCa-9SJ,1yv5Gt4TYFplbNHLg7ZJsb7ZVPfCE2xD1GINm2e52Wgs,mr,mauritania -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1nVWV5_1kGDwyWJ3PPqExKfchs3sAlEuh,1GKJIanAuch2j64-HDqInvfG46dWhULswPQNYbCC97y0,mx,mexico -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1qlHquI9EDz2lteBcjz_MheNLspg3mp_q,1KoVVqaPcuNz3Y7GLns8n3BnewwuNSxLK-qbY1dfMhG8,ne,niger -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1boFMPurqyxbfYBvfisRiROmzr8TuPI8j,1UgD7O9e5HSpj60tT4p73i0k2mMdoI8jtEZLwTUdF148,ng,nigeria -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1nMXXtFwJ3TqeynpKSW11uYAzihSMV8So,1cI_ahU1yEVQdHyhtrxUQ2LccaHNu9uOwwk_diclZgHc,np,nepal -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,19o8a1zwxUEYFxvZkqs2AwCyIm0oe_CPF,1BZWTt_ukmo9HxVvf9atN-9g_W7j57K3Q_mms0xAmN4U,ph,philippines -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1x4LjK_UWdxZm5EyNCupj7ikv7E-WMrkd,1uZtsbPTX9NVBrOCHLtdCum-2c2N9w4ALRGeo5miKCWE,qa,qatar 
-1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1wSzKP9bsXB9w4U8frj4Y5kHrfV3C6Vi8,1PpAqUl5ijUAM_cHgxTIalMwsDnauEf2xojGSLQhGb1U,rw,rwanda -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1FLls5iHQD71Omy4VEzXYQ4HacMubzg8v,1zBVkLW_vR1kuJ5dEaXdh9hIYG4ExU7Be6pT9MpldUek,sa,saudi-arabia -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1tNS4yJYlh265zDy9rQnjAZSqSmBZxrRh,1pxX5spQL1oe7fcJHX-2jg-Q2bGe5T9HrOVUN528HEaI,sd,sudan -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1l3NE8P8Xi-1qGwqZcdVdvV3Hn1h4Bwjv,1YC3W5Q2EGwq0dPOuZn7rUu9nMKr5j9MrThBv2T8TD6M,sl,sierra-leone -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1yPQVFwbQ4edUfBhgAbL2o9DAOljZigTF,1KiMws3gla6Jc82CiGqgOXYyEg0Rvl8AefWznF6V6IKc,td,chad -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1cyrCvMKVRHJtpQtcbTpoboJc9iNZ-oHy,1FZ3WqAlNpBNFv7zWOTFqUC45FXE_zqLvYQzMP3RCiL0,ug,uganda -1NS-tHBXYAGnFLS1VGhVFc7drQ59zD9h5d3sMTMOLMNY,1y_KMNV-wHDAcZgYnJyx-K7O9zZOuzrr7,1nhiu45GKNEc1kTwoZuG6KNKDMKVvFcn5Rg78WTRAKmU,ye,yemen \ No newline at end of file +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1ztOfGaQT3WDrq-BOjT0x5VErzgrWQ0Ku,1Ck11zLFVP6iJZFAR0_Xsq0UaeEJrmFl7ysbFX9mGu7c,ae,united-arab-emirates +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1HpIjYaH_iMCRQD1jP159VGz-2NL4nB5p,1EqAi59wjE1v-bYX3cC1qdl6zkThpWJ8YcvSPUC-RGHc,bd,bangladesh +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1j8KgLnpjlnLy6bN4ozkwnBpkHUI6i3si,1wBmSuTkoEhosDzfHtyvZqd9SKez-sWoPoJ9oPonWsSo,bf,burkina-faso +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1mjTLMZ1la3zyiVQxLZ56sW497Sp8Lh5m,1c0O2XlwSpTAtB0AdhkkdgevWbsBUxvsmsETUwPPVIlk,bh,bahrain +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1qZoQciglG1DOeEa3hh5iUvF7q4_bKOQl,1cZVy2PUAzeq2xOoLRLwL9z9mqbry32zv_XY7sjEih2c,eg,egypt +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1eZHw6k7xM7Z_ZNMnF0Wc5cjJuVyuOop3,1CKkNsXwRdwXDiOldwT-6baw9DayXA2Vsn4ttpwP9SuM,jo,jordan +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1vnWgeTkq5TUyF7555F4renMJnl4WiFNy,1Y6-9-9kai-YyK1pXvcv_W6fqUn9lORltUhuFc2YUu1I,kw,kuwait 
+1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,13XnZOF7U0uwL3EP_QpdTVd1FUh3A3cwi,1aGbMvFHzGn9ZlKKcFhiQ2c9egsoGDH11QBgyqmhS-IM,lr,liberia +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1fnTq_ONVxzVBbCyQy_-s6ngmadA6st70,16962grIJlisFbh2Zp9kBAhv6jVnZz6bHgb6RGBUHd3o,ma,morocco +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1syUOihIFKzk6QsLXd7XNUZIwzZZfAqEH,1UcgoJ_ytS-WSWl2_5OuV9h92wSCBWRFBoDtr4Ztqt14,ml,mali +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1dU24WM8MAnqApFcBoYCiKPzPevebal6_,1vwb7ENaOeVRJIc5iCDBbF8K0Oql4SscENmLEdUT77Hg,mm,myanmar +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,17Zqej6mrCT3BBBKcKj7949qHyRCa-9SJ,1cUtCEUuZRMqcxlRqFyoEM9eAdiDdWy2DUocroYivCx4,mr,mauritania +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1nVWV5_1kGDwyWJ3PPqExKfchs3sAlEuh,168KuHwUr9565zWaQVZ5au3qtGOb-qyJx_WOwNzqt_Eo,mx,mexico +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1qlHquI9EDz2lteBcjz_MheNLspg3mp_q,1_Pj5BryFXUPQPmMigII8G2HBUrpsnkK5V-Zu_9LCdGw,ne,niger +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1boFMPurqyxbfYBvfisRiROmzr8TuPI8j,1f3W3qJATCzVjZGw239Wy3D25THs8ThnvoC24aUFaGZQ,ng,nigeria +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1nMXXtFwJ3TqeynpKSW11uYAzihSMV8So,1Uc5eZswLB6mrwQLhd_OYQm7v7ThH99N0eb7RbTtD5iY,np,nepal +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,19o8a1zwxUEYFxvZkqs2AwCyIm0oe_CPF,1h1a0S5aVv9Z3wucgKsYXmg5Z_CWzsKfjJSfJFcXxPSY,ph,philippines +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1x4LjK_UWdxZm5EyNCupj7ikv7E-WMrkd,1UGOxjmJdJ9Dzj8cX3mZkgXAzT_ap_EMD2OqLjzDeGeE,qa,qatar +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1wSzKP9bsXB9w4U8frj4Y5kHrfV3C6Vi8,1QAgVpj0bf_A0HGFzHgwxBbZqgIFurfH4h7u1MnfKzJc,rw,rwanda +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1FLls5iHQD71Omy4VEzXYQ4HacMubzg8v,1a9XRXK5rG4_n0Afw7tIDkIbAmdydqKcU8J8zx5pLnVU,sa,saudi-arabia +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1tNS4yJYlh265zDy9rQnjAZSqSmBZxrRh,11dEjFSe56YdmJfVeKhRZpQKSgRb6mfM1DWKoNFxYg9Y,sd,sudan 
+1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1l3NE8P8Xi-1qGwqZcdVdvV3Hn1h4Bwjv,1YxRrB39ItO_kEPTrMQ9FJlvMEp1Fjby0vchHiwW3C_I,sl,sierra-leone +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1yPQVFwbQ4edUfBhgAbL2o9DAOljZigTF,15cnbBqIlp4LzEXrs2z2L4_RTnY5e1GMrGV150JV615Q,td,chad +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1cyrCvMKVRHJtpQtcbTpoboJc9iNZ-oHy,1WlN4Hbv3JKE76hnNYkr80HU9oNJwjjOnj9nt7mm9ddw,ug,uganda +1dkGS6Ocyc2YYQ5IopEdjz38t9kaT34XfIAab0iteGoY,1Ii31JX8y2InKt-FnHK-6kaqVK41XBOzY,1r62axKA5xgvJAiSiHrKgHZSATwSkKB-K15fdmLbn3zo,ye,yemen \ No newline at end of file