Skip to content

Commit

Permalink
some fixes for duplicates
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesturk committed Feb 27, 2014
1 parent f02af20 commit c8169ca
Show file tree
Hide file tree
Showing 11 changed files with 78,313 additions and 78,316 deletions.
4 changes: 1 addition & 3 deletions identifiers/country-us/census_whitelist.csv
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,4 @@ ocd-division/country:us/state:mt/place:butte,Butte MT -- nearly coterminous with
ocd-division/country:us/state:tn/place:nashville,Nashville TN -- there are some unincorporated towns in Davidson county that are often not included in Nashville population totals. Users are advised that the county:davidson identifier is better for any government stats
ocd-division/country:us/state:mn/place:st_anthony,St Anthony MN
ocd-division/country:us/state:tx/place:lakeside,Lakeside TX
ocd-division/country:us/state:tx/place:oak_ridge,Oak Right TX
ocd-division/country:us/state:ca/place:jurupa_valley,Jurupa Valley CA -- incorporated after 2010 census
ocd-division/country:us/state:ca/place:eastvale,Eastvale CA -- incorporated after 2010 census
ocd-division/country:us/state:tx/place:oak_ridge,Oak Ridge TX
3,100 changes: 0 additions & 3,100 deletions identifiers/country-us/us_census_county.csv

This file was deleted.

19,511 changes: 0 additions & 19,511 deletions identifiers/country-us/us_census_place.csv

This file was deleted.

39,150 changes: 39,150 additions & 0 deletions identifiers/country-us/us_census_places.csv

Large diffs are not rendered by default.

16,539 changes: 0 additions & 16,539 deletions identifiers/country-us/us_census_subdiv.csv

This file was deleted.

3,100 changes: 0 additions & 3,100 deletions mappings/us-census-county-geoids.csv

This file was deleted.

19,511 changes: 0 additions & 19,511 deletions mappings/us-census-place-geoids.csv

This file was deleted.

39,150 changes: 39,150 additions & 0 deletions mappings/us-census-places-geoids.csv

Large diffs are not rendered by default.

16,539 changes: 0 additions & 16,539 deletions mappings/us-census-subdiv-geoids.csv

This file was deleted.

23 changes: 10 additions & 13 deletions scripts/country-us/census_places.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,19 +186,16 @@ def make_id(parent=None, **kwargs):
def process_types(types):
funcstat_count = collections.Counter()
type_count = collections.Counter()
# map geoid to id
counties = {}
ids = {}
# list of rows that produced an id
duplicates = collections.defaultdict(list)
# load exceptions from file
exceptions = get_exception_set()
csvfile = csv.writer(open('identifiers/country-us/us_census_places.csv', 'w'))
geocsv = csv.writer(open('mappings/us-census-places-geoids.csv', 'w'))

for entity_type in types:
ids = {}
csvfile = csv.writer(open('identifiers/country-us/us_census_{}.csv'.format(entity_type),
'w'))
geocsv = csv.writer(open('mappings/us-census-{}-geoids.csv'.format(entity_type), 'w'))

url = BASE_URL + TYPES[entity_type]['zip']
print('fetching zipfile', url)
zf, _ = urllib.request.urlretrieve(url)
Expand Down Expand Up @@ -261,7 +258,7 @@ def process_types(types):
elif entity_type != 'subdiv' or subdiv_rule == 'town':
id = make_id(parent=parent_id, **{subtype: name})

# check for duplicates
# duplicates
if id in ids:
id1 = make_id(parent=parent_id, **{subtype: row['NAME']})
row2 = ids.pop(id)
Expand All @@ -285,12 +282,12 @@ def process_types(types):
raise Exception(row)


# write ids out
for id, row in sorted(ids.items()):
if id not in exceptions:
csvfile.writerow((id, row['NAME']))
if geocsv:
geocsv.writerow((id, row['GEOID']))
# write ids out
for id, row in sorted(ids.items()):
if id not in exceptions:
csvfile.writerow((id, row['NAME']))
if geocsv:
geocsv.writerow((id, row['GEOID']))

print(' | '.join('{0}: {1}'.format(k,v) for k,v in funcstat_count.most_common()))
print(' | '.join('{0}: {1}'.format(k,v) for k,v in type_count.most_common()))
Expand Down
2 changes: 2 additions & 0 deletions scripts/verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
out = csv.writer(out)
for row in sorted(all_rows):
out.writerow(row)
else:
print('NO COUNTRY CSV FILE WRITTEN')

# do geoid validation too (TODO: add a flag for this)
seen_in_geoid = set()
Expand Down

0 comments on commit c8169ca

Please sign in to comment.