-
Notifications
You must be signed in to change notification settings - Fork 97
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #81 from openeventdata/retrain
Update dependencies, gazetteer, and models
- Loading branch information
Showing
14 changed files
with
653 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# Mordecai examples | ||
|
||
## Geocoding cities | ||
|
||
This script is an example usage of `geo.lookup_city()`. It reads a CSV | ||
containing columns with city names, (optionally) state/ADM1 names, and | ||
three-letter country codes. If the columns are named (respectively) `city`, | ||
`adm1`, and `country`, you can run it like this: | ||
|
||
``` | ||
python geocode_cities.py geocode_cities.csv out.csv | ||
``` | ||
|
||
Otherwise, you'll have to specify the column names as part of the call. The | ||
geocoder returns lat/lon and Geonames information, along with the reason it | ||
selected a particular location and a caution when the results were | ||
ambiguous. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
city,adm1,country | ||
Norman,OK,USA | ||
College Park,MD,USA | ||
Cambridge,MA,USA | ||
Whaugbggoan,OK,USA | ||
Columbia Heights,DC,USA | ||
Aleppo,Aleppo,SYR |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import plac | ||
import pandas as pd | ||
from mordecai import Geoparser | ||
from tqdm import tqdm | ||
|
||
|
||
def main(in_file: ("input CSV file"),
         out_file: ("filename to write output to"),
         city_col: ("column in CSV with city col") = "city",
         adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1",
         country_col: ("column in CSV with country name") = "country"):
    """Geocode a CSV with city, ADM1, and country columns.

    Every row of ``in_file`` is passed to ``Geoparser.lookup_city`` and one
    output row is produced per input row. The output holds the selected
    Geonames fields, ``lat``/``lon`` parsed from the result's comma-separated
    ``coordinates`` string, the original search terms (``search_city``,
    ``search_adm1``, ``search_country``), and the ``info`` and ``reason``
    strings returned by the lookup.

    When the lookup result's ``geo`` entry cannot be subscripted (presumably
    ``None`` when nothing matched -- confirm against Mordecai), the Geonames
    fields and ``lat``/``lon`` are written as empty strings instead.

    Args:
        in_file: Path of the CSV file to read.
        out_file: Path the geocoded CSV is written to (the DataFrame index is
            included as the first column).
        city_col: Name of the column holding the city name.
        adm1_col: Name of the column holding the state/governorate/ADM1.
        country_col: Name of the column holding the country.
    """
    # Fields copied verbatim from the lookup's "geo" dict, in output order.
    geo_fields = ("admin1_code", "admin2_code", "asciiname", "name",
                  "geonameid", "feature_class", "feature_code",
                  "country_code3")

    print("Loading Mordecai...")
    geo = Geoparser()
    df = pd.read_csv(in_file)
    geocoded = []
    print("Geocoding...")
    # iterrows() is a generator without a length, so pass the row count
    # explicitly; otherwise tqdm cannot show a percentage or an ETA.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Elasticsearch doesn't like NaN, change to None
        adm1 = None if pd.isnull(row[adm1_col]) else row[adm1_col]
        res = geo.lookup_city(city=row[city_col],
                              adm1=adm1,
                              country=row[country_col])
        try:
            hit = res['geo']
            gc = {field: hit[field] for field in geo_fields}
            # "coordinates" is a "lat,lon" string; split it once.
            coords = hit['coordinates'].split(",")
            gc["lat"] = float(coords[0])
            gc["lon"] = float(coords[1])
        except TypeError:
            # No usable "geo" entry: keep the row, but with blank result
            # columns so the output stays aligned with the input.
            gc = dict.fromkeys(geo_fields + ("lat", "lon"), "")
        gc['search_city'] = row[city_col]
        # The raw cell value (possibly NaN) is recorded, not the None
        # substitute that was sent to the geocoder.
        gc['search_adm1'] = row[adm1_col]
        gc['search_country'] = row[country_col]
        gc["info"] = res['info']
        gc["reason"] = res['reason']
        geocoded.append(gc)
    geo_df = pd.DataFrame(geocoded)
    geo_df.to_csv(out_file)
    print("Wrote file out to ", out_file)
|
||
|
||
# Only run the geocoder when this file is executed as a script; plac.call
# invokes main with the command-line arguments.
if __name__ == "__main__":
    plac.call(main)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
,admin1_code,admin2_code,asciiname,name,geonameid,feature_class,feature_code,country_code3,lat,lon,search_city,search_adm1,search_country,info,reason | ||
0,OK,027,Norman,Norman,4543762,P,PPLA2,USA,35.22257,-97.43948,Norman,OK,USA,50 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country." | ||
1,MD,033,College Park,College Park,4351977,P,PPL,USA,38.98067,-76.93692,College Park,MD,USA,2 elasticsearch matches for cities out of 37 total results of all types,Exact name match for city. | ||
2,ID,005,Cambridge,Cambridge,5587778,P,PPL,USA,42.45047,-112.11663,Cambridge,MA,USA,33 entries within minimum edit distance. Picking closest average distance: 2.25.,CAUTION: Best of several edit distance matches. | ||
3,,,,,,,,,,,Whaugbggoan,OK,USA,0 total results of all types.,FAILURE: No fuzzy match for city or neighborhood. | ||
4,DC,001,Columbia Heights,Columbia Heights,4138102,P,PPL,USA,38.92567,-77.02942,Columbia Heights,DC,USA,6 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country." | ||
5,09,,Aleppo,Aleppo,170063,P,PPLA,SYR,36.20124,37.16117,Aleppo,Aleppo,SYR,9 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country." |
Oops, something went wrong.