Use latitude and longitude from AirTable base (#50)
* Use latitude and longitude from AirTable base

* Fix geolocation by storing lat/lon in AirTable

* Update CSS accounting data
choldgraf authored Sep 30, 2024
1 parent 242ada3 commit baed665
Showing 3 changed files with 31 additions and 108 deletions.
68 changes: 13 additions & 55 deletions book/cloud.md
@@ -183,17 +183,19 @@ tags: [remove-cell]
# Load the latest AirTable data
communities = pd.read_csv("./data/airtable-communities.csv")
-# Drop communities that are missing location/hubs/domains from hubs
-communities = communities.dropna(subset=["Location", "Hubs", "domain (from Hubs)"])
# Clean up a bit
communities = communities.rename(columns={"domain (from Hubs)": "domain"})
communities["domain"] = communities["domain"].map(lambda a: eval(a))
# Drop communities that are missing location/hubs/domains from hubs
communities = communities.dropna(subset=["Location", "Hubs", "domain"])
for col in ["id", "domain", "Location"]:
communities[col] = communities[col].map(lambda a: eval(a))
communities["Location"] = communities["Location"].map(lambda a: a[0])
# Calculate the number of users for each hub
for ix, irow in communities.iterrows():
    clusters = eval(irow["cluster"])
-    hubs = eval(irow["id"])
+    hubs = irow["id"]
    clusterhub = [f"{a}/{b}" for a, b in zip(clusters, hubs)]
    # Grab the average number of monthly users for this community across all clusters/hubs
@@ -202,56 +204,15 @@ for ix, irow in communities.iterrows():
    hubs = df.query("clusterhub in @clusterhub and timescale == 'monthly'")
    n_users = hubs.groupby("clusterhub").mean("users")["users"].sum().round()
    communities.loc[ix, "users"] = n_users
```
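For intuition, here is a toy version of the aggregation in that cell, with made-up cluster, hub, and user numbers: each hub's monthly snapshots are averaged, then the per-hub averages are summed into a community total.

```python
import pandas as pd

# Hypothetical activity data: two hubs, two monthly snapshots each
df = pd.DataFrame({
    "clusterhub": ["2i2c/staging", "2i2c/staging", "2i2c/prod", "2i2c/prod"],
    "timescale": ["monthly"] * 4,
    "users": [10, 20, 100, 200],
})
clusterhub = ["2i2c/staging", "2i2c/prod"]
hubs = df.query("clusterhub in @clusterhub and timescale == 'monthly'")
# Per-hub means are 15 and 150; their sum is the community total
n_users = hubs.groupby("clusterhub")["users"].mean().sum().round()
print(n_users)  # 165.0
```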

```{code-cell} ipython3
---
editable: true
slideshow:
  slide_type: ''
tags: [remove-cell]
---
-def geocode(city_name):
-    """A simpler geocoder that uses the openstreetmaps API.
-    We used to use geopy, but it is unmaintained and their geocoder
-    broke, so this should be more reliable.
-    """
-    base_url = "https://nominatim.openstreetmap.org/search"
-    params = {
-        "q": city_name,
-        "format": "json",
-        "limit": 1
-    }
-    headers = {
-        "User-Agent": "YourAppName/1.0"  # Replace with your app name
-    }
-    response = requests.get(base_url, params=params, headers=headers)
-    data = response.json()
-    if data:
-        return float(data[0]["lat"]), float(data[0]["lon"])
-    else:
-        return None
-# Geocode each city so we can plot it on a map
-path_locations = Path("./data/city-locations.csv")
-if not path_locations.exists():
-    unique_locations = communities["Location"].unique()
-    located = []
-    for location in track(unique_locations):
-        lat, lon = geocode(unique_locations)
-        located.append([location, lat, lon])
-    located = pd.DataFrame(located, columns=["Location", "lat", "lon"])
-    # Save for future use
-    located.to_csv(path_locations, index=False)
-else:
-    located = pd.read_csv(path_locations, index_col=False)
+# Read in locations data and link it to our communities
+locations = pd.read_csv("./data/airtable-locations.csv")
+communities = pd.merge(communities, locations[["aid", "Latitude", "Longitude"]], left_on="Location", right_on="aid", how="left")
+# Rename Latitude and Longitude to be easier to work with
+communities = communities.rename(columns={"Latitude": "lat", "Longitude": "lon"})
```
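To see what the new join does, here is a minimal sketch with invented record IDs: after the unpacking step earlier in the page, each community's `Location` holds a single AirTable record ID, which lines up with the `aid` column that the download script now saves.

```python
import pandas as pd

# Hypothetical data mirroring the merge above (record IDs are made up)
locations = pd.DataFrame({
    "aid": ["recAAA", "recBBB"],
    "Latitude": [52.52, 48.86],
    "Longitude": [13.40, 2.35],
})
communities = pd.DataFrame({
    "Location": ["recAAA", "recBBB"],  # one linked record ID per community
    "users": [165.0, 80.0],
})
communities = pd.merge(
    communities,
    locations[["aid", "Latitude", "Longitude"]],
    left_on="Location", right_on="aid", how="left",
).rename(columns={"Latitude": "lat", "Longitude": "lon"})
print(communities[["Location", "lat", "lon", "users"]])
```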

```{code-cell} ipython3
@@ -261,9 +222,6 @@
  slide_type: ''
tags: [remove-cell]
---
-# Merge location information with our communities based on city name
-communities = pd.merge(located, communities, "outer", "Location")
# Drop any records without users because these aren't valid
missing_records = communities["users"].isnull()
print(f"Dropping {missing_records.sum()} records with missing users...")
66 changes: 14 additions & 52 deletions book/scripts/clean_css_accounting_data.py
@@ -53,87 +53,49 @@
here = Path(".")

# Take the first CSV file in the accounting folder
-path = list(Path(here / "../data/css-accounting/").glob("*.csv"))[-1]
-df = pd.read_csv(path, skiprows=6)
+path = list(Path(here / "../data/css-accounting/").glob("*.xlsx"))[-1]
+df = pd.read_excel(path, skiprows=6)

# Quick renaming
-df = df.rename(columns={"Net (USD)": "Amount"})
+df = df.rename(columns={"Net (USD)": "Amount", "Account": "Category", "Account Type": "Category Type"})

# Drop empty rows
df = df.dropna(subset=["Description"])


# -

# ## Munge the dataframe

# +
# Parse the amount column
def parse_accounting_string(s):
    if not isinstance(s, str):
        # If not a string, just skip it
        return
    s = s.replace(',', '')  # Remove comma separators
    s = s.strip()  # Remove leading/trailing whitespace

    if s.startswith('(') and s.endswith(')'):
        # Handle parentheses for negative numbers
        return -float(s[1:-1])
    elif s.startswith('$'):
        # Handle dollar sign
        return float(s[1:])
    else:
        return float(s)
df["Amount"] = df["Amount"].map(parse_accounting_string)

-# Remove category rows and add them as an entry
-dfnew = []
-active_category = None
-for ix, irow in df.iterrows():
-    if pd.isna(irow["Date"]) or irow["Date"].lower().startswith("Total"):
-        # If empty, just skip it
-        continue
-    elif pd.isna(irow["Source"]):
-        # If the source is empty, assume that it is a category and not a transaction
-        active_category = irow["Date"]
-    else:
-        # Add the active category
-        irow["Category"] = active_category
-        irow["Kind"] = "revenue" if "revenue" in active_category.lower() else "cost"
-        dfnew.append(irow)
-dfnew = pd.DataFrame(dfnew)


-# +
-# If we want to inspect it
-# ishow(dfnew)

# +
# Add major category for later use
def clean_category(cat):
    # Extract the category string
    # Each entry has a form like:
    # NNNN - NNNN STRING
    cat = cat.split(" ", 3)[-1]
    # NNNN STRING
    cat = cat.split(" ", 1)[-1]

    # Return the major category
-    if any(ii in cat for ii in ["Other", "Revenue"]):
+    if any(ii in cat.lower() for ii in ["other", "revenue"]):
        return cat.split(":")[-1]
    else:
        return cat.split(":")[0]
dfnew["Category Major"] = dfnew["Category"].map(clean_category)
df["Category Major"] = df["Category"].map(clean_category)

# Date type
dfnew["Date"] = pd.to_datetime(dfnew["Date"])
df["Date"] = pd.to_datetime(df["Date"])
# -

# ## Save to CSV for upload
#

# Create a new CSV that we'll use
-newpath = path.parent / (path.stem + "-cleaned" + path.suffix)
-dfnew.to_csv(newpath, index=False)
+newpath = path.parent / (path.stem + "-cleaned.csv")
+df.to_csv(newpath, index=False)

# ## Visualize

-for kind, idata in dfnew.groupby("Kind"):
+for kind, idata in df.groupby("Category Type"):
    monthly = idata.groupby("Category Major").resample("ME", on="Date").sum()["Amount"].reset_index()
    totals = monthly.groupby("Date").sum("Amount")
    fig = px.line(monthly, x="Date", y="Amount", color="Category Major", height=600, title=f"Monthly {kind}")
5 changes: 4 additions & 1 deletion book/scripts/download_airtable_data.py
@@ -35,6 +35,7 @@
# airtable.com/{{ BASE ID }}/{{ TABLE ID }}/{{VIEW ID}}
views = [
    ("communities", "appbjBTRIbgRiElkr", "tblYGygEo5PQBSUYt", "viw2F6xVWJujWKCuj"),
+    ("locations", "appbjBTRIbgRiElkr", "tblNiMH0gYRVhVdhE", "viwYjmYFRWWJnrv8Y"),
    ("accounting", "appbjBTRIbgRiElkr", "tblNjmVbPaVmC7wc3", "viw1daKSu2dTcd5lg"),
    ("contracts", "appbjBTRIbgRiElkr", "tbliwB70vYg3hlkb1", "viwWPJhcFbXUJZUO6"),
    ("leads", "appbjBTRIbgRiElkr", "tblmRU6U53i8o7z2I", "viw8xzzSXk8tPwBho"),
@@ -46,7 +47,9 @@
    table = api.table(base_id, table_id)
    records = table.all(view=view_id)
    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
-    df = pd.DataFrame.from_records((r["fields"] for r in records))
+    # Add the AirTable ID for easy indexing later
+    data = [r["fields"] | {"aid": r["id"]} for r in records]
+    df = pd.DataFrame.from_records(data)
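For reference, pyairtable returns each record as a dict with separate `id` and `fields` keys, so the dict merge above flattens them into one row per record. A sketch with invented values:

```python
# Hypothetical records as returned by pyairtable's table.all()
records = [
    {"id": "recAAA", "fields": {"Location Name": "Berlin", "Latitude": 52.52}},
    {"id": "recBBB", "fields": {"Location Name": "Paris", "Latitude": 48.86}},
]
# Python 3.9+ dict merge: the record's AirTable ID becomes an "aid" column
data = [r["fields"] | {"aid": r["id"]} for r in records]
# -> [{"Location Name": "Berlin", "Latitude": 52.52, "aid": "recAAA"}, ...]
```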

# %% [markdown]
# Write to a CSV file (not checked into git)
