Use latitude and longitude from AirTable base (#50)
* Use latitude and longitude from AirTable base

* Fix geolocation by storing lat/lon in AirTable

* Update CSS accounting data
choldgraf authored Sep 30, 2024
1 parent 242ada3 commit baed665
Showing 3 changed files with 31 additions and 108 deletions.
68 changes: 13 additions & 55 deletions book/cloud.md
@@ -183,17 +183,19 @@ tags: [remove-cell]
# Load the latest AirTable data
communities = pd.read_csv("./data/airtable-communities.csv")
-# Drop communities that are missing location/hubs/domains from hubs
-communities = communities.dropna(subset=["Location", "Hubs", "domain (from Hubs)"])
# Clean up a bit
communities = communities.rename(columns={"domain (from Hubs)": "domain"})
communities["domain"] = communities["domain"].map(lambda a: eval(a))
# Drop communities that are missing location/hubs/domains from hubs
communities = communities.dropna(subset=["Location", "Hubs", "domain"])
for col in ["id", "domain", "Location"]:
communities[col] = communities[col].map(lambda a: eval(a))
communities["Location"] = communities["Location"].map(lambda a: a[0])
# Calculate the number of users for each hub
for ix, irow in communities.iterrows():
    clusters = eval(irow["cluster"])
-    hubs = eval(irow["id"])
+    hubs = irow["id"]
    clusterhub = [f"{a}/{b}" for a, b in zip(clusters, hubs)]
    # Grab the average number of monthly users for this community across all clusters/hubs
@@ -202,56 +204,15 @@ for ix, irow in communities.iterrows():
    hubs = df.query("clusterhub in @clusterhub and timescale == 'monthly'")
    n_users = hubs.groupby("clusterhub").mean("users")["users"].sum().round()
    communities.loc[ix, "users"] = n_users
```
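For intuition, here is a toy version of the aggregation in that cell, with made-up cluster, hub, and user numbers: each hub's monthly snapshots are averaged, then the per-hub averages are summed into a community total.

```python
import pandas as pd

# Hypothetical activity data: two hubs, two monthly snapshots each
df = pd.DataFrame({
    "clusterhub": ["2i2c/staging", "2i2c/staging", "2i2c/prod", "2i2c/prod"],
    "timescale": ["monthly"] * 4,
    "users": [10, 20, 100, 200],
})
clusterhub = ["2i2c/staging", "2i2c/prod"]
hubs = df.query("clusterhub in @clusterhub and timescale == 'monthly'")
# Per-hub means are 15 and 150; their sum is the community total
n_users = hubs.groupby("clusterhub")["users"].mean().sum().round()
print(n_users)  # 165.0
```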

```{code-cell} ipython3
---
editable: true
slideshow:
  slide_type: ''
tags: [remove-cell]
---
-def geocode(city_name):
-    """A simpler geocoder that uses the openstreetmaps API.
-    We used to use geopy, but it is unmaintained and their geocoder
-    broke, so this should be more reliable.
-    """
-    base_url = "https://nominatim.openstreetmap.org/search"
-    params = {
-        "q": city_name,
-        "format": "json",
-        "limit": 1
-    }
-    headers = {
-        "User-Agent": "YourAppName/1.0"  # Replace with your app name
-    }
-    response = requests.get(base_url, params=params, headers=headers)
-    data = response.json()
-    if data:
-        return float(data[0]["lat"]), float(data[0]["lon"])
-    else:
-        return None
-# Geocode each city so we can plot it on a map
-path_locations = Path("./data/city-locations.csv")
-if not path_locations.exists():
-    unique_locations = communities["Location"].unique()
-    located = []
-    for location in track(unique_locations):
-        lat, lon = geocode(unique_locations)
-        located.append([location, lat, lon])
-    located = pd.DataFrame(located, columns=["Location", "lat", "lon"])
-    # Save for future use
-    located.to_csv(path_locations, index=False)
-else:
-    located = pd.read_csv(path_locations, index_col=False)
+# Read in locations data and link it to our communities
+locations = pd.read_csv("./data/airtable-locations.csv")
+communities = pd.merge(communities, locations[["aid", "Latitude", "Longitude"]], left_on="Location", right_on="aid", how="left")
+# Rename Latitude and Longitude to be easier to work with
+communities = communities.rename(columns={"Latitude": "lat", "Longitude": "lon"})
```
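To see what the new join does, here is a minimal sketch with invented record IDs: after the unpacking step earlier in the page, each community's `Location` holds a single AirTable record ID, which lines up with the `aid` column that the download script now saves.

```python
import pandas as pd

# Hypothetical data mirroring the merge above (record IDs are made up)
locations = pd.DataFrame({
    "aid": ["recAAA", "recBBB"],
    "Latitude": [52.52, 48.86],
    "Longitude": [13.40, 2.35],
})
communities = pd.DataFrame({
    "Location": ["recAAA", "recBBB"],  # one linked record ID per community
    "users": [165.0, 80.0],
})
communities = pd.merge(
    communities,
    locations[["aid", "Latitude", "Longitude"]],
    left_on="Location", right_on="aid", how="left",
).rename(columns={"Latitude": "lat", "Longitude": "lon"})
print(communities[["Location", "lat", "lon", "users"]])
```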

```{code-cell} ipython3
@@ -261,9 +222,6 @@
  slide_type: ''
tags: [remove-cell]
---
-# Merge location information with our communities based on city name
-communities = pd.merge(located, communities, "outer", "Location")
# Drop any records without users because these aren't valid
missing_records = communities["users"].isnull()
print(f"Dropping {missing_records.sum()} records with missing users...")
66 changes: 14 additions & 52 deletions book/scripts/clean_css_accounting_data.py
@@ -53,87 +53,49 @@
here = Path(".")

# Take the first CSV file in the accounting folder
-path = list(Path(here / "../data/css-accounting/").glob("*.csv"))[-1]
-df = pd.read_csv(path, skiprows=6)
+path = list(Path(here / "../data/css-accounting/").glob("*.xlsx"))[-1]
+df = pd.read_excel(path, skiprows=6)

# Quick renaming
-df = df.rename(columns={"Net (USD)": "Amount"})
+df = df.rename(columns={"Net (USD)": "Amount", "Account": "Category", "Account Type": "Category Type"})

# Drop empty rows
df = df.dropna(subset=["Description"])


# -

# ## Munge the dataframe

# +
# Parse the amount column
def parse_accounting_string(s):
    if not isinstance(s, str):
        # If not a string, just skip it
        return
    s = s.replace(',', '')  # Remove comma separators
    s = s.strip()  # Remove leading/trailing whitespace

    if s.startswith('(') and s.endswith(')'):
        # Handle parentheses for negative numbers
        return -float(s[1:-1])
    elif s.startswith('$'):
        # Handle dollar sign
        return float(s[1:])
    else:
        return float(s)
df["Amount"] = df["Amount"].map(parse_accounting_string)

-# Remove category rows and add them as an entry
-dfnew = []
-active_category = None
-for ix, irow in df.iterrows():
-    if pd.isna(irow["Date"]) or irow["Date"].lower().startswith("Total"):
-        # If empty, just skip it
-        continue
-    elif pd.isna(irow["Source"]):
-        # If the source is empty, assume that it is a category and not a transaction
-        active_category = irow["Date"]
-    else:
-        # Add the active category
-        irow["Category"] = active_category
-        irow["Kind"] = "revenue" if "revenue" in active_category.lower() else "cost"
-        dfnew.append(irow)
-dfnew = pd.DataFrame(dfnew)


-# +
-# If we want to inspect it
-# ishow(dfnew)

# +
# Add major category for later use
def clean_category(cat):
    # Extract the category string
    # Each entry has a form like:
    # NNNN - NNNN STRING
    cat = cat.split(" ", 3)[-1]
    # NNNN STRING
    cat = cat.split(" ", 1)[-1]

    # Return the major category
-    if any(ii in cat for ii in ["Other", "Revenue"]):
+    if any(ii in cat.lower() for ii in ["other", "revenue"]):
        return cat.split(":")[-1]
    else:
        return cat.split(":")[0]
dfnew["Category Major"] = dfnew["Category"].map(clean_category)
df["Category Major"] = df["Category"].map(clean_category)

# Date type
dfnew["Date"] = pd.to_datetime(dfnew["Date"])
df["Date"] = pd.to_datetime(df["Date"])
# -

# ## Save to CSV for upload
#

# Create a new CSV that we'll use
-newpath = path.parent / (path.stem + "-cleaned" + path.suffix)
-dfnew.to_csv(newpath, index=False)
+newpath = path.parent / (path.stem + "-cleaned.csv")
+df.to_csv(newpath, index=False)

# ## Visualize

-for kind, idata in dfnew.groupby("Kind"):
+for kind, idata in df.groupby("Category Type"):
    monthly = idata.groupby("Category Major").resample("ME", on="Date").sum()["Amount"].reset_index()
    totals = monthly.groupby("Date").sum("Amount")
    fig = px.line(monthly, x="Date", y="Amount", color="Category Major", height=600, title=f"Monthly {kind}")
5 changes: 4 additions & 1 deletion book/scripts/download_airtable_data.py
@@ -35,6 +35,7 @@
# airtable.com/{{ BASE ID }}/{{ TABLE ID }}/{{VIEW ID}}
views = [
    ("communities", "appbjBTRIbgRiElkr", "tblYGygEo5PQBSUYt", "viw2F6xVWJujWKCuj"),
+    ("locations", "appbjBTRIbgRiElkr", "tblNiMH0gYRVhVdhE", "viwYjmYFRWWJnrv8Y"),
    ("accounting", "appbjBTRIbgRiElkr", "tblNjmVbPaVmC7wc3", "viw1daKSu2dTcd5lg"),
    ("contracts", "appbjBTRIbgRiElkr", "tbliwB70vYg3hlkb1", "viwWPJhcFbXUJZUO6"),
    ("leads", "appbjBTRIbgRiElkr", "tblmRU6U53i8o7z2I", "viw8xzzSXk8tPwBho"),
@@ -46,7 +47,9 @@
    table = api.table(base_id, table_id)
    records = table.all(view=view_id)
    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
-    df = pd.DataFrame.from_records((r["fields"] for r in records))
+    # Add the AirTable ID for easy indexing later
+    data = [r["fields"] | {"aid": r["id"]} for r in records]
+    df = pd.DataFrame.from_records(data)
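For reference, pyairtable returns each record as a dict with separate `id` and `fields` keys, so the dict merge above flattens them into one row per record. A sketch with invented values:

```python
# Hypothetical records as returned by pyairtable's table.all()
records = [
    {"id": "recAAA", "fields": {"Location Name": "Berlin", "Latitude": 52.52}},
    {"id": "recBBB", "fields": {"Location Name": "Paris", "Latitude": 48.86}},
]
# Python 3.9+ dict merge: the record's AirTable ID becomes an "aid" column
data = [r["fields"] | {"aid": r["id"]} for r in records]
# -> [{"Location Name": "Berlin", "Latitude": 52.52, "aid": "recAAA"}, ...]
```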

# %% [markdown]
# Write to a CSV file (not checked into git)
