Commit: Fix airtable data

choldgraf committed May 17, 2024
1 parent 7cd0329 · commit 4153ffd
Showing 4 changed files with 162 additions and 156 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -6,6 +6,9 @@ __pycache__/
 # C extensions
 *.so
 
+# Mac
+*.DS_Store
+
 # Distribution / packaging
 .Python
 build/
308 changes: 154 additions & 154 deletions book/cloud.md
@@ -151,160 +151,7 @@ px.area(unique_hubs, x="date", y="hubs", color="cluster", title="Number of activ

+++ {"editable": true, "slideshow": {"slide_type": ""}}

## Map of hubs
### Geographic map of community locations

Below is a visualization that represents the hubs.

@@ -469,3 +316,156 @@ for constellation, idata in communities.groupby("Constellation"):
fig.show("png")
fig.write_image(path_file)
```
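Most of the code that builds this map is collapsed in the hunk above. Purely as an illustrative sketch (invented coordinates and names, not the dashboard's actual implementation), a plotly geographic scatter of community locations might look like:

```python
import pandas as pd
import plotly.express as px

# Hypothetical community locations, invented for illustration
communities_demo = pd.DataFrame({
    "community": ["Example University", "Sample Institute"],
    "lat": [40.0, 52.2],
    "lon": [-105.3, 0.1],
})
fig = px.scatter_geo(communities_demo, lat="lat", lon="lon", text="community")
fig.show()
```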

+++ {"editable": true, "slideshow": {"slide_type": ""}}

## Active users

Average active users over the past 6 months.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-cell]
---
# Sum by cluster so we avoid having too many categories
df_clusters = df.groupby(["cluster", "date", "timescale"]).sum("users").reset_index()
# Add logusers
df_clusters = df_clusters.query("users > 0")
df_clusters["logusers"] = df_clusters["users"].map(np.log10)
# List of clusters sorted by size
sorted_clusters = df_clusters.groupby("cluster").mean("users").sort_values("users", ascending=False).index.values
```

`````{code-cell} ipython3
---
editable: true
mystnb:
markdown_format: myst
slideshow:
slide_type: ''
tags: [remove-input]
---
grid = """
````{grid}
:class-container: big-number
%s
````
"""
scale_ordering = ["daily", "monthly"]
interior = []
for scale in scale_ordering:
    users = df_clusters.query("timescale == @scale").groupby("cluster").mean("users")["users"].sum()
    interior.append(dedent("""\
    ```{grid-item-card} %s
    %s
    ```\
    """ % (f"{scale.capitalize()} users", int(users))))
Markdown(grid % "\n".join(interior))
`````
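To make the templating above concrete, here is a minimal sketch (invented numbers, plain `print` instead of a rendered `Markdown` object) of the MyST grid markup the cell emits:

`````python
# Rebuild the grid markup with invented numbers to show what the cell renders
grid = """
````{grid}
:class-container: big-number
%s
````
"""
interior = "\n".join(
    "```{grid-item-card} %s\n%s\n```" % (title, value)
    for title, value in [("Daily users", 1234), ("Monthly users", 5678)]
)
print(grid % interior)
`````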

+++ {"editable": true, "slideshow": {"slide_type": ""}}

Monthly and daily active users over the past 6 months.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-input, full-width]
---
for scale in ["monthly", "daily"]:
    for kind in ["users", "logusers"]:
        bar = px.area(
            df_clusters.query("timescale == @scale"),
            x="date",
            y=kind,
            color="cluster",
            category_orders={"cluster": sorted_clusters},
            line_group="cluster",
            title=f"{scale.capitalize()} {kind} across all 2i2c clusters",
            height=500,
        )
        bar.show()
```

+++ {"editable": true, "slideshow": {"slide_type": ""}}

### Active users by hub

Active users, broken down by each hub that we run.
We break our hubs into two groups because some hubs have orders of magnitude more users than others.

+++ {"editable": true, "slideshow": {"slide_type": ""}}

#### Count hubs by community size

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-cell]
---
# Mean users for each hub
df_sums = df.groupby(["clusterhub", "timescale"]).mean("users")
# Calculate bins and add it to data for plotting
bins = [0, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
labels = [f"{bins[ii]}-{bins[ii+1]}" for ii in range(len(bins)-1)]
df_sums["bin"] = pd.cut(df_sums["users"], bins, labels=labels, right=False)
df_sums = df_sums.reset_index()
max_y_bins = df_sums.groupby(["timescale", "bin"]).count()["users"].max() + 10
max_y_users = df_sums.groupby(["timescale", "bin"]).sum()["users"].max() + 100
```
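For intuition about the binning, here is a minimal sketch (toy data, not part of the dashboard) of how `pd.cut` with `right=False` assigns each hub's mean user count to a left-inclusive bin:

```python
import pandas as pd

# Toy example: mean active users for four hypothetical hubs
toy = pd.Series([3, 7, 120, 800], index=["hub-a", "hub-b", "hub-c", "hub-d"], name="users")
bins = [0, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
labels = [f"{bins[ii]}-{bins[ii+1]}" for ii in range(len(bins) - 1)]
# right=False makes intervals left-inclusive, so a hub with exactly 5 users lands in "5-10"
print(pd.cut(toy, bins, labels=labels, right=False))
```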

+++ {"editable": true, "slideshow": {"slide_type": ""}}

#### Total number of users binned by community size

This tells us what percentage of our user base comes from communities of different sizes.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-input, remove-stderr, remove-stdout]
---
chs_bins = []
chs_users = []
chs_perc = []
groups = df_sums.groupby("timescale")
for scale in scale_ordering:
    idata = groups.get_group(scale).copy()
    # Number of communities per bin
    ch = alt.Chart(idata, title=f"{scale}").mark_bar().encode(
        alt.X("bin:O", scale=alt.Scale(domain=labels), axis=alt.Axis(labelAngle=-45), title=f"{scale} Active Users"),
        y=alt.Y('count()', title="Number of communities", scale=alt.Scale(domain=[0, max_y_bins])),
        color="clusterhub",
        tooltip=["users", "clusterhub"],
    ).interactive()
    chs_bins.append(ch)
    # Total users per bin
    bin_sums = idata.groupby("bin").sum()["users"]
    ch = alt.Chart(bin_sums.reset_index(), title=f"{scale}").mark_bar().encode(
        x="bin",
        y=alt.Y("users", title="Total active users", scale=alt.Scale(domain=[0, max_y_users])),
        tooltip=["bin", "users"],
    ).interactive()
    chs_users.append(ch)
    # Percentage breakdown per bin
    bin_perc = (bin_sums / bin_sums.sum()).reset_index()
    ch = alt.Chart(bin_perc, title=f"{scale}").mark_bar().encode(
        x="bin",
        y=alt.Y("users", axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 1])),
        tooltip=["bin", alt.Tooltip("users", format='.0%')],
    ).interactive()
    chs_perc.append(ch)
# Display the charts
display(alt.hconcat(*chs_bins, title="Number of communities in bins of active users"))
display(alt.hconcat(*chs_users, title="Total active users by community size"))
display(alt.hconcat(*chs_perc, title="% of total active users by community size"))
```
1 change: 1 addition & 0 deletions book/scripts/download_airtable_data.py
@@ -42,6 +42,7 @@
 for (name, table_id, view_id) in views:
     table = api.table(base_id, table_id)
     records = table.all(view=view_id)
+    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
     df = pd.DataFrame.from_records((r["fields"] for r in records))
 
 # %% [markdown]
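For context, the loop above relies on a pyairtable client and a list of views defined earlier in the script (not shown in this hunk). A minimal sketch of that setup, with placeholder IDs and an assumed `AIRTABLE_API_KEY` environment variable (both invented for illustration), might look like:

```python
import os

import pandas as pd
from pyairtable import Api

# Placeholder credentials and IDs, invented for illustration
api = Api(os.environ["AIRTABLE_API_KEY"])
base_id = "appXXXXXXXXXXXXXX"
views = [
    # (name, table_id, view_id)
    ("communities", "tblXXXXXXXXXXXXXX", "viwXXXXXXXXXXXXXX"),
]

for (name, table_id, view_id) in views:
    table = api.table(base_id, table_id)
    records = table.all(view=view_id)
    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
    df = pd.DataFrame.from_records((r["fields"] for r in records))
```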
6 changes: 4 additions & 2 deletions book/scripts/download_grafana_activity.py
@@ -165,14 +165,16 @@ def get_pandas_prometheus(grafana_url: str, grafana_token: str, prometheus_uid:
             # Add to our list so that we concatenate across all clusters
             activity.append(iactivity)
         except Exception:
-            errors.append(uid)
+            errors.append((uid, idata["name"].squeeze()))
 
     # Convert into a DF and do a little munging
     activity = pd.concat(activity)
 
     # Write to a CSV for future ref
     activity.to_csv(path_activity, index=False)
     print(f"Finished loading hub activity data to {path_activity}...")
-    print(f"The following clusters had errors: {', '.join(errors)}")
+    if errors:
+        serrors = "\n".join(f"- {error}" for error in errors)
+        print(f"The following clusters had errors: {serrors}")
 else:
     print(f"Found data at {path_activity}, not downloading...")
