Commit: Fix airtable data

choldgraf committed May 17, 2024
1 parent 7cd0329 · commit 4153ffd
Showing 4 changed files with 162 additions and 156 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -6,6 +6,9 @@ __pycache__/
 # C extensions
 *.so
 
+# Mac
+*.DS_Store
+
 # Distribution / packaging
 .Python
 build/
308 changes: 154 additions & 154 deletions book/cloud.md
@@ -151,160 +151,7 @@ px.area(unique_hubs, x="date", y="hubs", color="cluster", title="Number of activ

+++ {"editable": true, "slideshow": {"slide_type": ""}}

## Map of hubs
### Geographic map of community locations

Below is a visualization that represents the hubs.

@@ -469,3 +316,156 @@ for constellation, idata in communities.groupby("Constellation"):
fig.show("png")
fig.write_image(path_file)
```
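Most of the code that builds this map is collapsed in the hunk above. Purely as an illustrative sketch (invented coordinates and names, not the dashboard's actual implementation), a plotly geographic scatter of community locations might look like:

```python
import pandas as pd
import plotly.express as px

# Hypothetical community locations, invented for illustration
communities_demo = pd.DataFrame({
    "community": ["Example University", "Sample Institute"],
    "lat": [40.0, 52.2],
    "lon": [-105.3, 0.1],
})
fig = px.scatter_geo(communities_demo, lat="lat", lon="lon", text="community")
fig.show()
```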

+++ {"editable": true, "slideshow": {"slide_type": ""}}

## Active users

Average active users over the past 6 months.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-cell]
---
# Sum by cluster so we avoid having too many categories
df_clusters = df.groupby(["cluster", "date", "timescale"]).sum("users").reset_index()
# Add logusers
df_clusters = df_clusters.query("users > 0")
df_clusters["logusers"] = df_clusters["users"].map(np.log10)
# List of clusters sorted by size
sorted_clusters = df_clusters.groupby("cluster").mean("users").sort_values("users", ascending=False).index.values
```

`````{code-cell} ipython3
---
editable: true
mystnb:
markdown_format: myst
slideshow:
slide_type: ''
tags: [remove-input]
---
grid = """
````{grid}
:class-container: big-number
%s
````
"""
scale_ordering = ["daily", "monthly"]
interior = []
for scale in scale_ordering:
    users = df_clusters.query("timescale == @scale").groupby("cluster").mean("users")["users"].sum()
    interior.append(dedent("""\
    ```{grid-item-card} %s
    %s
    ```\
    """ % (f"{scale.capitalize()} users", int(users))))
Markdown(grid % "\n".join(interior))
`````
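To make the templating above concrete, here is a minimal sketch (invented numbers, plain `print` instead of a rendered `Markdown` object) of the MyST grid markup the cell emits:

`````python
# Rebuild the grid markup with invented numbers to show what the cell renders
grid = """
````{grid}
:class-container: big-number
%s
````
"""
interior = "\n".join(
    "```{grid-item-card} %s\n%s\n```" % (title, value)
    for title, value in [("Daily users", 1234), ("Monthly users", 5678)]
)
print(grid % interior)
`````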

+++ {"editable": true, "slideshow": {"slide_type": ""}}

Monthly and daily active users over the past 6 months.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-input, full-width]
---
for scale in ["monthly", "daily"]:
    for kind in ["users", "logusers"]:
        bar = px.area(
            df_clusters.query("timescale == @scale"),
            x="date",
            y=kind,
            color="cluster",
            category_orders={"cluster": sorted_clusters},
            line_group="cluster",
            title=f"{scale.capitalize()} {kind} across all 2i2c clusters",
            height=500,
        )
        bar.show()
```

+++ {"editable": true, "slideshow": {"slide_type": ""}}

### Active users by hub

Active users, broken down by each hub that we run.
We break our hubs into two groups because some hubs have orders of magnitude more users than others.

+++ {"editable": true, "slideshow": {"slide_type": ""}}

#### Count hubs by community size

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-cell]
---
# Mean users for each hub
df_sums = df.groupby(["clusterhub", "timescale"]).mean("users")
# Calculate bins and add it to data for plotting
bins = [0, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
labels = [f"{bins[ii]}-{bins[ii+1]}" for ii in range(len(bins)-1)]
df_sums["bin"] = pd.cut(df_sums["users"], bins, labels=labels, right=False)
df_sums = df_sums.reset_index()
max_y_bins = df_sums.groupby(["timescale", "bin"]).count()["users"].max() + 10
max_y_users = df_sums.groupby(["timescale", "bin"]).sum()["users"].max() + 100
```
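For intuition about the binning, here is a minimal sketch (toy data, not part of the dashboard) of how `pd.cut` with `right=False` assigns each hub's mean user count to a left-inclusive bin:

```python
import pandas as pd

# Toy example: mean active users for four hypothetical hubs
toy = pd.Series([3, 7, 120, 800], index=["hub-a", "hub-b", "hub-c", "hub-d"], name="users")
bins = [0, 5, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, 10000]
labels = [f"{bins[ii]}-{bins[ii+1]}" for ii in range(len(bins) - 1)]
# right=False makes intervals left-inclusive, so a hub with exactly 5 users lands in "5-10"
print(pd.cut(toy, bins, labels=labels, right=False))
```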

+++ {"editable": true, "slideshow": {"slide_type": ""}}

#### Total number of users binned by community size

This tells us what percentage of our user base comes from communities of different sizes.

```{code-cell} ipython3
---
editable: true
slideshow:
slide_type: ''
tags: [remove-input, remove-stderr, remove-stdout]
---
chs_bins = []
chs_users = []
chs_perc = []
groups = df_sums.groupby("timescale")
for scale in scale_ordering:
    idata = groups.get_group(scale).copy()
    # Number of communities per bin
    ch = alt.Chart(idata, title=f"{scale}").mark_bar().encode(
        alt.X("bin:O", scale=alt.Scale(domain=labels), axis=alt.Axis(labelAngle=-45), title=f"{scale} Active Users"),
        y=alt.Y('count()', title="Number of communities", scale=alt.Scale(domain=[0, max_y_bins])),
        color="clusterhub",
        tooltip=["users", "clusterhub"],
    ).interactive()
    chs_bins.append(ch)
    # Total users per bin
    bin_sums = idata.groupby("bin").sum()["users"]
    ch = alt.Chart(bin_sums.reset_index(), title=f"{scale}").mark_bar().encode(
        x="bin",
        y=alt.Y("users", title="Total active users", scale=alt.Scale(domain=[0, max_y_users])),
        tooltip=["bin", "users"],
    ).interactive()
    chs_users.append(ch)
    # Percentage breakdown per bin
    bin_perc = (bin_sums / bin_sums.sum()).reset_index()
    ch = alt.Chart(bin_perc, title=f"{scale}").mark_bar().encode(
        x="bin",
        y=alt.Y("users", axis=alt.Axis(format='%'), scale=alt.Scale(domain=[0, 1])),
        tooltip=["bin", alt.Tooltip("users", format='.0%')],
    ).interactive()
    chs_perc.append(ch)
# Display the charts
display(alt.hconcat(*chs_bins, title="Number of communities in bins of active users"))
display(alt.hconcat(*chs_users, title="Total active users by community size"))
display(alt.hconcat(*chs_perc, title="% of total active users by community size"))
```
1 change: 1 addition & 0 deletions book/scripts/download_airtable_data.py
@@ -42,6 +42,7 @@
 for (name, table_id, view_id) in views:
     table = api.table(base_id, table_id)
     records = table.all(view=view_id)
+    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
     df = pd.DataFrame.from_records((r["fields"] for r in records))
 
 # %% [markdown]
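For context, the loop above relies on a pyairtable client and a list of views defined earlier in the script (not shown in this hunk). A minimal sketch of that setup, with placeholder IDs and an assumed `AIRTABLE_API_KEY` environment variable (both invented for illustration), might look like:

```python
import os

import pandas as pd
from pyairtable import Api

# Placeholder credentials and IDs, invented for illustration
api = Api(os.environ["AIRTABLE_API_KEY"])
base_id = "appXXXXXXXXXXXXXX"
views = [
    # (name, table_id, view_id)
    ("communities", "tblXXXXXXXXXXXXXX", "viwXXXXXXXXXXXXXX"),
]

for (name, table_id, view_id) in views:
    table = api.table(base_id, table_id)
    records = table.all(view=view_id)
    print(f"Downloading AirTable data from https://airtable.com/{base_id}/{table_id}/{view_id}...")
    df = pd.DataFrame.from_records((r["fields"] for r in records))
```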
6 changes: 4 additions & 2 deletions book/scripts/download_grafana_activity.py
@@ -165,14 +165,16 @@ def get_pandas_prometheus(grafana_url: str, grafana_token: str, prometheus_uid:
             # Add to our list so that we concatenate across all clusters
             activity.append(iactivity)
         except Exception:
-            errors.append(uid)
+            errors.append((uid, idata["name"].squeeze()))
 
     # Convert into a DF and do a little munging
     activity = pd.concat(activity)
 
     # Write to a CSV for future ref
     activity.to_csv(path_activity, index=False)
     print(f"Finished loading hub activity data to {path_activity}...")
-    print(f"The following clusters had errors: {', '.join(errors)}")
+    if errors:
+        serrors = "\n".join(f"- {error}" for error in errors)
+        print(f"The following clusters had errors: {serrors}")
 else:
     print(f"Found data at {path_activity}, not downloading...")
