Updates to github metrics (fix typo in module name) (#1246)
Co-authored-by: Dennis <[email protected]>
lithium323 and dioptx authored Jan 14, 2025
1 parent e8bcfd2 commit 032e6ac
Showing 27 changed files with 544 additions and 128 deletions.
4 changes: 2 additions & 2 deletions Makefile
@@ -77,8 +77,8 @@ sphinx-serve: .makemarkers/sphinx-docs
 # DOCKER IMAGE
 # ----------------------------------------------------------------------------#

-IMAGE_TAG = ghcr.io/lithium323/op-analytics:v20250110.3
-IMAGE_TAG_DAGSTER = ghcr.io/lithium323/op-analytics-dagster:v20250110.006
+IMAGE_TAG = ghcr.io/lithium323/op-analytics:v20250114.1
+IMAGE_TAG_DAGSTER = ghcr.io/lithium323/op-analytics-dagster:v20250114.002

 .PHONY: uv-build
 uv-build:
2 changes: 1 addition & 1 deletion helm/dagster/values.yaml
@@ -296,7 +296,7 @@ dagster-user-deployments:
   image:
     # When a tag is not supplied, it will default to the Helm chart version.
     repository: "ghcr.io/lithium323/op-analytics-dagster"
-    tag: "v20250110.006"
+    tag: "v20250114.002"

   # Change with caution! If you're using a fixed tag for pipeline run images, changing the
   # image pull policy to anything other than "Always" will use a cached/stale image, which is
2 changes: 1 addition & 1 deletion k8s/ingestion.yaml
@@ -16,7 +16,7 @@ spec:
       containers:
         - name: python-runner
           imagePullPolicy: IfNotPresent
-          image: ghcr.io/lithium323/op-analytics:v20250110.3
+          image: ghcr.io/lithium323/op-analytics:v20250114.1
           command: ["tini", "-v", "--", "opdata"]
           args: ["chains", "noargs_ingest"]
           env:
2 changes: 1 addition & 1 deletion k8s/load-public-bq.yaml
@@ -16,7 +16,7 @@ spec:
       containers:
         - name: python-runner
           imagePullPolicy: IfNotPresent
-          image: ghcr.io/lithium323/op-analytics:v20250110.3
+          image: ghcr.io/lithium323/op-analytics:v20250114.1
           command: ["tini", "-v", "--", "opdata"]
           args: ["chains", "noargs_public_bq"]
           env:
2 changes: 1 addition & 1 deletion k8s/models-blockbatch.yaml
@@ -16,7 +16,7 @@ spec:
      containers:
         - name: python-runner
           imagePullPolicy: IfNotPresent
-          image: ghcr.io/lithium323/op-analytics:v20250110.3
+          image: ghcr.io/lithium323/op-analytics:v20250114.1
           command: ["tini", "-v", "--", "opdata"]
           args: ["chains", "noargs_blockbatch"]
           env:
2 changes: 1 addition & 1 deletion k8s/models-daily.yaml
@@ -16,7 +16,7 @@ spec:
       containers:
         - name: python-runner
           imagePullPolicy: IfNotPresent
-          image: ghcr.io/lithium323/op-analytics:v20250110.3
+          image: ghcr.io/lithium323/op-analytics:v20250114.1
           command: ["tini", "-v", "--", "opdata"]
           args: ["chains", "noargs_intermediate"]
           env:
440 changes: 399 additions & 41 deletions notebooks/adhoc/platform_github_metrics/utilities_prototyping.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "op-analytics"
-version = "20250110.3"
+version = "20250114.1"
 description = "Data analysis tools by OP Labs."
 readme = "README.md"
 requires-python = ">=3.12"
@@ -1,10 +1,11 @@
 from dataclasses import dataclass
+from datetime import timedelta

 import polars as pl

 from op_analytics.coreutils.logger import structlog
 from op_analytics.coreutils.threads import run_concurrently
-from op_analytics.coreutils.time import now_dt
+from op_analytics.coreutils.time import now_dt, date_fromstr

 from .singlerepo import GithubRepoActivityData

@@ -40,19 +41,34 @@ class GithubActivityData:
     @classmethod
     def fetch(
         cls,
-        current_dt: str | None = None,
-        closed_items_last_n_days: int | None = None,
+        partition_dt: str | None = None,
+        include_open: bool = True,
+        closed_min_dt: str | None = None,
+        closed_max_dt: str | None = None,
         repo_concurrent_workers: int = 4,
     ) -> "GithubActivityData":
-        current_dt = current_dt or now_dt()
-        closed_items_last_n_days = closed_items_last_n_days or CLOSED_ITEMS_LAST_N_DAYS
+        partition_dt = partition_dt or now_dt()
+        closed_max_dt = closed_max_dt or partition_dt
+
+        if closed_min_dt is None:
+            closed_min = date_fromstr(partition_dt) - timedelta(days=CLOSED_ITEMS_LAST_N_DAYS)
+            closed_min_dt = closed_min.strftime("%Y-%m-%d")
+
+        log.info(
+            "github activity fetch",
+            include_open=include_open,
+            closed_min_dt=closed_min_dt,
+            closed_max_dt=closed_max_dt,
+        )

         # Fetch analytics for all repos.
         repo_dfs: dict[str, GithubRepoActivityData] = run_concurrently(
             lambda repo: GithubRepoActivityData.fetch(
                 repo=repo,
-                current_dt=current_dt,
-                closed_items_last_n_days=closed_items_last_n_days,
+                partition_dt=partition_dt,
+                include_open=include_open,
+                closed_min_dt=closed_min_dt,
+                closed_max_dt=closed_max_dt,
             ),
             targets=REPOS,
             max_workers=repo_concurrent_workers,
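The new `fetch` signature replaces the relative `closed_items_last_n_days` lookback with an explicit `[closed_min_dt, closed_max_dt)` date window, with `partition_dt` naming the output partition. A minimal sketch of how the defaults resolve, assuming `now_dt` and `date_fromstr` behave like their stdlib equivalents (the `CLOSED_ITEMS_LAST_N_DAYS` value here is a placeholder, not the module's real constant):

    # Sketch only: now_dt/date_fromstr are assumed to be thin stdlib wrappers;
    # CLOSED_ITEMS_LAST_N_DAYS = 7 is an illustrative value.
    from datetime import date, timedelta

    CLOSED_ITEMS_LAST_N_DAYS = 7

    def resolve_window(
        partition_dt: str | None = None,
        closed_min_dt: str | None = None,
        closed_max_dt: str | None = None,
    ) -> tuple[str, str, str]:
        # partition_dt defaults to today; the window's upper bound defaults
        # to the partition date itself.
        partition_dt = partition_dt or date.today().strftime("%Y-%m-%d")
        closed_max_dt = closed_max_dt or partition_dt
        if closed_min_dt is None:
            closed_min = date.fromisoformat(partition_dt) - timedelta(days=CLOSED_ITEMS_LAST_N_DAYS)
            closed_min_dt = closed_min.strftime("%Y-%m-%d")
        return partition_dt, closed_min_dt, closed_max_dt

    # resolve_window("2025-01-14") -> ("2025-01-14", "2025-01-07", "2025-01-14")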
@@ -1,6 +1,5 @@
 import time
 from dataclasses import dataclass
-from datetime import timedelta
 from threading import Lock

 import polars as pl
@@ -9,7 +8,7 @@

 from op_analytics.coreutils.env.vault import env_get
 from op_analytics.coreutils.logger import bound_contextvars, structlog
-from op_analytics.coreutils.time import datetime_fromdt, parse_isoformat
+from op_analytics.coreutils.time import parse_isoformat, datetime_fromdt

 from .comments import COMMENTS_SCHEMA, comment_to_row
 from .issues import ISSUES_SCHEMA, issue_to_row
@@ -48,8 +47,9 @@ def path(self):

 def fetch_prs(
     repo: OptimismRepo,
-    current_dt: str,
-    last_n_days: int,
+    include_open: bool,
+    closed_min_dt: str,
+    closed_max_dt: str,
 ) -> pl.DataFrame:
     """Fetch the current state of pull requests."""
     g = init_client()
@@ -59,16 +59,18 @@
     rows = fetch_prs_or_issues(
         paginator=repo_obj.get_pulls,
         to_row_func=pr_to_row,
-        current_dt=current_dt,
-        last_n_days=last_n_days,
+        include_open=include_open,
+        closed_min_dt=closed_min_dt,
+        closed_max_dt=closed_max_dt,
     )
     return pl.DataFrame(rows, schema=PRS_SCHEMA)


 def fetch_issues(
     repo: OptimismRepo,
-    current_dt: str,
-    last_n_days: int,
+    include_open: bool,
+    closed_min_dt: str,
+    closed_max_dt: str,
 ) -> pl.DataFrame:
     """Fetch the current state of issues."""
     g = init_client()
@@ -78,8 +80,9 @@
     rows = fetch_prs_or_issues(
         paginator=repo_obj.get_issues,
         to_row_func=issue_to_row,
-        current_dt=current_dt,
-        last_n_days=last_n_days,
+        include_open=include_open,
+        closed_min_dt=closed_min_dt,
+        closed_max_dt=closed_max_dt,
     )
     return pl.DataFrame(rows, schema=ISSUES_SCHEMA)

@@ -147,8 +150,9 @@ def fetch_reviews(repo_obj: Repository, pr_number: int) -> list[dict]:

 def fetch_prs_or_issues(
     paginator,
     to_row_func,
-    current_dt: str,
-    last_n_days: int,
+    include_open: bool,
+    closed_min_dt: str,
+    closed_max_dt: str,
 ) -> list[dict]:
     """Helper function to fetch pull requests or issues.
@@ -170,28 +174,28 @@
     If we want to backfill data we can set the threshold time way back and that way we will
     paginate through all of the closed items.
     """
-
-    threshold = datetime_fromdt(current_dt) - timedelta(days=last_n_days)
-    assert threshold.tzinfo is None
+    min_dt = datetime_fromdt(closed_min_dt)
+    max_dt = datetime_fromdt(closed_max_dt)

     start_time = time.time()
     open_prs_response = list(paginator(state="open", sort="created", direction="desc"))
     log.info(f"fetched {len(open_prs_response)} open in {time.time() - start_time:.2f}s")

     open_prs = []
-    for open_pr in open_prs_response:
-        open_prs.append(to_row_func(open_pr))
+    if include_open:
+        for open_pr in open_prs_response:
+            open_prs.append(to_row_func(open_pr))

     closed_prs = []
     start_time = time.time()
     for closed_pr in paginator(state="closed", sort="updated", direction="desc"):
         closed_at = parse_isoformat(closed_pr._rawData["closed_at"])
-        if closed_at > threshold:
+        if closed_at >= min_dt and closed_at < max_dt:
             closed_prs.append(to_row_func(closed_pr))
         else:
             break
     log.info(
-        f"found {len(closed_prs)} closed after {threshold.date()} in {time.time() - start_time:.2f}s"
+        f"found {len(closed_prs)} closed between {min_dt.strftime('%Y-%m-%d')} and {max_dt.strftime('%Y-%m-%d')} in {time.time() - start_time:.2f}s"
     )

     return open_prs + closed_prs
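The filter is now a half-open interval: an item closed exactly at `closed_min_dt` is kept, one closed exactly at `closed_max_dt` is not, so back-to-back runs whose `closed_max_dt` equals the next run's `closed_min_dt` never double count. A small self-contained check of that boundary behavior (naive datetimes, as in the code above):

    from datetime import datetime

    def in_window(closed_at: datetime, min_dt: datetime, max_dt: datetime) -> bool:
        # Same predicate as above: min_dt <= closed_at < max_dt.
        return closed_at >= min_dt and closed_at < max_dt

    min_dt = datetime(2025, 1, 7)
    max_dt = datetime(2025, 1, 14)
    assert in_window(datetime(2025, 1, 7), min_dt, max_dt)       # lower bound included
    assert not in_window(datetime(2025, 1, 14), min_dt, max_dt)  # upper bound excluded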
@@ -31,22 +31,39 @@ class GithubRepoActivityData:

     @classmethod
     def fetch(
-        cls, repo: str, current_dt: str, closed_items_last_n_days: int
+        cls,
+        repo: str,
+        include_open: bool,
+        partition_dt: str,
+        closed_min_dt: str,
+        closed_max_dt: str,
     ) -> "GithubRepoActivityData":
         with bound_contextvars(repo=repo):
             repo_obj = OptimismRepo(repo)

-            prs = fetch_prs(repo_obj, current_dt, closed_items_last_n_days)
-            issues = fetch_issues(repo_obj, current_dt, closed_items_last_n_days)
+            prs = fetch_prs(
+                repo_obj,
+                include_open=include_open,
+                closed_min_dt=closed_min_dt,
+                closed_max_dt=closed_max_dt,
+            )
+            issues = fetch_issues(
+                repo_obj,
+                include_open=include_open,
+                closed_min_dt=closed_min_dt,
+                closed_max_dt=closed_max_dt,
+            )

-            pr_comments = bulk_fetch_comments(repo_obj, prs["number"].to_list())
-            reviews = bulk_fetch_reviews(repo_obj, prs["number"].to_list())
+            # For all the fetched prs also fetch comments and reviews.
+            pr_list = prs["number"].to_list()
+            pr_comments = bulk_fetch_comments(repo_obj, pr_list)
+            pr_reviews = bulk_fetch_reviews(repo_obj, pr_list)

-            extracols = dict(repo=pl.lit(repo), dt=pl.lit(current_dt))
+            extracols = dict(repo=pl.lit(repo), dt=pl.lit(partition_dt))

             return cls(
                 prs=prs.with_columns(**extracols),
                 issues=issues.with_columns(**extracols),
                 pr_comments=pr_comments.with_columns(**extracols),
-                pr_reviews=reviews.with_columns(**extracols),
+                pr_reviews=pr_reviews.with_columns(**extracols),
             )
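The `extracols` pattern stamps constant `repo` and `dt` partition columns onto every row before the frames are written out. A toy example of what `with_columns(**extracols)` produces (the schema is illustrative, not the real PRS_SCHEMA):

    import polars as pl

    prs = pl.DataFrame({"number": [101, 102]})
    extracols = dict(repo=pl.lit("example-repo"), dt=pl.lit("2025-01-14"))

    # Every row gets the same literal values in the new columns:
    # shape (2, 3) with columns number, repo, dt.
    print(prs.with_columns(**extracols))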
2 changes: 1 addition & 1 deletion src/op_analytics/cli/subcommands/pulls/github/execute.py
@@ -1,6 +1,6 @@
 from op_analytics.coreutils.logger import structlog

-from .actvity.allrepos import GithubActivityData
+from .activity.allrepos import GithubActivityData
 from .dataaccess import GithubDataset
 from .traffic.allrepos import GithubTrafficData
 from .traffic.bigquery import write_traffic_to_bq
64 changes: 37 additions & 27 deletions src/op_analytics/cli/subcommands/pulls/github/traffic/allrepos.py
@@ -35,36 +35,46 @@

 @dataclass
 class GithubTrafficData:
-    # Metrics for all repositories. Concatenated in long form. Metrics inluded:
-    #
-    # - views_total
-    # - views_unique
-    # - clones_total
-    # - clones_unique
-    # - forks_total
-    #
-    # For views and clones the Github APIs report the last 14 days (there are
-    # 15 distinct dates in the result). To avoid issues with overwriting data
-    # we only keep the last 2 days of fully reported data.
-    #
-    # If we fetch on day N, we only keep dates N-1 and N-2. We discard day N
-    # as it is possibly incomplete. The rest of the window is discarded because
-    # we don't what to overwrite it unnecessarily and also day N-14 is also
-    # possibly incomplete.
-    #
-    # On the forks endpoint github reports all historicals so we don't have
-    # a similar windowing problem at day N-14. Howeve datta at day N still may
-    # be incomplete, so we discard it.
+    """Traffic data for all repos.
+
+    Metrics
+    =======
+    We include the following metrics:
+
+    - views_total
+    - views_unique
+    - clones_total
+    - clones_unique
+    - forks_total
+
+    For views and clones the Github APIs report the last 14 days (there are 15
+    distinct dates in the result). To avoid issues with overwriting data we
+    only keep the last 2 days of fully reported data.
+
+    If we fetch on day N, we only keep dates N-1 and N-2. Days N and N-14 will be
+    incomplete, so we discard them. The rest of the window is also discarded
+    because we don't want to overwrite it unnecessarily.
+
+    On the forks endpoint github reports all historicals so we don't have a
+    similar windowing problem at day N-14. However data at day N still may be
+    incomplete, so we discard it.
+
+    Referrers
+    =========
+    The Github API does not break down referrals by date. That makes analysis
+    somewhat complicated since one has to manually take care of any reporting
+    overlaps that may exist.
+
+    The referrers df is a snapshot of the value reported by Github at fetch
+    time. The "dt" column is the fetch date.
+    """

+    # Metrics for all repositories. Concatenated in long form.
     all_metrics_df_truncated: pl.DataFrame

     # Referrers data for all repositories. Concatenated in long form.
-    # This is a snapshot of the value reported by Github at the time of
-    # fetching. The "dt" value correponds to the date when the data was fetched
-    # from the API.
-    #
-    # The Github API does not breakdown referals by date. That makes analysis
-    # somewhat complicated since one has to manually take care of any reporting
-    # overlaps that may exist.
     referrers_snapshot_df: pl.DataFrame

     @classmethod
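The windowing rule in the new docstring (fetch on day N, keep only N-1 and N-2) is straightforward to express as a filter on the long-form frame. A sketch, assuming a `dt` column of dtype `pl.Date` (the actual truncation code in the module may differ):

    from datetime import date, timedelta

    import polars as pl

    def truncate_to_complete_days(df: pl.DataFrame, fetch_date: date) -> pl.DataFrame:
        # Day N is possibly incomplete and older days would overwrite
        # already-written data, so only N-1 and N-2 survive.
        keep = [fetch_date - timedelta(days=2), fetch_date - timedelta(days=1)]
        return df.filter(pl.col("dt").is_in(keep))

    # Fetched on 2025-01-14 -> only 2025-01-12 and 2025-01-13 are kept.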
Empty file.
Empty file.
@@ -9,7 +9,7 @@ def volumes_fees_revenue(context: OpExecutionContext):
     context.log.info(result)


-@asset(deps=["volumes_fees_revenue"])
+@asset(deps=[volumes_fees_revenue])
 def volumes_fees_revenue_to_clickhouse(context: OpExecutionContext):
     from op_analytics.cli.subcommands.pulls.defillama import volume_fees_revenue

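Passing the asset object instead of its string name declares the same dependency, but a rename now fails at import time instead of silently pointing at a missing key. Dagster accepts both forms:

    from dagster import OpExecutionContext, asset

    @asset
    def upstream(context: OpExecutionContext) -> None:
        context.log.info("upstream ran")

    @asset(deps=[upstream])  # equivalent to deps=["upstream"], but rename-safe
    def downstream(context: OpExecutionContext) -> None:
        context.log.info("downstream ran")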
Empty file.
@@ -1,27 +1,24 @@
-from dagster import (
-    OpExecutionContext,
-    asset,
-)
+from dagster import OpExecutionContext, asset


 @asset
-def github_traffic(context: OpExecutionContext) -> None:
+def traffic(context: OpExecutionContext) -> None:
     from op_analytics.cli.subcommands.pulls.github import execute

     result = execute.execute_pull_traffic()
     context.log.info(result)


 @asset
-def github_activity(context: OpExecutionContext) -> None:
+def activity(context: OpExecutionContext) -> None:
     from op_analytics.cli.subcommands.pulls.github import execute

     result = execute.execute_pull_activity()
     context.log.info(result)


-@asset(deps=["github_traffic", "github_activity"])
-def github_data_to_clickhouse(context: OpExecutionContext) -> None:
+@asset(deps=[traffic, activity])
+def write_to_clickhouse(context: OpExecutionContext) -> None:
     from op_analytics.cli.subcommands.pulls.github import execute

     result = execute.insert_to_clickhouse()
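Dropping the `github_` prefix shortens the in-module names but also changes the asset keys. One hedged sketch of how such names can stay namespaced when assets are collected into a `Definitions` object (the module path below is hypothetical; this repo may wire its definitions differently):

    from dagster import Definitions, load_assets_from_modules

    # Hypothetical module path, for illustration only.
    from op_analytics.dagster.assets import github

    defs = Definitions(
        # key_prefix keeps the shortened names (traffic, activity, ...)
        # unique across asset modules.
        assets=load_assets_from_modules([github], key_prefix="github"),
    )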