From ec5ba3bb86f6354def0b41d065b95429037db428 Mon Sep 17 00:00:00 2001
From: Pablo Seibelt
Date: Fri, 5 May 2023 12:56:06 -0300
Subject: [PATCH] feat: Add traffic streams (#193)

* Fix stargazers empty

* Have to convert to list first

* Add new streams to collect traffic data

* Run black formatter

* Fix error message

* Apply suggestion

* Sorry, just realized my primary key was wrong

* Add condition to skip invalid data

* Add note about permissions

* Accept Edgar's suggestion

Co-authored-by: Edgar R. M.

* Accept Edgar's suggestion

Co-authored-by: Edgar R. M.

* Update tap_github/repository_streams.py

Co-authored-by: Edgar R. M.

* Update README.md

Co-authored-by: Edgar R. M.

---------

Co-authored-by: Edgar R. M.
---
 README.md                        |   4 +
 tap_github/repository_streams.py | 131 +++++++++++++++++++++++++++++++
 tap_github/streams.py            |   8 ++
 3 files changed, 143 insertions(+)

diff --git a/README.md b/README.md
index 94fc506d..e141a40f 100644
--- a/README.md
+++ b/README.md
@@ -71,6 +71,10 @@ To avoid this, the GitHub streams will exit early. I.e. when there are no more `
 
 You can easily run `tap-github` by itself or in a pipeline using [Meltano](www.meltano.com).
 
+### Notes regarding permissions
+
+* For the `traffic_*` streams, [you will need write access to the repository](https://docs.github.com/en/rest/metrics/traffic?apiVersion=2022-11-28). You can enable extraction for these streams by [selecting them in the catalog](https://hub.meltano.com/singer/spec/#metadata).
+
 ### Executing the Tap Directly
 
 ```bash
diff --git a/tap_github/repository_streams.py b/tap_github/repository_streams.py
index 3f3bdfae..9a372aef 100644
--- a/tap_github/repository_streams.py
+++ b/tap_github/repository_streams.py
@@ -2244,3 +2244,134 @@ def query(self) -> str:
             ),
         ),
     ).to_dict()
+
+
+class TrafficRestStream(GitHubRestStream):
+    """Base class for Traffic Streams"""
+
+    @property
+    def metadata(self):
+        """Override default selection metadata for this stream.
+
+        TODO: Remove this in favor of the recommended approach when the SDK has one.
+        """
+        result = super().metadata
+        if self._tap_input_catalog is None:
+            result.root.selected = False
+        return result
+
+    def parse_response(self, response: requests.Response) -> Iterable[dict]:
+        """Yield rows from a 200 response; tolerated errors (e.g. 403) yield nothing."""
+        if response.status_code != 200:
+            return
+
+        yield from extract_jsonpath(self.records_jsonpath, input=response.json())
+
+    def validate_response(self, response: requests.Response) -> None:
+        """Allow some specific errors.
+        Do not raise if a 403 says "Resource not accessible by integration": we expect
+        that in this stream when we don't have write permissions on the repository.
+        """
+        if response.status_code == 403:
+            contents = response.json()
+            if contents.get("message") == "Resource not accessible by integration":
+                self.logger.info("Permissions missing to sync stream '%s'", self.name)
+                return
+        super().validate_response(response)
+
+
+class TrafficClonesStream(TrafficRestStream):
+    """Defines 'traffic_clones' stream."""
+
+    name = "traffic_clones"
+    path = "/repos/{org}/{repo}/traffic/clones"
+    primary_keys = ["repo", "org", "timestamp"]
+    replication_key = "timestamp"
+    parent_stream_type = RepositoryStream
+    ignore_parent_replication_key = True
+    state_partitioning_keys = ["repo", "org"]
+    records_jsonpath = "$.clones[*]"
+
+    schema = th.PropertiesList(
+        # Parent keys
+        th.Property("repo", th.StringType),
+        th.Property("org", th.StringType),
+        th.Property("repo_id", th.IntegerType),
+        # Clones Data
+        th.Property("timestamp", th.DateTimeType),
+        th.Property("count", th.IntegerType),
+        th.Property("uniques", th.IntegerType),
+    ).to_dict()
+
+
+class TrafficReferralPathsStream(TrafficRestStream):
+    """Defines 'traffic_referral_paths' stream."""
+
+    name = "traffic_referral_paths"
+    path = "/repos/{org}/{repo}/traffic/popular/paths"
+    primary_keys = ["repo", "org", "path"]
+    replication_key = None
+    parent_stream_type = RepositoryStream
+    ignore_parent_replication_key = True
+    state_partitioning_keys = ["repo", "org"]
+    records_jsonpath = "[*]"
+
+    schema = th.PropertiesList(
+        # Parent keys
+        th.Property("repo", th.StringType),
+        th.Property("org", th.StringType),
+        th.Property("repo_id", th.IntegerType),
+        # Referral path data
+        th.Property("path", th.StringType),
+        th.Property("title", th.StringType),
+        th.Property("count", th.IntegerType),
+        th.Property("uniques", th.IntegerType),
+    ).to_dict()
+
+
+class TrafficReferrersStream(TrafficRestStream):
+    """Defines 'traffic_referrers' stream."""
+
+    name = "traffic_referrers"
+    path = "/repos/{org}/{repo}/traffic/popular/referrers"
+    primary_keys = ["repo", "org", "referrer"]
+    replication_key = None
+    parent_stream_type = RepositoryStream
+    ignore_parent_replication_key = True
+    state_partitioning_keys = ["repo", "org"]
+    records_jsonpath = "[*]"
+
+    schema = th.PropertiesList(
+        # Parent keys
+        th.Property("repo", th.StringType),
+        th.Property("org", th.StringType),
+        th.Property("repo_id", th.IntegerType),
+        # Referrer data
+        th.Property("referrer", th.StringType),
+        th.Property("count", th.IntegerType),
+        th.Property("uniques", th.IntegerType),
+    ).to_dict()
+
+
+class TrafficPageViewsStream(TrafficRestStream):
+    """Defines 'traffic_pageviews' stream."""
+
+    name = "traffic_pageviews"
+    path = "/repos/{org}/{repo}/traffic/views"
+    primary_keys = ["repo", "org", "timestamp"]
+    replication_key = None
+    parent_stream_type = RepositoryStream
+    ignore_parent_replication_key = True
+    state_partitioning_keys = ["repo", "org"]
+    records_jsonpath = "$.views[*]"
+
+    schema = th.PropertiesList(
+        # Parent keys
+        th.Property("repo", th.StringType),
+        th.Property("org", th.StringType),
+        th.Property("repo_id", th.IntegerType),
+        # Page view data
+        th.Property("timestamp", th.DateTimeType),
+        th.Property("count", th.IntegerType),
+        th.Property("uniques", th.IntegerType),
+    ).to_dict()
diff --git a/tap_github/streams.py b/tap_github/streams.py
index c1dc9cad..e1b05e58 100644
--- a/tap_github/streams.py
+++ b/tap_github/streams.py
@@ -41,6 +41,10 @@
     StargazersGraphqlStream,
     StargazersStream,
     StatsContributorsStream,
+    TrafficClonesStream,
+    TrafficPageViewsStream,
+    TrafficReferralPathsStream,
+    TrafficReferrersStream,
     WorkflowRunJobsStream,
     WorkflowRunsStream,
     WorkflowsStream,
@@ -94,6 +98,10 @@ def __init__(self, valid_queries: Set[str], streams: List[Type[Stream]]):
             StargazersGraphqlStream,
             StargazersStream,
             StatsContributorsStream,
+            TrafficClonesStream,
+            TrafficPageViewsStream,
+            TrafficReferralPathsStream,
+            TrafficReferrersStream,
             WorkflowRunJobsStream,
             WorkflowRunsStream,
             WorkflowsStream,