list-branch-pr

#!/usr/bin/env python3

"""Get a list of pull requests that need building.

Requires a GITHUB_TOKEN in the environment.

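This script's output is one commit to be built per line, with the following
fields separated by tab characters:

1. The type of build for this PR (i.e. whether it has been built before and,
   if so, how that build ended): "untested", "failed" or "succeeded".
2. The pull request number where progress should be reported (or the name of
   the base branch, if the commit is the head of the base branch).
3. The commit hash to be checked out and built.
4. The name of an *.env file, without the .env suffix, of the repository that
   this commit belongs to.
5. For untested commits, the Unix timestamp since which the commit has been
   waiting to be built; empty otherwise.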
"""

import functools
import glob
import hashlib
import os
import os.path
import random
import sys

from argparse import ArgumentParser, Namespace
from collections import defaultdict
from datetime import datetime, timezone

from gql import Client, gql
from gql.transport.requests import RequestsHTTPTransport
from alibot_helpers.utilities import parse_env_file
from alibot_helpers.github_utilities import \
    github_token, GithubCachedClient, setGithubStatus

# File name of the defaults file that may exist at each level of the
# definitions directory hierarchy. These files are parsed before the
# per-config .env file and are never treated as a build config themselves.
DEFAULTENV_NAME = "DEFAULTS.env"


@functools.lru_cache(maxsize=None)
def query_repo_info(session, org, repo, base_branch, include_base_branch=False):
    """Run the "statuses" operation of QUERY and return its "repository" part.

    Results are memoised per argument combination, so several build configs
    that point at the same repository and branch share a single API request.
    """
    variables = {
        "repoOwner": org,
        "repoName": repo,
        "baseBranch": base_branch,
        "includeBaseBranch": include_base_branch,
    }
    response = session.execute(QUERY, variables, "statuses")
    return response["repository"]


@functools.lru_cache(maxsize=None)
def query_team_members(session, org, team_slug):
    """Run the "team" operation of QUERY and return the members' logins.

    The logins are returned as a frozenset. Results are memoised per argument
    combination, so each team is only looked up once per run.
    """
    variables = {"repoOwner": org, "teamSlug": team_slug}
    response = session.execute(QUERY, variables, "team")
    members = response["organization"]["team"]["members"]["nodes"]
    return frozenset(member["login"] for member in members)


class RepoConfig:
    """Encapsulates a build configuration of the CI worker.

    The settings come from a chain of .env files (see __init__). Based on
    them, an instance decides which pull requests of the configured
    repository should be built by this worker, and in which state they are.
    """

    def __init__(self, build_config, definitions_dir,
                 mesos_role, container_name, config_suffix,
                 worker_index, worker_pool_size):
        """Load the build configuration called build_config.

        Settings are read, in this order, from DEFAULTS.env in
        definitions_dir, in definitions_dir/mesos_role and in
        definitions_dir/mesos_role/(container_name + config_suffix), and
        finally from build_config + ".env" in that last directory. Later
        files override earlier ones; missing files are skipped.

        worker_index and worker_pool_size are used by should_process to
        split the work between workers.

        Raises ValueError if neither file defines CHECK_NAME and PR_REPO.
        """
        self.build_config = build_config
        self.check_name = ""
        self.repo_name = ""
        self.branch_ref = "master"
        self.trusted_users = frozenset()
        self.trusted_team_slug = None
        self.trusted_author_associations = {"OWNER", "MEMBER", "COLLABORATOR"}
        self.worker_index = worker_index
        self.worker_pool_size = worker_pool_size

        # Parse .env files for this build config to initialise above variables.
        mesos, docker = mesos_role, container_name + config_suffix
        env_file_path = build_config + ".env"
        self._parse_env_files(
            os.path.join(definitions_dir, DEFAULTENV_NAME),
            os.path.join(definitions_dir, mesos, DEFAULTENV_NAME),
            os.path.join(definitions_dir, mesos, docker, DEFAULTENV_NAME),
            os.path.join(definitions_dir, mesos, docker, env_file_path),
        )

        if not self.check_name or not self.repo_name:
            raise ValueError("CHECK_NAME and PR_REPO are required")

        print("repo", self.build_config,
              "CHECK_NAME=%s REPO_NAME=%s BRANCH_REF=%s" % (
                  self.check_name, self.repo_name, self.branch_ref),
              sep=": ", file=sys.stderr)

    def _parse_env_files(self, *env_file_paths):
        """Apply settings from env_file_paths to this class.

        The files are applied in the given order, so values from later files
        override values from earlier ones. Paths that do not exist are
        silently skipped.
        """
        # This variable might be overridden, so only modify trusted_author_*
        # once we've loaded the whole chain of .env files.
        trust_collaborators = False

        # Some variables are used elsewhere in the CI builder, so they
        # are defined as shell variables. Handle these here (including
        # multiple assignments on one line). Note that non-assignments
        # containing "=" (e.g. command arguments), variables valid only
        # for one command, and the like confuse this simple approach.
        for env_file_path in env_file_paths:
            if not os.path.exists(env_file_path):
                continue
            for var, value in parse_env_file(env_file_path):
                if var == "PR_REPO":
                    self.repo_name = value
                elif var == "PR_BRANCH":
                    self.branch_ref = value
                elif var == "CHECK_NAME":
                    self.check_name = value
                elif var == "TRUST_COLLABORATORS":
                    # Shell-style boolean: value is True if non-empty.
                    trust_collaborators = bool(value)
                elif var == "TRUSTED_USERS":
                    self.trusted_users = frozenset(value.split(","))
                elif var == "TRUSTED_TEAM":
                    self.trusted_team_slug = value or None

        if trust_collaborators:
            self.trusted_author_associations.add("CONTRIBUTOR")

    def should_process(self, commit_sha):
        """Decide whether this worker should handle the given commit.

        The build config name and the commit hash are hashed together and
        mapped onto one index in range(worker_pool_size); the commit belongs
        to this worker if that index equals worker_index.
        """
        sha = hashlib.sha256()
        sha.update(self.build_config.encode("utf-8"))
        sha.update(commit_sha.encode("utf-8"))
        intended_worker = int(sha.hexdigest(), 16) % self.worker_pool_size
        return intended_worker == self.worker_index

    def _is_trusted(self, session, pull):
        """Return whether the PR is trusted, without touching its status.

        pull is one node of the "pullRequests" connection of QUERY. The team
        membership lookup is only performed if nothing else already makes the
        PR trusted, as it may cost an API call.
        """
        # GitHub reports a null author if the account has since been deleted.
        # Such a PR can only be trusted through a review or its association.
        author = pull.get("author")
        login = author["login"] if author else None
        return bool(
            # If this PR has any approving reviews, trust it.
            # We only include approving reviews in pull["reviews"].
            pull["reviews"]["isApproved"] or

            # Trust org members' and repo owners' PRs.
            # If requested, trust previous contributors' PRs.
            pull["authorAssociation"] in self.trusted_author_associations or

            # Trust trusted users.
            (login is not None and login in self.trusted_users) or

            # If we trust a team and the user is a member, trust this PR.
            (login is not None and self.trusted_team_slug and
             login in query_team_members(
                 session, self.repo_name.split("/")[0], self.trusted_team_slug
             ))
        )

    def trust_pr(self, cgh, session, pull):
        """Determine whether the PR is trustworthy and can be built.

        If we specified a list of trusted users, teams or if we trust
        contributors, we need to check if this is the case for the given PR.
        Some of these options (notably trusting a team) will actually consume
        API calls, so you need to be careful about what you enable.

        If the PR is not trusted, an appropriate status is set on its latest
        commit.
        """
        trusted = self._is_trusted(session, pull)
        if not trusted:
            setGithubStatus(cgh, Namespace(
                commit="{repo}#{pr}@{commit}".format(
                    repo=self.repo_name,
                    pr=pull["number"],
                    commit=pull["commits"]["nodes"][0]["commit"]["oid"],
                ),
                status="{}/pending".format(self.check_name),
                message="Security: approval needed, not starting",
                url=None,
            ), debug_print=False)
        return trusted

    def process_single_pr(self, pull_number, last_commit, pr_created,
                          is_draft=False, is_trusted=False):
        """Decide whether to queue this PR (or branch) for testing.

        pull_number is the PR number, or the branch name for a base branch.
        last_commit is a dict with the fields of the commitInfo fragment of
        QUERY, and pr_created is the creation timestamp of the PR.

        If it should be queued, return a (state, description) tuple, where
        state is "untested", "succeeded" or "failed"; else, return None.
        """
        commit_hash = last_commit["oid"]
        reviewed = tested = success = False
        if last_commit["status"] is not None:
            for ctx in last_commit["status"]["contexts"]:
                context, cstate = ctx["context"], ctx["state"]
                if context == "review" and cstate == "SUCCESS":
                    reviewed = True
                if context == self.check_name and \
                   cstate in ("SUCCESS", "ERROR", "FAILURE"):
                    tested = True
                    success = cstate == "SUCCESS"
        if self.should_process(commit_hash) and not is_draft and (reviewed or is_trusted):
            state = "untested" if not tested else \
                ("succeeded" if success else "failed")
        else:
            state = "skip"

        # The PR has been waiting for checks either since it was created or
        # since its latest commit was pushed. commit["pushedDate"] was
        # unfortunately deprecated and removed by GitHub, so committedDate
        # will have to do. Eliminate a common source for errors by falling
        # back to the PR creation time, in case the commit was created well
        # before the PR.
        # Either timestamp may be missing; comparing None with a string would
        # raise, so only compare the ones we actually have.
        timestamps = [stamp for stamp in
                      (pr_created, last_commit.get("committedDate")) if stamp]
        waiting_since = max(timestamps) if timestamps else None
        # pull_number is a branch name (a string) for base branches, so it
        # cannot be formatted with an integer format code.
        print(f"pr: {pull_number!s:>6}@{commit_hash:.7s}: wait={waiting_since} "
              f"revd={reviewed:d} trust={is_trusted:d} tested={tested:d} "
              f"success={success:d} => state={state}", file=sys.stderr)
        return None if state == "skip" else (state, {
            "number": pull_number,
            "sha": commit_hash,
            "build_config": self.build_config,
            "waiting_since": waiting_since,
        })

    def process_pulls(self, cgh, session, repo_info, show_base_branch=False):
        """Return pull requests we can process, with relevant attributes.

        This is a generator of the (state, description) tuples produced by
        process_single_pr for every open PR in repo_info and, if
        show_base_branch is true, for the head commit of the base branch.
        """
        for pull in repo_info["pullRequests"]["nodes"]:
            item = self.process_single_pr(
                pull["number"], pull["commits"]["nodes"][0]["commit"],
                pull["createdAt"],
                is_draft=pull["isDraft"] or pull["title"].startswith("[WIP]"),
                is_trusted=self.trust_pr(cgh, session, pull),
            )
            if item is not None:
                yield item

        if show_base_branch:
            base_commit = repo_info.get("object")
            if base_commit is None:
                # The base branch expression did not resolve to a commit.
                print("repo", self.build_config,
                      "no commit found for base branch %s" % self.branch_ref,
                      sep=": ", file=sys.stderr)
                return
            # A branch has no creation date of its own, so its head commit's
            # date is the only meaningful "waiting since" time. Do not reuse
            # the loop variable above: it belongs to an unrelated PR and is
            # undefined if there are no open PRs at all.
            # The base branch is checked with the default is_trusted=False,
            # so it is only queued if its head commit has a successful
            # "review" status. NOTE(review): confirm that this is intended.
            item = self.process_single_pr(self.branch_ref, base_commit,
                                          base_commit.get("committedDate"))
            if item is not None:
                yield item

    def __repr__(self):
        return "RepoConfig(%s)" % self.build_config


def main(args):
    """Find the commits this worker should build and print them on stdout.

    Every *.env file (except the defaults file) in the directory selected by
    args is loaded as a build config, its repository is queried, and the
    resulting PRs are grouped by state. Untested PRs take precedence; if
    there are none, a single failed or succeeded PR is picked for rebuilding.
    """
    env_files = glob.glob(os.path.join(
        args.definitions_dir, args.mesos_role,
        args.container_name + args.config_suffix, "*.env",
    ))
    # Maps each state ("untested", "failed", "succeeded") to its PRs.
    grouped = defaultdict(list)
    transport = RequestsHTTPTransport(url="https://api.github.com/graphql",
                                      auth=("bearer", github_token()))
    with GithubCachedClient() as cgh, Client(transport=transport) as session:
        for env_file in env_files:
            env_file_name = os.path.basename(env_file)
            if env_file_name == DEFAULTENV_NAME:
                # Defaults are pulled in by RepoConfig; they are not a config.
                continue

            try:
                repo = RepoConfig(env_file_name[:-4],
                                  args.definitions_dir, args.mesos_role,
                                  args.container_name, args.config_suffix,
                                  args.worker_index, args.worker_pool_size)
            except ValueError as err:
                # An incomplete config must not stop the other configs.
                print(env_file_name, err, sep=": ", file=sys.stderr)
                continue

            # Extend PR groups with PRs from this repo.
            org, _, repo_name = repo.repo_name.partition("/")
            repo_info = query_repo_info(session, org, repo_name,
                                        repo.branch_ref,
                                        args.show_base_branch)
            found = repo.process_pulls(cgh, session, repo_info,
                                       args.show_base_branch)
            for state, item in found:
                grouped[state].append(item)

    def print_prs(group, number=None):
        """Print PRs from group on stdout, or N randomly chosen ones."""
        candidates = grouped[group]
        if number is not None and len(candidates) >= number:
            candidates = random.sample(candidates, number)
        is_untested = group == "untested"
        # Sort by waiting time so that the longest-waiting PRs come first.
        # Use sorted() rather than .sort() so the list stored in grouped is
        # left untouched. "waiting_since" can be None, which cannot be
        # compared to strings, so substitute the empty string for sorting.
        for pull in sorted(candidates,
                           key=lambda pr: pr["waiting_since"] or ""):
            # If this PR has been built before, the commit time is a bit
            # meaningless -- the PR hasn't actually been waiting since
            # then for this check. Leave the field empty in that case.
            waiting_since = ""
            if is_untested:
                stamp = pull["waiting_since"]
                if stamp:
                    since = datetime.fromisoformat(
                        stamp.replace("Z", "+00:00"))
                else:
                    since = datetime.now(timezone.utc)
                waiting_since = str(int(since.timestamp()))
            print(group, pull["number"], pull["sha"], pull["build_config"],
                  waiting_since, sep="\t")

    if grouped["untested"]:
        # Anything that has never been built goes first, all of it.
        print_prs("untested")
    elif grouped["failed"] and (not grouped["succeeded"] or random.random() < 0.7):
        # Usually retry a red PR, but leave a chance for the next branch so
        # that a single red PR cannot keep green PRs from ever being rebuilt.
        print_prs("failed", 1)
    elif grouped["succeeded"]:
        print_prs("succeeded", 1)
    else:
        print("nothing to test:", grouped, file=sys.stderr)


def parse_args():
    """Parse command-line arguments.

    Several options fall back to an environment variable. Such an option is
    only required on the command line if its variable is unset, empty or
    cannot be converted to the option's type. Returns the namespace produced
    by ArgumentParser.parse_args().
    """
    parser = ArgumentParser(description=__doc__, epilog="""\
    Some options are required only when the corresponding environment variable
    is not set. In case both are given, the command-line option overrides the
    environment variable.""")

    def add_env_arg(short_name, long_name, env_var, vtype=str, **kwargs):
        """Add an argument that falls back to an environment variable."""
        if "help" in kwargs:
            # argparse applies %-formatting to help strings, so a literal "%"
            # in the environment value must be escaped, or showing the help
            # text (or, on newer Pythons, adding the argument) fails.
            shown_value = os.environ.get(env_var, "").replace("%", "%%")
            kwargs["help"] += (
                " (required if %(var)s is empty; default %(var)s=%(value)s)"
                % {"var": env_var, "value": shown_value})
        # Ignore empty values from the environment!
        if os.environ.get(env_var):
            env_value = os.environ[env_var]
            try:
                typed_env_value = vtype(env_value)
            except ValueError:
                # Fall through if the value is of the incorrect type. If we
                # raised an error here, command-line options couldn't override
                # invalid values from the environment. Instead, falling through
                # makes the cmd-line option required, which is what we want.
                pass
            else:
                parser.add_argument(
                    short_name, long_name, required=False, type=vtype,
                    default=typed_env_value, **kwargs)
                return
        parser.add_argument(
            short_name, long_name, required=True, type=vtype, **kwargs)

    parser.add_argument(
        "--definitions-dir", metavar="DIR",
        default=os.path.join("ali-bot", "ci", "repo-config"),
        help=("directory where .env files are located in a hierarchy; expects "
              "a directory structure of the form DIR/ROLE/CONTAINER/*.env "
              "(default %(default)s)"))

    parser.add_argument(
        "-b", "--show-base-branch", action="store_true",
        help=("Also consider checks on the latest commit of each repo's base "
              "branch."))

    add_env_arg("-i", "--worker-index", "WORKER_INDEX", vtype=int,
                help="Index for the current worker")

    add_env_arg("-n", "--worker-pool-size", "WORKERS_POOL_SIZE", vtype=int,
                help="Total number of workers")

    add_env_arg("-r", "--mesos-role", "MESOS_ROLE",
                help="Mesos role of the current worker")

    add_env_arg("-c", "--container-name", "CUR_CONTAINER",
                help="Short name of the container we're running in, e.g. slc8")

    parser.add_argument(
        "-s", "--config-suffix", metavar="SUFFIX",
        default=os.environ.get("ALIBOT_CONFIG_SUFFIX", ""),
        help=("Suffix to disambiguate which .env files should be chosen for the"
              " current Mesos role and container (default ALIBOT_CONFIG_SUFFIX="
              "%(default)s or empty if undefined). If %(metavar)s starts with a"
              " dash, use -s=%(metavar)s instead of -s %(metavar)s."))

    return parser.parse_args()


# GraphQL document containing both operations used by this script. Callers
# select one of them by passing its operation name to session.execute():
# "statuses" (see query_repo_info) or "team" (see query_team_members).
# NOTE(review): with orderBy UPDATED_AT DESC, "last: 75" presumably selects
# the tail of the descending list, i.e. the least recently updated PRs --
# confirm that this is the intended selection.
# NOTE(review): the "members" connection is requested without a first/last
# argument, which GitHub's GraphQL API documents as required on connections --
# TODO confirm the "team" operation works as written, and how teams with more
# members than one page are meant to be handled.
QUERY = gql("""\
query statuses(
  $repoOwner: String!
  $repoName: String!
  $baseBranch: String!
  $includeBaseBranch: Boolean!
) {
  repository(owner: $repoOwner, name: $repoName) {
    # Fetch status for the latest commit on the base branch too, if requested.
    object(expression: $baseBranch) @include(if: $includeBaseBranch) {
      ...commitInfo
    }

    pullRequests(
      last: 75
      baseRefName: $baseBranch
      states: OPEN
      orderBy: { field: UPDATED_AT, direction: DESC }
    ) {
      nodes {
        number
        title
        isDraft
        createdAt
        authorAssociation
        author {
          login
        }
        # APPROVED reviews do not include dismissed approvals, so no need to
        # check the commit ID matches.
        # We only need to know whether this PR has been approved at all (the
        # only values that matter for totalCount are 0 and >0), so limit our
        # query to the latest APPROVED review, if any.
        reviews(last: 1, states: APPROVED) {
          isApproved: totalCount
        }
        commits(last: 1) {
          nodes {
            commit {
              ...commitInfo
            }
          }
        }
      }
    }
  }
}

fragment commitInfo on Commit {
  oid
  committedDate
  status {
    contexts {
      context
      state
    }
  }
}

query team($repoOwner: String!, $teamSlug: String!) {
  organization(login: $repoOwner) {
    team(slug: $teamSlug) {
      members {
        nodes {
          login
        }
      }
    }
  }
}
""")


# Only run when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main(parse_args())