From 7fc139654ce1d269137b0476c9654f3e4331fc9e Mon Sep 17 00:00:00 2001 From: stefanodallapalma Date: Thu, 17 Sep 2020 14:56:04 +0200 Subject: [PATCH] GithubMiner ported from radon-h2020/radon-iac-miner --- .gitignore | 3 + MANIFEST.in | 1 + README.md | 68 ++++++++++++++ collector/__init__.py | 0 collector/command_line.py | 168 +++++++++++++++++++++++++++++++++++ collector/github.py | 180 ++++++++++++++++++++++++++++++++++++++ collector/report.py | 141 +++++++++++++++++++++++++++++ config.json | 1 + requirements.txt | 1 + setup.cfg | 2 + setup.py | 40 +++++++++ 11 files changed, 605 insertions(+) create mode 100644 MANIFEST.in create mode 100644 collector/__init__.py create mode 100644 collector/command_line.py create mode 100644 collector/github.py create mode 100644 collector/report.py create mode 100644 config.json create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index b6e4761..7887cc1 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# PyCharm +.idea/ \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..540b720 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt \ No newline at end of file diff --git a/README.md b/README.md index 838a61f..216519e 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,70 @@ # radon-repositories-collector A tool to query GraphQL for collecting repositories metadata. + + +## How to install + +A PyPIP package will be available soon! +In the meantime, install it from source code with: + +``` +pip install -r requirements.txt +pip install . 
+``` + +## Command-line usage + +``` +usage: radon-repositories-collector [-h] [-v] [--from DATE_FROM] + [--to DATE_TO] [--pushed-after DATE_PUSH] + [--min-issues MIN_ISSUES] + [--min-releases MIN_RELEASES] + [--min-stars MIN_STARS] + [--min-watchers MIN_WATCHERS] [--verbose] + dest + +A Python library to collect repositories metadata from GitHub. + +positional arguments: + dest destination folder for report + +optional arguments: + -h, --help show this help message and exit + -v, --version show program's version number and exit + --from DATE_FROM collect repositories created since this date (default: + 2014-01-01 00:00:00) + --to DATE_TO collect repositories created up to this date (default: + 2014-01-01 00:00:00) + --pushed-after DATE_PUSH + collect only repositories pushed after this date + (default: 2019-01-01 00:00:00) + --min-issues MIN_ISSUES + collect repositories with at least issues + (default: 0) + --min-releases MIN_RELEASES + collect repositories with at least + releases (default: 0) + --min-stars MIN_STARS + collect repositories with at least stars + (default: 0) + --min-watchers MIN_WATCHERS + collect repositories with at least + watchers (default: 0) + --verbose show log (default: False) +``` + + +**Important!** The tool requires a personal access token to access the GraphQL APIs. See how to get one [here](https://github.com/settings/tokens). +Once generated, paste the token in the input field when asked. For example: + +``` +radon-repositories-collector . --from 2020-01-01 --to 2020-01-02 + +Github access token: +``` + +You may want to avoid the previous step. If so, add ```GITHUB_ACCESS_TOKEN=``` to the environment variables. + + +### Output +Running the tool from command-line generates an HTML report accessible at *\/report.html*. 
\ No newline at end of file diff --git a/collector/__init__.py b/collector/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/collector/command_line.py b/collector/command_line.py new file mode 100644 index 0000000..2e36882 --- /dev/null +++ b/collector/command_line.py @@ -0,0 +1,168 @@ +import argparse +import copy +import io +import json +import os + +from datetime import datetime +from dotenv import load_dotenv +from getpass import getpass + +from collector.github import GithubRepositoriesCollector +from collector.report import create_report + +with open('config.json', 'r') as in_stream: + configuration = json.load(in_stream) + +def date(x: str) -> datetime: + """ + Check the passed date is well-formatted + :param x: a datetime + :return: datetime(x); raise an ArgumentTypeError otherwise + """ + try: + # String to datetime + x = datetime.strptime(x, '%Y-%m-%d') + except Exception: + raise argparse.ArgumentTypeError('Date format must be: YYYY-MM-DD') + + return x + + +def unsigned_int(x: str) -> int: + """ + Check the number is greater than or equal to zero + :param x: a number + :return: int(x); raise an ArgumentTypeError otherwise + """ + x = int(x) + if x < 0: + raise argparse.ArgumentTypeError('Minimum bound is 0') + return x + + +def valid_path(x: str) -> str: + """ + Check the path exists + :param x: a path + :return: the path if exists; raise an ArgumentTypeError otherwise + """ + if not os.path.isdir(x): + raise argparse.ArgumentTypeError('Insert a valid path') + + return x + + +def get_parser(): + description = 'A Python library to collect repositories metadata from GitHub.' 
+
+    parser = argparse.ArgumentParser(prog='radon-repositories-collector', description=description)
+    parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + configuration.get('version', '0.0'))
+
+    parser.add_argument(action='store',
+                        dest='dest',
+                        type=valid_path,
+                        help='destination folder for report')
+
+    parser.add_argument('--from',
+                        action='store',
+                        dest='date_from',
+                        type=date,
+                        default=datetime.strptime('2014-01-01', '%Y-%m-%d'),
+                        help='collect repositories created since this date (default: %(default)s)')
+
+    parser.add_argument('--to',
+                        action='store',
+                        dest='date_to',
+                        type=date,
+                        default=datetime.strptime('2014-01-01', '%Y-%m-%d'),
+                        help='collect repositories created up to this date (default: %(default)s)')
+
+    parser.add_argument('--pushed-after',
+                        action='store',
+                        dest='date_push',
+                        type=date,
+                        default=datetime.strptime('2019-01-01', '%Y-%m-%d'),
+                        help='collect only repositories pushed after this date (default: %(default)s)')
+
+    parser.add_argument('--min-issues',
+                        action='store',
+                        dest='min_issues',
+                        type=unsigned_int,
+                        default=0,
+                        help='collect repositories with at least <min-issues> issues (default: %(default)s)')
+
+    parser.add_argument('--min-releases',
+                        action='store',
+                        dest='min_releases',
+                        type=unsigned_int,
+                        default=0,
+                        help='collect repositories with at least <min-releases> releases (default: %(default)s)')
+
+    parser.add_argument('--min-stars',
+                        action='store',
+                        dest='min_stars',
+                        type=unsigned_int,
+                        default=0,
+                        help='collect repositories with at least <min-stars> stars (default: %(default)s)')
+
+    parser.add_argument('--min-watchers',
+                        action='store',
+                        dest='min_watchers',
+                        type=unsigned_int,
+                        default=0,
+                        help='collect repositories with at least <min-watchers> watchers (default: %(default)s)')
+
+    parser.add_argument('--verbose',
+                        action='store_true',
+                        dest='verbose',
+                        default=False,
+                        help='show log (default: %(default)s)')
+
+    return parser
+
+
+
+def main():
+    args = get_parser().parse_args()
+
+    load_dotenv()
+
+    token = 
os.getenv('GITHUB_ACCESS_TOKEN')
+    if not token:
+        token = getpass('Github access token:')
+
+    github = GithubRepositoriesCollector(
+        access_token=token,
+        date_from=args.date_from,
+        date_to=args.date_to,
+        pushed_after=args.date_push,
+        min_stars=args.min_stars,
+        min_releases=args.min_releases,
+        min_watchers=args.min_watchers,
+        min_issues=args.min_issues
+    )
+
+    repositories = list()
+    for repository in github.collect_repositories():
+
+        if args.verbose:
+            print(f'Collecting {repository["url"]} ... ', end='', flush=True)
+
+        # Save repository to collection
+        repositories.append(copy.deepcopy(repository))
+
+        if args.verbose:
+            print('DONE')
+
+    # Generate html report
+    html = create_report(repositories)
+    filename = os.path.join(args.dest, 'report.html')
+
+    with io.open(filename, "w", encoding="utf-8") as f:
+        f.write(html)
+
+    if args.verbose:
+        print(f'Report created at {filename}')
+
+    exit(0)
diff --git a/collector/github.py b/collector/github.py
new file mode 100644
index 0000000..3323ca9
--- /dev/null
+++ b/collector/github.py
@@ -0,0 +1,180 @@
+"""
+A module to mine Github to extract relevant repositories based on given criteria
+"""
+
+import re
+import requests
+from datetime import datetime
+
+QUERY = """{ search(query: "is:public stars:>=MIN_STARS mirror:false archived:false created:DATE_FROM..DATE_TO
+pushed:>=PUSHED_AFTER", type: REPOSITORY, first: 50 AFTER) { repositoryCount pageInfo { endCursor startCursor
+hasNextPage } edges { node { ... on Repository { id defaultBranchRef { name } owner { login } name url description
+primaryLanguage { name } stargazers { totalCount } watchers { totalCount } releases { totalCount } issues {
+totalCount } createdAt pushedAt updatedAt hasIssuesEnabled isArchived isDisabled isMirror isFork object(expression:
+"master:") { ... 
on Tree { entries { name type } } } } } } } + + rateLimit { + limit + cost + remaining + resetAt + } +} +""" + + +class GithubRepositoriesCollector: + + def __init__(self, + access_token, + date_from: datetime, + date_to: datetime, + pushed_after: datetime, + min_stars: int = 0, + min_releases: int = 0, + min_watchers: int = 0, + min_issues: int = 0 + ): + """ + Crawl GitHub to extract repositories + + :param access_token: the token to query GraphQL (https://help.github.com/en/github/authenticating-to-github/creating-a-personal-access-token-for-the-command-line) + :param date_from: search for repositories created from date_from + :param date_to: search for repositories created up to date_to + :param pushed_after: datetime to filter out repositories. Repositories older than pushed_after are ignored. + :param min_stars: the minimum number of stars the repositories must have + :param min_releases: the minimum number of releases the repositories must have + :param min_watchers: the minimum number of watchers the repositories must have + :param min_issues: the minimum number of issues the repositories must have + """ + + self._token = access_token + self._quota = 0 + self._quota_reset_at = None + + self.date_from = date_from.strftime('%Y-%m-%dT%H:%M:%SZ') + self.date_to = date_to.strftime('%Y-%m-%dT%H:%M:%SZ') + self.pushed_after = pushed_after.strftime('%Y-%m-%dT%H:%M:%SZ') + self.min_stars = min_stars + self.min_releases = min_releases + self.min_watchers = min_watchers + self.min_issues = min_issues + + self.query = re.sub('MIN_STARS', str(self.min_stars), QUERY) + self.query = re.sub('DATE_FROM', str(self.date_from), self.query) + self.query = re.sub('DATE_TO', str(self.date_to), self.query) + self.query = re.sub('PUSHED_AFTER', self.pushed_after, self.query) + + @property + def quota(self): + return self._quota + + @property + def quota_reset_at(self): + return self._quota_reset_at + + def run_query(self, query): + """ + Run a graphql query + """ + request = 
requests.post('https://api.github.com/graphql', json={'query': query}, + headers={'Authorization': f'token {self._token}'}) + + if request.status_code == 200: + return request.json() + else: + print("Query failed to run by returning code of {}. {}".format(request.status_code, query)) + return None + + def filter_repositories(self, edges): + + for node in edges: + + node = node.get('node') + + if not node: + continue + + has_issues_enabled = node.get('hasIssuesEnabled', True) + issues = node['issues']['totalCount'] if node['issues'] else 0 + releases = node['releases']['totalCount'] if node['releases'] else 0 + stars = node['stargazers']['totalCount'] if node['stargazers'] else 0 + watchers = node['watchers']['totalCount'] if node['watchers'] else 0 + is_disabled = node.get('isDisabled', False) + is_fork = node.get('isFork', False) + is_locked = node.get('isLocked', False) + is_template = node.get('isTemplate', False) + primary_language = node['primaryLanguage']['name'].lower() if node['primaryLanguage'] else '' + + if self.min_issues and not has_issues_enabled: + continue + + if issues < self.min_issues: + continue + + if releases < self.min_releases: + continue + + if watchers < self.min_watchers: + continue + + if is_disabled or is_locked or is_template: + continue + + if is_fork: + continue + + object = node.get('object') + if not object: + continue + + dirs = [entry.get('name') for entry in object.get('entries', []) if entry.get('type') == 'tree'] + + yield dict( + id=node.get('id'), + default_branch=node.get('defaultBranchRef', {}).get('name'), + owner=node.get('owner', {}).get('login', ''), + name=node.get('name', ''), + url=node.get('url'), + description=node['description'] if node['description'] else '', + issues=issues, + releases=releases, + stars=stars, + watchers=watchers, + primary_language=primary_language, + created_at=str(node.get('createdAt')), + pushed_at=str(node.get('pushedAt')), + dirs=dirs + ) + + def collect_repositories(self): + + 
has_next_page = True + end_cursor = None + + while has_next_page: + + tmp_query = re.sub('AFTER', '', self.query) if not end_cursor else re.sub('AFTER', + f', after: "{end_cursor}"', + self.query) + result = self.run_query(tmp_query) + + if not result: + break + + if not result.get('data'): + break + + if not result['data'].get('search'): + break + + self._quota = int(result['data']['rateLimit']['remaining']) + self._quota_reset_at = result['data']['rateLimit']['resetAt'] + + has_next_page = bool(result['data']['search']['pageInfo'].get('hasNextPage')) + end_cursor = str(result['data']['search']['pageInfo'].get('endCursor')) + + edges = result['data']['search'].get('edges', []) + + for repo in self.filter_repositories(edges): + yield repo diff --git a/collector/report.py b/collector/report.py new file mode 100644 index 0000000..751f886 --- /dev/null +++ b/collector/report.py @@ -0,0 +1,141 @@ +import statistics +import datetime + + +def create_report(repositories: list) -> str: + """ + + :param repositories: a list of dictionaries containing repositories metadata + :return: the generated HTML report + """ + + """ + Generate an HTML report for the crawled repositories + """ + now = datetime.datetime.now() + generation_date = datetime.date(now.year, now.month, now.day) + avg_repos = len(repositories) + avg_issues = int(statistics.mean([d['issues'] for d in repositories])) + avg_releases = int(statistics.mean([d['releases'] for d in repositories])) + avg_stars = int(statistics.mean([d['stars'] for d in repositories])) + avg_watchers = int(statistics.mean([d['watchers'] for d in repositories])) + + accordion = '' + for item in repositories: + accordion += '{0}\n'.format(__generate_card(item)) + + return """ + + +
+ + +
+ +
+
+
+

Radon Repositories Collector Report

+

This report was generated on: {0}

+
+
+ +
+
+
+
+ +

{1}

+

Repositories

+
+
+
+
+
+ +

{2}

+

Avg issues

+
+
+
+
+
+ +

{3}

+

Avg releases

+
+
+
+
+
+ +

{4}

+

Avg stars

+
+
+
+
+
+ +

{5}

+

Avg watchers

+
+
+
+
+
+ {6} +
+ + + + + + """.format( + generation_date, + avg_repos, + avg_issues, + avg_releases, + avg_stars, + avg_watchers, + accordion) + + +def __generate_card(metadata: dict) -> str: + + return """ +
+
+
+ +
+
+
+
+

{4}

+ Created at: {5} + Pushed at: {6}
+ Default branch: {7} + Issues: {8} + Releases: {9} + Stars: {10} + Watchers: {11} + Language: {12} +
+
+
+ """.format(metadata.get('id').replace('=', '_'), + metadata.get('owner'), + metadata.get('name'), + metadata.get('url'), + metadata.get('description'), + metadata.get('created_at'), + metadata.get('pushed_at'), + metadata.get('default_branch'), + metadata.get('issues'), + metadata.get('releases'), + metadata.get('stars'), + metadata.get('watchers'), + metadata.get('primary_language')) diff --git a/config.json b/config.json new file mode 100644 index 0000000..08a1c63 --- /dev/null +++ b/config.json @@ -0,0 +1 @@ +{"version": "0.0.1"} \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..61a1ac3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests~=2.24.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..224a779 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9c41e54 --- /dev/null +++ b/setup.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python + +import json +from setuptools import setup, find_packages + +with open("config.json", "r") as fh: + config = json.load(fh) + +with open("requirements.txt", "r") as reqs_file: + requirements = reqs_file.read().splitlines() + +with open("README.md", "r") as fh: + long_description = fh.read() + +VERSION = config.get("version", "0.0") + +setup(name='radon-repositories_collector', + version=VERSION, + description='A tool to query GraphQL for collecting repositories metadata.', + long_description=long_description, + long_description_content_type="text/markdown", + author='Stefano Dalla Palma', + maintainer='Stefano Dalla Palma', + author_email='stefano.dallapalma0@gmail.com', + url='https://github.com/radon-h2020/radon-repositories-collector', + download_url=f'https://github.com/radon-h2020/radon-repositories-collector/archive/{VERSION}.tar.gz', + packages=find_packages(exclude=('tests',)), 
+      entry_points={
+          'console_scripts': ['radon-repositories-collector=collector.command_line:main'],
+      },
+      classifiers=[
+          "Development Status :: 3 - Alpha",
+          "Intended Audience :: Developers",
+          "Programming Language :: Python :: 3.7",
+          "License :: OSI Approved :: Apache Software License",
+          "Topic :: Software Development :: Libraries :: Python Modules",
+          "Operating System :: OS Independent"
+      ],
+      install_requires=requirements
+)