Skip to content
This repository has been archived by the owner on Feb 8, 2018. It is now read-only.

Commit

Permalink
Merge pull request #4153 from gratipay/chomp
Browse files Browse the repository at this point in the history
Load up npm
  • Loading branch information
chadwhitacre authored Oct 26, 2016
2 parents 621cd9e + cc8ea2c commit c6a3bab
Show file tree
Hide file tree
Showing 7 changed files with 261 additions and 3 deletions.
13 changes: 10 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,28 @@ branches:
- master
before_install:
- git branch -vv | grep '^*'
- pwd

# Sometimes ya just halfta ...
- test -d yajl || git clone https://github.com/lloyd/yajl.git && cd yajl && git checkout 2.1.0
- test -f Makefile || ./configure && sudo make install && cd ..

- npm install -g marky-markdown
cache:
directories:
- env/bin
- env/lib/python2.7/site-packages
- yajl
install:
- if [ "${TRAVIS_BRANCH}" = "master" -a "${TRAVIS_PULL_REQUEST}" = "false" ]; then rm -rf env; fi
- touch requirements.txt package.json
- make env
- npm install -g marky-markdown
- env/bin/pip install --upgrade ijson==2.3.0
before_script:
- echo "DATABASE_URL=dbname=gratipay" | tee -a tests/local.env local.env
- psql -U postgres -c 'CREATE DATABASE "gratipay";'
- if [ "${TRAVIS_BRANCH}" = "master" -a "${TRAVIS_PULL_REQUEST}" = "false" ]; then rm -rfv tests/py/fixtures; fi
script: make bgrun test doc
script: LD_LIBRARY_PATH=/usr/local/lib make bgrun test doc
notifications:
email: false
irc: false
sudo: false
35 changes: 35 additions & 0 deletions bin/sync-npm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/sh
# This is a script to run under the Heroku Scheduler add-on to periodically
# sync our database with the npm registry.
#
# Steps: fetch and checksum a cmake binary, build yajl 2.1.0 from source,
# install the Python pieces, then stream the registry dump through
# `sync-npm serialize` into `sync-npm upsert`.

# Abort on the first failing command.
set -e

# Run from the repository root (the parent of this script's directory).
# Quoted so that a path containing spaces does not get word-split.
cd "$(dirname "$0")/.."

# Install dependencies.
# =====================

# cmake - put on PATH before the yajl build below (presumably that build is
# what needs it; confirm). --fail makes curl exit non-zero on an HTTP error
# instead of saving the error page; the checksum guards the download either way.
curl --fail https://cmake.org/files/v3.6/cmake-3.6.2-Linux-x86_64.tar.gz > cmake.tgz
echo '5df4b69d9e85093ae78b1070d5cb9f824ce0bdd02528948c3f6a740e240083e5 cmake.tgz' \
| sha256sum -c /dev/stdin --status
tar zxf cmake.tgz
PATH=/app/cmake-3.6.2-Linux-x86_64/bin:$PATH

# yajl - pinned to the 2.1.0 tag and installed under the given prefix.
git clone https://github.com/lloyd/yajl.git
cd yajl
git checkout 2.1.0
./configure -p /app/.heroku/python
make install
cd ..

# python
pip install ijson==2.3.0
pip install -e .


# Sync with npm.
# ==============

URL=https://registry.npmjs.com/-/all
# NOTE: in plain sh, `set -e` only sees the exit status of the LAST stage of a
# pipeline, so a failure in curl or `serialize` is not fatal by itself.
curl --fail "$URL" | sync-npm serialize /dev/stdin | sync-npm upsert /dev/stdin
Empty file.
137 changes: 137 additions & 0 deletions gratipay/package_managers/sync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Sync our database with package managers. Just npm for now.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import csv
import sys
import time

import ijson.backends.yajl2_cffi as ijson


def log(*a):
    """Print the given values to stderr, formatted the way ``print`` does."""
    print(*a, file=sys.stderr)


def arrayize(seq):
    """Given a sequence of byte strings, return a Postgres array literal.

    Each item is wrapped in double quotes, with backslashes and double quotes
    inside it backslash-escaped; items are joined with ``, `` and enclosed in
    braces. An empty sequence yields ``{}``.

    :param seq: iterable of byte strings (``str`` on Python 2).
    :returns: the array literal, as a byte string.
    :raises TypeError: if an item is not a byte string.
    """
    def quote(item):
        # An explicit check rather than `assert`, so it survives `python -O`.
        # `bytes` is an alias of `str` on Python 2, so this accepts exactly
        # what the byte-literal replacements below can operate on.
        if not isinstance(item, bytes):
            raise TypeError('arrayize expects byte strings, got %r' % (item,))
        # Escape backslashes first, so the backslashes added while escaping
        # quotes are not themselves doubled.
        escaped = item.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
        return b'"' + escaped + b'"'

    return b'{' + b', '.join([quote(item) for item in seq]) + b'}'


def serialize_one(out, package):
    """Write one package to ``out`` as a row suitable for COPY.

    A falsy package, or one whose name starts with an underscore, is logged
    and skipped.

    :param out: an object with a ``writerow`` method (e.g. a ``csv.writer``).
    :param package: a dict with ``package_manager``, ``name``,
        ``description`` and ``emails`` keys, or a falsy value.
    :returns: 1 if a row was written, 0 if the package was skipped.
    """
    should_skip = not package or package['name'].startswith('_')
    if should_skip:
        log('skipping', package)
        return 0

    # Column order must match the column list used when the rows are loaded.
    out.writerow((
        package['package_manager'],
        package['name'],
        package['description'],
        arrayize(package['emails']),
    ))
    return 1


def serialize(args):
    """Stream the JSON document at ``args.path`` to stdout as CSV rows.

    The input is parsed incrementally with ijson. Every top-level map key
    starts a new package (named by that key, with ``package_manager`` fixed
    to ``npm``); the package collected so far is flushed through
    ``serialize_one`` first. String values found under ``<name>.description``,
    ``<name>.author.email`` and ``<name>.maintainers.item.email`` fill in the
    description and the list of emails. Stats are logged every 1000 written
    packages and once more at the end.

    :param args: parsed arguments; only ``args.path`` is read.
    :returns: None. Rows are written to ``sys.stdout``.
    """
    start = time.time()
    package = None
    nprocessed = 0
    out = csv.writer(sys.stdout)

    def log_stats():
        log("processed {} packages in {:3.0f} seconds"
            .format(nprocessed, time.time() - start))

    def key(k):
        # Looks up `package` at call time, so it always refers to the package
        # currently being collected.
        return package['name'] + b'.' + k

    # Use a context manager so the input handle is closed even if parsing fails.
    with open(args.path) as fp:
        for prefix, event, value in ijson.parse(fp):

            if not prefix and event == b'map_key':

                # Flush the current package. We count on the first package being garbage.
                processed = serialize_one(out, package)
                nprocessed += processed
                if processed and not(nprocessed % 1000):
                    log_stats()

                # Start a new package.
                package = { 'package_manager': b'npm'
                          , 'name': value
                          , 'description': b''
                          , 'emails': []
                           }

            if event == b'string':
                assert type(value) is unicode   # Who knew? Seems to decode only for `string`.
                value = value.encode('utf8')
                if prefix == key(b'description'):
                    package['description'] = value
                elif prefix in (key(b'author.email'), key(b'maintainers.item.email')):
                    package['emails'].append(value)

    nprocessed += serialize_one(out, package)  # Don't forget the last one!
    log_stats()


def upsert(args):
    """Load the CSV at ``args.path`` into the ``packages`` table.

    The rows are COPYed into a temporary ``updates`` table; existing
    ``packages`` rows with a matching name are then updated and the remaining
    rows inserted, all inside one ``db.get_cursor()`` block.

    :param args: parsed arguments; only ``args.path`` is read. The file is
        expected to hold CSV with the columns package_manager, name,
        description, emails (the COPY column list below).
    :returns: None.
    """
    from gratipay import wireup
    db = wireup.db(wireup.env())

    # Open the input in a `with` so the handle is closed once the cursor block
    # has finished (or failed), instead of being leaked.
    with open(args.path) as fp, db.get_cursor() as cursor:
        assert cursor.connection.encoding == 'UTF8'

        # http://tapoueh.org/blog/2013/03/15-batch-update.html
        cursor.run("CREATE TEMP TABLE updates (LIKE packages INCLUDING ALL) ON COMMIT DROP")
        cursor.copy_expert('COPY updates (package_manager, name, description, emails) '
                           'FROM STDIN WITH (FORMAT csv)', fp)

        # NOTE(review): rows are matched on `name` alone, whereas the table's
        # unique key is (package_manager, name). Equivalent while npm is the
        # only package manager; revisit before adding a second one.
        cursor.run("""
            WITH updated AS (
                UPDATE packages p
                   SET package_manager = u.package_manager
                     , description = u.description
                     , emails = u.emails
                  FROM updates u
                 WHERE p.name = u.name
             RETURNING p.name
            )
            INSERT INTO packages(package_manager, name, description, emails)
                 SELECT package_manager, name, description, emails
                   FROM updates u LEFT JOIN updated USING(name)
                  WHERE updated.name IS NULL
               GROUP BY u.package_manager, u.name, u.description, u.emails
        """)


def parse_args(argv):
    """Parse the sync-npm command line.

    :param argv: the argument list, without the program name.
    :returns: an ``argparse.Namespace`` with ``command`` (``serialize`` or
        ``upsert``), ``path`` and ``if_modified_since`` (int, default -1).
    """
    since_help = ('a number of minutes in the past, past which we would like to see new '
                  'updates (only meaningful for `serialize`; -1 means all!)')

    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=['serialize', 'upsert'])
    parser.add_argument('path', help="the path to the input file")
    parser.add_argument('-i', '--if_modified_since', type=int, default=-1, help=since_help)
    return parser.parse_args(argv)


def main(argv=sys.argv):
    """Console entry point: parse ``argv`` and run the selected command.

    :param argv: full argument vector, program name first.
    """
    args = parse_args(argv[1:])
    # `command` is limited by parse_args to the names of the module-level
    # functions `serialize` and `upsert`, so this lookup is a plain dispatch.
    command = globals()[args.command]
    command(args)
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
, entry_points = { 'console_scripts'
: [ 'payday=gratipay.cli:payday'
, 'fake_data=gratipay.utils.fake_data:main'
, 'sync-npm=gratipay.package_managers.sync:main'
]
}
)
15 changes: 15 additions & 0 deletions sql/branch.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Adds the packages table. The whole change runs in one transaction.
BEGIN;

-- One row per package; a package is unique per (package_manager, name).
-- The readme* columns default to the empty string; the COPY in
-- gratipay/package_managers/sync.py fills only package_manager, name,
-- description and emails.
CREATE TABLE packages
( id bigserial PRIMARY KEY
, package_manager text NOT NULL
, name text NOT NULL
, description text NOT NULL
, readme text NOT NULL DEFAULT ''
, readme_raw text NOT NULL DEFAULT ''
, readme_type text NOT NULL DEFAULT ''
, emails text[] NOT NULL
, UNIQUE (package_manager, name)
);

END;
63 changes: 63 additions & 0 deletions tests/py/test_npm_sync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
"""Tests for syncing npm. Requires a `pip install ijson`, which requires yajl.
"""
from __future__ import absolute_import, division, print_function, unicode_literals

from subprocess import Popen, PIPE

from gratipay.testing import Harness


def load(raw):
    """Pipe ``raw`` through ``sync-npm serialize`` and then ``sync-npm upsert``.

    Each step is run as a subprocess reading ``/dev/stdin``; the output of
    the first step is fed as input to the second. Exit codes are not checked.
    """
    def run(command, data):
        proc = Popen(('env/bin/sync-npm', command, '/dev/stdin'), stdin=PIPE, stdout=PIPE)
        return proc.communicate(data)[0]

    serialized = run('serialize', raw)
    run('upsert', serialized)


class Tests(Harness):
    """Tests that run the sync-npm executables via ``load`` and inspect the
    resulting rows in the ``packages`` table.
    """

    def test_packages_starts_empty(self):
        """The packages table has no rows before anything is loaded."""
        assert self.db.all('select * from packages') == []

    # sn - sync-npm (prefix used for the test names below)

    def test_sn_inserts_packages(self):
        """A single registry entry is loaded as one packages row."""
        # The "_updated" key is expected to be skipped: serialize_one drops
        # packages whose name starts with an underscore.
        load(br'''
{ "_updated": 1234567890
, "testing-package":
{ "name":"testing-package"
, "description":"A package for testing"
, "maintainers":[{"email":"[email protected]"}]
, "author": {"email":"[email protected]"}
, "time":{"modified":"2015-09-12T03:03:03.135Z"}
}
}
''')

        package = self.db.one('select * from packages')
        assert package.package_manager == 'npm'
        assert package.name == 'testing-package'
        assert package.description == 'A package for testing'
        # NOTE(review): this repeats the name assertion above; an assertion on
        # package.emails was probably intended here - confirm.
        assert package.name == 'testing-package'


    def test_sn_handles_quoting(self):
        """Double quotes and backslashes in names, descriptions and emails
        come back from the database unchanged.
        """
        # The fixture is a raw byte string, so the backslashes below reach the
        # JSON parser exactly as written.
        load(br'''
{ "_updated": 1234567890
, "testi\\\"ng-pa\\\"ckage":
{ "name":"testi\\\"ng-pa\\\"ckage"
, "description":"A package for \"testing\""
, "maintainers":[{"email":"alice@\"example\".com"}]
, "author": {"email":"\\\\\"bob\\\\\"@example.com"}
, "time":{"modified":"2015-09-12T03:03:03.135Z"}
}
}
''')

        package = self.db.one('select * from packages')
        assert package.package_manager == 'npm'
        assert package.name == r'testi\"ng-pa\"ckage'
        assert package.description == 'A package for "testing"'
        assert package.emails == ['alice@"example".com', r'\\"bob\\"@example.com']

0 comments on commit c6a3bab

Please sign in to comment.