This repository has been archived by the owner on Feb 8, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 308
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4153 from gratipay/chomp
Load up npm
- Loading branch information
Showing
7 changed files
with
261 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/bin/sh
# This is a script to run under the Heroku Scheduler add-on to periodically
# sync our database with the npm registry.

# Abort on the first failing command. Note that for a pipeline only the exit
# status of the last command is checked.
set -e
# Run from the directory above the one holding this script. Quote "$0" so a
# path containing spaces does not get word-split.
cd "$(dirname "$0")/.."


# Install dependencies.
# =====================

# cmake - required by ...
# NOTE(review): presumably needed to build yajl below -- confirm.
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page as if it were the tarball.
curl --fail https://cmake.org/files/v3.6/cmake-3.6.2-Linux-x86_64.tar.gz > cmake.tgz
# Verify the download against a pinned checksum before unpacking it.
echo '5df4b69d9e85093ae78b1070d5cb9f824ce0bdd02528948c3f6a740e240083e5  cmake.tgz' \
    | sha256sum -c /dev/stdin --status
tar zxf cmake.tgz
PATH=/app/cmake-3.6.2-Linux-x86_64/bin:$PATH

# yajl
git clone https://github.com/lloyd/yajl.git
cd yajl
git checkout 2.1.0
./configure -p /app/.heroku/python
make install
cd ..

# python
pip install ijson==2.3.0
pip install -e .


# Sync with npm.
# ==============

URL=https://registry.npmjs.com/-/all
# Stream the registry dump straight through serialize and into upsert; --fail
# keeps an HTTP error body from being fed to the parser as registry data.
curl --fail "$URL" | sync-npm serialize /dev/stdin | sync-npm upsert /dev/stdin
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
"""Sync our database with package managers. Just npm for now. | ||
""" | ||
from __future__ import absolute_import, division, print_function, unicode_literals | ||
|
||
import argparse | ||
import csv | ||
import sys | ||
import time | ||
|
||
import ijson.backends.yajl2_cffi as ijson | ||
|
||
|
||
def log(*a):
    """Print the given values to stderr, space-separated, like ``print``."""
    print(*a, file=sys.stderr)
|
||
|
||
def arrayize(seq):
    """Given a sequence of byte strings, return a Postgres array literal.

    Every item is wrapped in double quotes, with backslashes and double quotes
    inside it backslash-escaped; the quoted items are joined with ``, `` and
    enclosed in curly braces. The result is a byte string (``str`` on
    Python 2).
    """
    array = []
    for item in seq:
        # Check for `bytes`, not `str`: all the literals below are bytes, and
        # `bytes` is an alias of `str` on Python 2, so this is the same check
        # there while also holding on Python 3.
        assert isinstance(item, bytes)
        # Escape backslashes first so the backslashes added for the quotes
        # are not themselves doubled.
        escaped = item.replace(b'\\', b'\\\\').replace(b'"', b'\\"')
        array.append(b'"' + escaped + b'"')
    return b'{' + b', '.join(array) + b'}'
|
||
|
||
def serialize_one(out, package):
    """Write one package to ``out`` as a row suitable for COPY.

    ``out`` is a writer with a ``writerow`` method and ``package`` is a dict
    with ``package_manager``, ``name``, ``description`` and ``emails`` keys.
    Returns 1 when a row was written. A falsy package, or one whose name
    starts with an underscore, is logged and skipped, returning 0.
    """
    should_skip = not package or package['name'].startswith('_')
    if should_skip:
        log('skipping', package)
        return 0

    scalar_fields = ('package_manager', 'name', 'description')
    scalars = tuple(package[field] for field in scalar_fields)
    # The emails go out as a single Postgres array literal column.
    out.writerow(scalars + (arrayize(package['emails']),))
    return 1
|
||
|
||
def serialize(args):
    """Stream the JSON document at ``args.path`` to stdout as CSV rows.

    The input is parsed incrementally with ijson. Each key of the top-level
    object starts a new package; that package's description and its author and
    maintainer emails are collected from the nested string values, and each
    finished package is handed to ``serialize_one``. Progress is logged to
    stderr every 1000 written packages and once more at the end.
    """
    path = args.path
    package = None
    nprocessed = 0
    out = csv.writer(sys.stdout)

    def log_stats():
        log("processed {} packages in {:3.0f} seconds"
            .format(nprocessed, time.time() - start))

    def key(k):
        # The ijson prefix of field `k` inside the package being collected.
        return package['name'] + b'.' + k

    # Use a context manager so the input file is closed once parsing is done
    # (or fails) instead of being left open.
    with open(path) as fp:
        parser = ijson.parse(fp)
        start = time.time()

        for prefix, event, value in parser:

            if not prefix and event == b'map_key':

                # Flush the current package. We count on the first package being garbage.
                processed = serialize_one(out, package)
                nprocessed += processed
                if processed and not(nprocessed % 1000):
                    log_stats()

                # Start a new package.
                package = { 'package_manager': b'npm'
                          , 'name': value
                          , 'description': b''
                          , 'emails': []
                           }

            if event == b'string':
                assert type(value) is unicode  # Who knew? Seems to decode only for `string`.
                value = value.encode('utf8')
                if prefix == key(b'description'):
                    package['description'] = value
                elif prefix in (key(b'author.email'), key(b'maintainers.item.email')):
                    package['emails'].append(value)

    nprocessed += serialize_one(out, package)  # Don't forget the last one!
    log_stats()
|
||
|
||
def upsert(args):
    """Load the CSV file at ``args.path`` into the ``packages`` table.

    The rows are COPYed into a temporary ``updates`` table, then applied in a
    single statement: packages whose name already exists are updated, and the
    remaining rows are inserted.
    """
    from gratipay import wireup
    db = wireup.db(wireup.env())
    # Use a context manager so the input file is closed even if the COPY or
    # the upsert statement fails.
    with open(args.path) as fp:
        with db.get_cursor() as cursor:
            assert cursor.connection.encoding == 'UTF8'

            # http://tapoueh.org/blog/2013/03/15-batch-update.html
            cursor.run("CREATE TEMP TABLE updates (LIKE packages INCLUDING ALL) ON COMMIT DROP")
            cursor.copy_expert('COPY updates (package_manager, name, description, emails) '
                               'FROM STDIN WITH (FORMAT csv)', fp)
            # NOTE(review): rows are matched on `name` alone, while `packages`
            # is unique on (package_manager, name) -- confirm this is intended
            # before adding a second package manager.
            cursor.run("""
                WITH updated AS (
                    UPDATE packages p
                       SET package_manager = u.package_manager
                         , description = u.description
                         , emails = u.emails
                      FROM updates u
                     WHERE p.name = u.name
                 RETURNING p.name
                )
                INSERT INTO packages(package_manager, name, description, emails)
                     SELECT package_manager, name, description, emails
                       FROM updates u LEFT JOIN updated USING(name)
                      WHERE updated.name IS NULL
                   GROUP BY u.package_manager, u.name, u.description, u.emails
            """)
|
||
|
||
def parse_args(argv):
    """Parse the command-line arguments in ``argv`` (program name excluded).

    Returns the argparse namespace with ``command``, ``path`` and
    ``if_modified_since`` attributes.
    """
    since_help = ('a number of minutes in the past, past which we would like to see new '
                  'updates (only meaningful for `serialize`; -1 means all!)')

    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=['serialize', 'upsert'])
    parser.add_argument('path', help="the path to the input file")
    parser.add_argument('-i', '--if_modified_since', help=since_help, type=int, default=-1)
    return parser.parse_args(argv)
|
||
|
||
def main(argv=sys.argv):
    """Entry point: parse ``argv`` and call the module-level function named by the command."""
    options = parse_args(argv[1:])
    # The command name doubles as the name of the function that implements it.
    handler = globals()[options.command]
    handler(options)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
BEGIN;

-- One row per package; a package is unique per (package_manager, name).
CREATE TABLE packages (
    id              bigserial   PRIMARY KEY,
    package_manager text        NOT NULL,
    name            text        NOT NULL,
    description     text        NOT NULL,
    readme          text        NOT NULL DEFAULT '',
    readme_raw      text        NOT NULL DEFAULT '',
    readme_type     text        NOT NULL DEFAULT '',
    emails          text[]      NOT NULL,
    UNIQUE (package_manager, name)
);

END;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
"""Tests for syncing npm. Requires a `pip install ijson`, which requires yajl. | ||
""" | ||
from __future__ import absolute_import, division, print_function, unicode_literals | ||
|
||
from subprocess import Popen, PIPE | ||
|
||
from gratipay.testing import Harness | ||
|
||
|
||
def _run_sync_npm(command, data):
    """Run ``env/bin/sync-npm <command> /dev/stdin`` with ``data`` on stdin.

    Returns the process's stdout. Raises ``CalledProcessError`` if the process
    exits with a non-zero status, so a broken step fails loudly instead of
    silently passing bad input along.
    """
    # Imported locally because the module only imports Popen and PIPE.
    from subprocess import CalledProcessError

    argv = ('env/bin/sync-npm', command, '/dev/stdin')
    proc = Popen(argv, stdin=PIPE, stdout=PIPE)
    output = proc.communicate(data)[0]
    if proc.returncode != 0:
        raise CalledProcessError(proc.returncode, argv)
    return output


def load(raw):
    """Pipe raw registry JSON through ``sync-npm serialize`` and then ``sync-npm upsert``."""
    serialized = _run_sync_npm('serialize', raw)
    _run_sync_npm('upsert', serialized)
|
||
|
||
class Tests(Harness):
    """Run registry JSON through the sync-npm pipeline and check the resulting rows."""

    def test_packages_starts_empty(self):
        assert self.db.all('select * from packages') == []


    # sn - sync-npm

    def test_sn_inserts_packages(self):
        load(br'''
        { "_updated": 1234567890
        , "testing-package":
            { "name":"testing-package"
            , "description":"A package for testing"
            , "maintainers":[{"email":"[email protected]"}]
            , "author": {"email":"[email protected]"}
            , "time":{"modified":"2015-09-12T03:03:03.135Z"}
             }
         }
        ''')

        package = self.db.one('select * from packages')
        assert package.package_manager == 'npm'
        assert package.name == 'testing-package'
        assert package.description == 'A package for testing'
        # Emails are collected in document order: maintainers, then author.
        assert package.emails == ['[email protected]', '[email protected]']


    def test_sn_handles_quoting(self):
        load(br'''
        { "_updated": 1234567890
        , "testi\\\"ng-pa\\\"ckage":
            { "name":"testi\\\"ng-pa\\\"ckage"
            , "description":"A package for \"testing\""
            , "maintainers":[{"email":"alice@\"example\".com"}]
            , "author": {"email":"\\\\\"bob\\\\\"@example.com"}
            , "time":{"modified":"2015-09-12T03:03:03.135Z"}
             }
         }
        ''')

        package = self.db.one('select * from packages')
        assert package.package_manager == 'npm'
        assert package.name == r'testi\"ng-pa\"ckage'
        assert package.description == 'A package for "testing"'
        assert package.emails == ['alice@"example".com', r'\\"bob\\"@example.com']