-
Notifications
You must be signed in to change notification settings - Fork 1
/
freebase.sh
24 lines (18 loc) · 860 Bytes
/
freebase.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#!/bin/bash
#
# Gets a Freebase RDF dump and attempts to load it with tinygraph.
#
# For strings, only the @en version is loaded (see -lang below).
# Dumps are listed at https://developers.google.com/freebase/data
#
# Expects to be run from a directory whose parent holds the tinygraph binary
# (../tinygraph); config.freebase is passed to tinygraph as a relative path.
#
# Requires: wget, zcat, GNU parallel, xargs.
#
# Files touched in the current directory:
#   test.db, log, processed  - removed at start so each run begins clean
#   par/                     - holds the gzipped chunk files while they wait
#                              to be loaded; each chunk is deleted after its
#                              batch has been handed to tinygraph
#   log                      - combined stdout/stderr of every tinygraph run
#   processed                - one "<date> <comma-separated chunk list>" line
#                              per finished batch

# -u: catch typos in variable names.
# pipefail: report a failing zcat/parallel stage instead of hiding it behind
# the exit status of the last pipeline stage.
set -u -o pipefail

readonly DUMP=freebase-rdf-2014-07-13-00-00.gz
readonly DUMP_URL="http://commondatastorage.googleapis.com/freebase-public/rdf/${DUMP}"
# Number of chunk files handed to a single tinygraph invocation.
readonly BATCH_SIZE=6

# Prints a message to stderr and aborts the script.
die() {
  printf '%s: %s\n' "${0##*/}" "$*" >&2
  exit 1
}

# Check prerequisites BEFORE deleting the results of any earlier run, so a
# missing tool cannot leave us with neither the old database nor a new one.
for tool in zcat parallel xargs; do
  command -v "$tool" >/dev/null || die "required tool not found: ${tool}"
done
[[ -x ../tinygraph ]] || die "../tinygraph is missing or not executable"

# Fetch the dump unless a copy is already present.
# NOTE: an interrupted download can leave a partial file behind, which later
# runs will then reuse; delete it by hand in that case.
if [[ ! -e "$DUMP" ]]; then
  wget -nc "$DUMP_URL" || die "download of ${DUMP_URL} failed"
fi

# Start from a clean slate.
rm -rf test.db log processed
mkdir -p par || die "cannot create directory par"
rm -f par/*.par

# Process it.
#   1. zcat streams the decompressed dump.
#   2. parallel cuts the stream into blocks of about 100M, gzips each block
#      into a file under par/ and prints that file's name.
#   3. xargs collects BATCH_SIZE file names and runs the inline script once
#      per batch (-r: do nothing at all if no file names arrive). The names
#      arrive as positional parameters, so they are never re-parsed as shell
#      code. Inside the script:
#        - "$*" with IFS=, joins the names with commas, the form given to
#          tinygraph's -load option;
#        - tinygraph's output is appended to log and echoed to the terminal;
#        - the batch is recorded in processed and its chunk files removed,
#          whether or not tinygraph succeeded.
zcat "$DUMP" \
  | parallel --jobs 6 --pipe --files --block 100M --tmpdir par gzip \
  | xargs -r -n "$BATCH_SIZE" bash -c '
      batch=$(IFS=,; printf "%s" "$*")
      ../tinygraph -gzip -config config.freebase -lang en -silent-ignore \
        -load "$batch" 2>&1 | tee -a log
      printf "%s %s\n" "$(date)" "$batch" >> processed
      rm -- "$@"
    ' load_batch

# To watch progress from another terminal:
#   watch 'ls -lt par/*.par'
#   (cd test.db && watch ls -l)