-
Notifications
You must be signed in to change notification settings - Fork 0
/
importer.coffee
76 lines (67 loc) · 2.28 KB
/
importer.coffee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Arguments: coffee importer.coffee {text-file} {corpus} {sub-corpus}
fs = require "fs"
mongoose = require "mongoose"
colors = require "colors"
async = require "async"
# Command-line arguments (after the interpreter and script path): the text file
# to import, the corpus (selects the "stm_{corpus}" database) and the sub-corpus
# (names the "SubCorpus_{sub-corpus}" record model).
[textFile, corpus, subCorpus] = process.argv[2..]

# Refuse to run with missing arguments; otherwise the database and model names
# would silently be built from the string "undefined".
unless textFile? and corpus? and subCorpus?
  console.error "Usage: coffee importer.coffee {text-file} {corpus} {sub-corpus}"
  process.exit 1

mongoose.connect "/tmp/mongodb-27017.sock/stm_#{corpus}"
# Schema for a topic: a numeric id and a name.
topicSchema = new mongoose.Schema {
  id: {type: Number}
  name: String
}

# Model registered under the name "Topic"; Record documents refer to it.
Topic = mongoose.model("Topic", topicSchema)
# Schema for one (article, topic, proportion) tuple of the imported file.
recordSchema = new mongoose.Schema {
  article_id: String
  topic: {type: mongoose.Schema.ObjectId, ref: "Topic"}
  proportion: Number
}

# The model name depends on the sub-corpus given on the command line.
Record = mongoose.model("SubCorpus_#{subCorpus}", recordSchema)
# Cache of Topic documents already fetched or created, indexed by numeric topic id.
topics = []

# Fetches the Topic document with the given numeric id, creating it (named
# "Topic {id}") when it does not exist yet.
#
# id       - numeric topic id; also the index used in the `topics` cache.
# callback - called as callback(err, topic).
getOrInsertTopic = (id, callback) ->
  return callback null, topics[id] if topics[id]?
  query = {id: id}
  update = {$setOnInsert: {id: id, name: "Topic #{id}"}}
  options = {new: true, upsert: true}
  Topic.findOneAndUpdate query, update, options, (err, topic) ->
    return callback err if err?
    # Remember the document so later tuples for the same topic skip the database.
    topics[id] = topic
    callback null, topic
# Parses one line of the input file and stores its topic proportions.
#
# Lines starting with "#" and blank lines are skipped. Any other line is split
# on whitespace into: {ignored field} {article path} {topic id} {proportion} ...
# and one Record is created for every complete (topic id, proportion) pair.
#
# line     - one line of text.
# callback - called as callback(err, count), where count is the number of
#            records stored for this line (0 when skipped or on error).
processLine = (line, callback) ->
  return callback null, 0 if line[0] is "#"
  fields = line.split(/\s+/).filter (x) -> x isnt ""
  return callback null, 0 if fields.length is 0

  [_, article_id, tuples...] = fields
  try
    # Keep only the last "/"-separated segment of the article path.
    article_id = article_id.split("/")[-1..][0]
  catch ex
    console.error "Error: #{ex} in [#{fields}]"
    process.exit 1
  # console.log "- decomposing article:".yellow, article_id

  # Offsets of the topic id of each complete pair. A trailing topic id without
  # a proportion is ignored instead of being stored with a NaN proportion.
  offsets = (i for i in [0...tuples.length - 1] by 2)

  # Resolves the topic of one pair and builds the record to store for it.
  buildDoc = (i, done) ->
    getOrInsertTopic Number(tuples[i]), (err, topic) ->
      if err?
        console.error "- error:".redBG, err
        return done err
      done null, {
        article_id: article_id
        topic: topic._id
        proportion: Number(tuples[i + 1])
      }

  # Stores the records once every topic has been resolved.
  storeDocs = (err, docs) ->
    # A failed topic lookup leaves `docs` incomplete; do not write it.
    return callback err, 0 if err?
    return callback null, 0 if docs.length is 0
    Record.create docs, (createErr) ->
      # Report write failures instead of counting the records as stored.
      return callback createErr, 0 if createErr?
      callback null, docs.length

  async.map offsets, buildDoc, storeDocs
# Input stream over the text file, opened with the "utf8" encoding option.
fin = fs.createReadStream(textFile, {encoding: "utf8"})

# Lines read from the input that have not been handed to processLine yet.
fstr = []
# Splits every chunk of input into lines and, once more than 100 lines are
# buffered, pauses the stream and processes all of them except the last one.
fin.on "data", (chunk) ->
  pieces = chunk.split /[\r\n]+/
  if fstr.length > 0
    # The first piece continues the line left unfinished by the previous chunk.
    fstr[fstr.length - 1] += pieces[0]
    pieces = pieces[1..]
  fstr = fstr.concat pieces
  return unless fstr.length > 100

  # Stop reading while this batch is being stored.
  fin.pause()
  batch = fstr[...-1]
  # The last entry may be an unfinished line; keep it for the next chunk.
  fstr = fstr[-1..]

  # Sums the per-line counts, treating a missing count as 0.
  sum = (total, x) -> total + (x or 0)

  async.map batch, processLine, (err, counts) ->
    count = (counts or []).reduce sum, 0
    if err?
      console.error "- error processing #{count} tuples".redBG, err
      # The stream is still paused and nothing would resume it, so stop here
      # with a failure status, as the "end" handler does.
      return process.exit 1
    console.log "- processed #{count} tuples".green
    fin.resume()
# Processes whatever lines are still buffered when the input ends, then exits:
# with status 1 when processing failed, with the default status otherwise.
fin.on "end", ->
  # Sums the per-line counts, treating a missing count as 0.
  sum = (total, x) -> total + (x or 0)

  async.map fstr, processLine, (err, counts) ->
    # The initial value keeps reduce from throwing when nothing is buffered,
    # for example when the input file is empty.
    count = (counts or []).reduce sum, 0
    if err?
      console.error "- error processing #{count} tuples".redBG, err
      return process.exit 1
    console.log "- processed #{count} tuples".green
    console.log "- done".green
    process.exit()