-
Notifications
You must be signed in to change notification settings - Fork 1
/
omega-writer.js
88 lines (79 loc) · 2.47 KB
/
omega-writer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
(function () {
// Node core file-system module; used below for synchronous appends to the CSV files.
var fs = require('fs');
// Earlier approach, kept for reference: write rows through fs write streams.
// var threadsFile = fs.createWriteStream('threads.csv');
// var commentsFile = fs.createWriteStream('comments.csv');
// Output file paths handed to fs.appendFileSync.
var threadsFile = 'threads.csv';
var commentsFile = 'comments.csv';
// In-memory buffers of stringified CSV rows that have not been appended to disk yet.
var threadsRows = [];
var commentsRows = [];
// Third-party CSV serializer; called as a factory that returns a stream-like stringifier.
var stringify = require('csv-stringify');
// Thread entry fields passed to csv-stringify as its `columns` option
// (presumably selecting and ordering the CSV columns — confirm against the csv-stringify docs).
var columnsThreads = ['text', 'title', 'url', 'id', 'subreddit', 'meta', 'time', 'author', 'ups', 'downs', 'authorlinkkarma', 'authorcommentkarma', 'authorisgold'];
// Stringifier that thread entries are written to; its output is drained by a 'readable' listener.
var stringifierThreads = stringify({
columns: columnsThreads
});
/**
 * Appends every buffered thread row to the threads CSV file in a single
 * synchronous write and empties the buffer. Does nothing when the buffer
 * is empty, so it is safe to call at any time.
 */
function flushThreadsRows() {
  if (threadsRows.length === 0) {
    return;
  }
  fs.appendFileSync(threadsFile, threadsRows.join(''));
  threadsRows = [];
}
// Drain every row the stringifier has ready. Each row is buffered first and the
// buffer is flushed once it holds 1000 rows, so the row that hits the threshold
// is written out instead of being discarded.
stringifierThreads.on('readable', function () {
  var row;
  // read() returns null once the stringifier's internal buffer is drained.
  while ((row = stringifierThreads.read()) !== null) {
    threadsRows.push(row);
    if (threadsRows.length >= 1000) {
      flushThreadsRows();
    }
  }
});
// Rows below the threshold would otherwise never reach disk. The append is
// synchronous, which is the only kind of work an 'exit' listener may do.
process.on('exit', flushThreadsRows);
// Comment entry fields passed to csv-stringify as its `columns` option
// (presumably selecting and ordering the CSV columns — confirm against the csv-stringify docs).
// Same list as the thread columns minus 'title' and 'url'.
var columnsComments = ['text', 'id', 'subreddit', 'meta', 'time', 'author', 'ups', 'downs', 'authorlinkkarma', 'authorcommentkarma', 'authorisgold'];
// Stringifier that comment entries are written to; its output is drained by a 'readable' listener.
var stringifierComments = stringify({
columns: columnsComments
});
/**
 * Appends every buffered comment row to the comments CSV file in a single
 * synchronous write and empties the buffer. Does nothing when the buffer
 * is empty, so it is safe to call at any time.
 */
function flushCommentsRows() {
  if (commentsRows.length === 0) {
    return;
  }
  fs.appendFileSync(commentsFile, commentsRows.join(''));
  commentsRows = [];
}
// Drain every row the stringifier has ready. Each row is buffered first and the
// buffer is flushed once it holds 1000 rows, so the row that hits the threshold
// is written out instead of being discarded.
stringifierComments.on('readable', function () {
  var row;
  // read() returns null once the stringifier's internal buffer is drained.
  while ((row = stringifierComments.read()) !== null) {
    commentsRows.push(row);
    if (commentsRows.length >= 1000) {
      flushCommentsRows();
    }
  }
});
// Rows below the threshold would otherwise never reach disk. The append is
// synchronous, which is the only kind of work an 'exit' listener may do.
process.on('exit', flushCommentsRows);
// Third-party NLP library; only its Treebank word tokenizer is used here.
var natural = require('natural');
// Shared tokenizer instance used by sanitize().
var tokenizer = new natural.TreebankWordTokenizer();
// Running totals of entries passed to writeThread / writeComment, used for progress logging.
var threadCount = 0;
var commentCount = 0;
/**
 * Normalizes a piece of text for CSV output: lower-cases it, splits it with
 * the shared tokenizer and glues the tokens back together with single spaces.
 *
 * @param {string} text - raw text; must be a string (anything without
 *   `toLowerCase` makes this throw a TypeError).
 * @returns {string} the lower-cased, space-joined token sequence.
 */
function sanitize(text) {
  var tokens = tokenizer.tokenize(text.toLowerCase());
  return tokens.join(' ');
}
module.exports = {
writeThread: function (entry) {
if (entry.text === null) {
entry.text = '';
}
if (entry.title === null) {
entry.title = '';
}
entry.text = sanitize(entry.text);
entry.title = sanitize(entry.title);
stringifierThreads.write(entry);
threadCount++;
if (threadCount % 1000 === 0) {
console.log('\t\t%d threads', threadCount);
}
},
writeComment: function (entry) {
if (entry.text === undefined) {
entry.text = '';
}
entry.text = sanitize(entry.text);
stringifierComments.write(entry);
commentCount++;
if (commentCount % 1000 === 0) {
console.log('\t\t%d comments', commentCount);
}
}
};
})();