-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathWikipedia.js
139 lines (126 loc) · 4.84 KB
/
Wikipedia.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/**
* Another custom node. Make sure to have a look at the documentation in
* `ExampleNode.js` first, so that you're familiar with the basics.
*/
const _ = require('lodash');
const got = require('got');
const processTemporaryNode = require('@cocoon/util/processTemporaryNode')
.default;
module.exports.Wikipedia = {
  category: 'I/O',
  description: `A custom Cocoon node that retrieves information via the Wikipedia API.`,

  in: {
    data: {
      description: `An array of Ghibli movie data.`,
    },
  },

  out: {
    data: {
      description: `The input data, annotated with Wikipedia information.`,
    },
  },

  /**
   * Reads the `data` port, looks every item up on Wikipedia by its `title`
   * and writes a copy of the data in which each item carries an additional
   * `wikipedia` attribute (page info merged with the info of the best
   * matching image).
   *
   * Items for which Wikipedia reports no page or no images are still
   * annotated, but only with whatever page info was returned; the image
   * lookup is skipped for them instead of failing the entire node.
   *
   * @param context The Cocoon node context. This processor uses
   * `context.ports` and `context.debug`, and passes the context on to the
   * temporary `Distance` node.
   */
  async *process(context) {
    const { data } = context.ports.read();

    /**
     * For performance reasons, the data in Cocoon is passed via reference
     * instead of making deep copies between every node. But we want to avoid
     * changing data in a way that would change it in the cache of previous
     * nodes as well, which could cause subtle issues in your processing
     * pipeline.
     *
     * There are various ways to create deep copies in Cocoon, the easiest of
     * which is annotating the input port with `clone: true`.
     *
     * But in this case it's sufficient to just shallow-copy the data since we
     * will write new objects to it.
     */
    const shallowDataCopy = [...data];

    for (let i = 0; i < data.length; i++) {
      /**
       * Remember that Cocoon processors are generator functions. That means we
       * can yield a progress report at any time, so that we can see it in the
       * editor.
       *
       * But it has another benefit: since Cocoon processing runs on a single
       * thread, to avoid expensive data serialisation, long processing tasks
       * can make the Cocoon process unresponsive. Breaking processing up into
       * smaller chunks also makes the processing interruptible.
       *
       * So yield often! The performance overhead is small, since UI updates are
       * throttled.
       */
      yield `Querying info for ${data[i].title}`;

      // Example query:
      // https://en.wikipedia.org/w/api.php?action=query&titles=My+Neighbor+Totoro&prop=images
      const pageInfo = await queryWikipedia(
        context,
        `titles=${encodeURIComponent(data[i].title)}&prop=images`
      );

      /**
       * The lookup can come back without a page, and a page can come back
       * without an `images` list. There is nothing to match against in
       * either case, so we annotate the item with the page info we have and
       * move on to the next one.
       */
      const images = (pageInfo && pageInfo.images) || [];
      if (images.length === 0) {
        context.debug(`no images found for ${data[i].title}`, pageInfo);
        shallowDataCopy[i] = {
          ...data[i],
          wikipedia: {
            ...pageInfo,
          },
        };
        continue;
      }

      /**
       * It's not trivial to select the poster image. Wikipedia lists the images
       * in order, but the infobox image could be at any position. Our best
       * approach is to pick an image that contains the title.
       *
       * Fortunately for us, Cocoon has a node that can calculate a variety of
       * distance metrics, and even combine them to form complex
       * multi-dimensional distances.
       *
       * Cocoon nodes can use temporary nodes as part of the processing (since
       * nodes are really just functions). That way you can avoid building and
       * repeating overly complex graphs in the Cocoon editor itself.
       *
       * Documentation for the `Distance` node we're using is available at
       * https://cocoon-docs.aen.now.sh/#distance
       */
      const distanceResults = {};
      // The temporary node's own progress values are not forwarded; we only
      // yield so that this processor stays interruptible while it runs.
      for await (const progress of processTemporaryNode(
        context,
        'Distance',
        {
          affluent: images.map(x => ({ title: x.title })),
          attribute: 'related',
          data: [{ title: data[i].title }],
          metrics: {
            title: { type: 'String' },
          },
        },
        distanceResults
      )) {
        yield;
      }
      const bestImage = distanceResults.data[0].related[0].title;
      context.debug(
        `calculated title distances, best match is: ${bestImage}`,
        distanceResults
      );

      // Example query:
      // https://en.wikipedia.org/w/api.php?action=query&titles=File:My%20Neighbor%20Totoro%20-%20Tonari%20no%20Totoro%20(Movie%20Poster).jpg&prop=imageinfo&iiprop=url
      const imageInfo = await queryWikipedia(
        context,
        `titles=${encodeURIComponent(bestImage)}&prop=imageinfo&iiprop=url`
      );

      /**
       * It's usually good practice to construct a new object using the
       * spread-syntax, unless you're dealing with memory constraints or need to
       * optimise for performance. It's the least error prone way of annotating
       * the original data objects. Don't optimise prematurely!
       */
      shallowDataCopy[i] = {
        ...data[i],
        wikipedia: {
          ...pageInfo,
          ...imageInfo,
        },
      };
    }

    context.ports.write({ data: shallowDataCopy });
  },
};
/**
 * Sends a single `action=query` request to the English Wikipedia API and
 * returns the first page contained in the response.
 *
 * @param context Node context; only `context.debug` is used, to log the
 * request URL and the response body.
 * @param {string} query Query-string fragment that is inserted verbatim into
 * the URL (e.g. `titles=My+Neighbor+Totoro&prop=images`). Values must already
 * be URL-encoded by the caller.
 * @returns {Promise<object|null>} The first entry of `query.pages`, or `null`
 * if the response contains no pages.
 */
async function queryWikipedia(context, query) {
  const url = `https://en.wikipedia.org/w/api.php?action=query&${query}&format=json`;
  context.debug(`querying ${url}`);
  const results = await got(url, { json: true });
  context.debug(`got results:`, results.body);

  // A response is not guaranteed to have a `query` member (an error payload
  // has none), so walk the chain defensively instead of throwing a TypeError.
  const body = results.body;
  const pages = body && body.query && body.query.pages;
  if (!pages) {
    return null;
  }

  // `pages` is keyed by page id; we only ever ask for one title, so the first
  // entry is the page we want. An empty object yields `null` as well.
  const [firstPage = null] = Object.values(pages);
  return firstPage;
}