forked from shioju/crawlee-core
-
Notifications
You must be signed in to change notification settings - Fork 0
/
configuration.js
327 lines (327 loc) · 12.3 KB
/
configuration.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Configuration = void 0;
const tslib_1 = require("tslib");
const node_async_hooks_1 = require("node:async_hooks");
const node_events_1 = require("node:events");
const node_path_1 = require("node:path");
const log_1 = tslib_1.__importStar(require("@apify/log"));
const memory_storage_1 = require("@crawlee/memory-storage");
const fs_extra_1 = require("fs-extra");
const events_1 = require("./events");
const typedefs_1 = require("./typedefs");
/**
* `Configuration` is a value object holding Crawlee configuration. By default, there is a
* global singleton instance of this class available via `Configuration.getGlobalConfig()`.
* Places that depend on a configurable behaviour depend on this class, as they have the global
* instance as the default value.
*
* *Using global configuration:*
* ```js
* import { BasicCrawler, Configuration } from 'crawlee';
*
* // Get the global configuration
* const config = Configuration.getGlobalConfig();
* // Set the 'persistStateIntervalMillis' option
* // of global configuration to 10 seconds
* config.set('persistStateIntervalMillis', 10_000);
*
* // No need to pass the configuration to the crawler,
* // as it's using the global configuration by default
* const crawler = new BasicCrawler();
* ```
*
* *Using custom configuration:*
* ```js
* import { BasicCrawler, Configuration } from 'crawlee';
*
* // Create a new configuration
* const config = new Configuration({ persistStateIntervalMillis: 30_000 });
* // Pass the configuration to the crawler
* const crawler = new BasicCrawler({ ... }, config);
* ```
*
* The configuration provided via environment variables always takes precedence. We can also
* define the `crawlee.json` file in the project root directory which will serve as a baseline,
* so the options provided in constructor will override those. In other words, the precedence is:
*
* ```text
* crawlee.json < constructor options < environment variables
* ```
*
* ## Supported Configuration Options
*
* Key | Environment Variable | Default Value
* ---|---|---
* `memoryMbytes` | `CRAWLEE_MEMORY_MBYTES` | -
* `logLevel` | `CRAWLEE_LOG_LEVEL` | -
* `headless` | `CRAWLEE_HEADLESS` | `true`
* `defaultDatasetId` | `CRAWLEE_DEFAULT_DATASET_ID` | `'default'`
* `defaultKeyValueStoreId` | `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID` | `'default'`
* `defaultRequestQueueId` | `CRAWLEE_DEFAULT_REQUEST_QUEUE_ID` | `'default'`
* `persistStateIntervalMillis` | `CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS` | `60_000`
* `purgeOnStart` | `CRAWLEE_PURGE_ON_START` | `true`
* `persistStorage` | `CRAWLEE_PERSIST_STORAGE` | `true`
*
* ## Advanced Configuration Options
*
* Key | Environment Variable | Default Value
* ---|---|---
* `inputKey` | `CRAWLEE_INPUT_KEY` | `'INPUT'`
* `xvfb` | `CRAWLEE_XVFB` | -
* `chromeExecutablePath` | `CRAWLEE_CHROME_EXECUTABLE_PATH` | -
* `defaultBrowserPath` | `CRAWLEE_DEFAULT_BROWSER_PATH` | -
* `disableBrowserSandbox` | `CRAWLEE_DISABLE_BROWSER_SANDBOX` | -
* `availableMemoryRatio` | `CRAWLEE_AVAILABLE_MEMORY_RATIO` | `0.25`
*/
class Configuration {
    /**
     * Creates new `Configuration` instance with provided options. Env vars will have precedence over those.
     *
     * @param {object} [options] Instance-level options. They are layered on top of the values
     *   loaded from `crawlee.json` (if such a file exists in the current working directory).
     *   The passed object is not modified.
     */
    constructor(options = {}) {
        Object.defineProperty(this, "options", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        Object.defineProperty(this, "services", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        Object.defineProperty(this, "storageManagers", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: new Map()
        });
        this.buildOptions(options);
        // Increase the global limit for event emitter memory leak warnings.
        node_events_1.EventEmitter.defaultMaxListeners = 50;
        // set the log level to support CRAWLEE_ prefixed env var too
        const logLevel = this.get('logLevel');
        // Explicit null/empty check instead of a truthiness check, so that a numeric level of `0`
        // is applied as well (NOTE(review): presumably `LogLevel.OFF` in `@apify/log` - confirm).
        if (logLevel != null && logLevel !== '') {
            // Numeric values (or numeric strings) are used as-is, anything else is treated as a level name.
            const level = Number.isFinite(+logLevel)
                ? +logLevel
                : log_1.LogLevel[String(logLevel).toUpperCase()];
            log_1.default.setLevel(level);
        }
    }
    /**
     * Returns configured value. First checks the environment variables, then provided configuration,
     * falls back to the `defaultValue` argument if provided, otherwise uses the default value from
     * `Configuration.DEFAULTS`.
     *
     * @param {string} key Option name (e.g. `'headless'`).
     * @param {*} [defaultValue] Value used when neither an env var nor an instance option is set.
     * @returns {*} The resolved value, or `undefined` when nothing is configured and no default exists.
     */
    get(key, defaultValue) {
        // Prefer env vars. Always iterate through the whole map as there might be several env vars
        // for the same option; stop at the first one holding a non-empty value.
        let envValue;
        for (const [envName, optionKey] of (0, typedefs_1.entries)(Configuration.ENV_MAP)) {
            if (optionKey !== key) {
                continue;
            }
            envValue = process.env[envName];
            if (envValue) {
                break;
            }
        }
        // An env var that is set to an empty string still counts as "set" and gets cast below.
        if (envValue != null) {
            return this._castEnvValue(key, envValue);
        }
        // check instance level options
        if (this.options.has(key)) {
            return this.options.get(key);
        }
        // fallback to defaults
        return defaultValue ?? Configuration.DEFAULTS[key];
    }
    /**
     * Converts the raw string read from an environment variable to the type expected for `key`.
     * Keys listed in `INTEGER_VARS` are converted with unary plus (non-numeric input yields `NaN`),
     * keys listed in `BOOLEAN_VARS` become booleans, everything else is returned unchanged.
     *
     * @param {string} key Option name.
     * @param {string} value Raw environment variable value.
     * @returns {number|boolean|string}
     */
    _castEnvValue(key, value) {
        if (Configuration.INTEGER_VARS.includes(key)) {
            return +value;
        }
        if (Configuration.BOOLEAN_VARS.includes(key)) {
            // 0, false and empty string are considered falsy values
            return !['0', 'false', ''].includes(String(value).toLowerCase());
        }
        return value;
    }
    /**
     * Sets value for given option. Only affects this `Configuration` instance, the value will not be propagated down to the env var.
     * To reset a value, we can omit the `value` argument or pass `undefined` there.
     *
     * @param {string} key Option name.
     * @param {*} [value] New value; `undefined` removes the instance-level value.
     */
    set(key, value) {
        if (value === undefined) {
            // Remove the entry instead of storing `undefined`, otherwise `get()` would find the key
            // in the map and return `undefined` rather than falling back to the default value.
            this.options.delete(key);
            return;
        }
        this.options.set(key, value);
    }
    /**
     * Sets value for given option. Only affects the global `Configuration` instance, the value will not be propagated down to the env var.
     * To reset a value, we can omit the `value` argument or pass `undefined` there.
     *
     * @param {string} key Option name.
     * @param {*} [value] New value; `undefined` removes the instance-level value.
     */
    static set(key, value) {
        this.getGlobalConfig().set(key, value);
    }
    /**
     * Returns cached instance of {@apilink StorageClient} using options as defined in the environment variables or in
     * this {@apilink Configuration} instance. Only first call of this method will create the client, following calls will
     * return the same client instance.
     *
     * Caching works based on the `storageClientOptions`, so calling this method with different options will return
     * multiple instances, one for each variant of the options.
     * @internal
     */
    getStorageClient() {
        // An explicitly registered client (see `useStorageClient()`) always wins.
        if (this.options.has('storageClient')) {
            return this.options.get('storageClient');
        }
        const options = this.options.get('storageClientOptions');
        return this.createMemoryStorage(options);
    }
    /**
     * Returns the event manager of this instance: the one registered via `useEventManager()` if any,
     * otherwise a `LocalEventManager` that is created on first use and cached afterwards.
     */
    getEventManager() {
        if (this.options.has('eventManager')) {
            return this.options.get('eventManager');
        }
        if (this.services.has('eventManager')) {
            return this.services.get('eventManager');
        }
        const eventManager = new events_1.LocalEventManager(this);
        this.services.set('eventManager', eventManager);
        return eventManager;
    }
    /**
     * Creates an instance of MemoryStorage using options as defined in the environment variables or in this `Configuration` instance.
     * Instances are cached per serialized `options`, so equal options yield the same instance.
     * @internal
     */
    createMemoryStorage(options = {}) {
        const cacheKey = `MemoryStorage-${JSON.stringify(options)}`;
        if (this.services.has(cacheKey)) {
            return this.services.get(cacheKey);
        }
        const storage = new memory_storage_1.MemoryStorage({
            persistStorage: this.get('persistStorage'),
            // Override persistStorage if user provides it via storageClientOptions
            ...options,
        });
        this.services.set(cacheKey, storage);
        return storage;
    }
    /**
     * Registers the storage client returned by `getStorageClient()` of this instance.
     */
    useStorageClient(client) {
        this.options.set('storageClient', client);
    }
    /**
     * Registers the storage client on the global configuration instance.
     */
    static useStorageClient(client) {
        this.getGlobalConfig().useStorageClient(client);
    }
    /**
     * Registers the event manager returned by `getEventManager()` of this instance.
     */
    useEventManager(events) {
        this.options.set('eventManager', events);
    }
    /**
     * Returns the global configuration instance. It will respect the environment variables.
     * When `Configuration.storage` currently holds a store, that scoped instance is returned instead.
     */
    static getGlobalConfig() {
        const scopedConfig = Configuration.storage.getStore();
        if (scopedConfig) {
            return scopedConfig;
        }
        // Lazily create the singleton on first access.
        Configuration.globalConfig ?? (Configuration.globalConfig = new Configuration());
        return Configuration.globalConfig;
    }
    /**
     * Gets default {@apilink StorageClient} instance.
     */
    static getStorageClient() {
        return this.getGlobalConfig().getStorageClient();
    }
    /**
     * Gets default {@apilink EventManager} instance.
     */
    static getEventManager() {
        return this.getGlobalConfig().getEventManager();
    }
    /**
     * Resets global configuration instance. The default instance holds configuration based on env vars,
     * if we want to change them, we need to first reset the global state. Used mainly for testing purposes.
     */
    static resetGlobalState() {
        delete this.globalConfig;
    }
    /**
     * Builds the instance-level option map. Values from `crawlee.json` in the current working
     * directory serve as the baseline and the provided `options` are layered on top of them,
     * so the precedence is `crawlee.json < constructor options`.
     *
     * @param {object} options Options passed to the constructor; the object is left untouched.
     */
    buildOptions(options) {
        // try to load configuration from crawlee.json as the baseline
        let optionsFromFileConfig = {};
        const path = (0, node_path_1.join)(process.cwd(), 'crawlee.json');
        if ((0, fs_extra_1.pathExistsSync)(path)) {
            try {
                const file = (0, fs_extra_1.readFileSync)(path);
                const parsed = JSON.parse(file.toString());
                // Only a plain JSON object is a meaningful set of options.
                if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
                    optionsFromFileConfig = parsed;
                }
            }
            catch {
                // Deliberately best-effort: an unreadable or malformed file is ignored.
            }
        }
        // Later spreads win, so explicit options override the file baseline. Merging into a fresh
        // object also avoids mutating the caller's `options`.
        this.options = new Map((0, typedefs_1.entries)({ ...optionsFromFileConfig, ...options }));
    }
}
exports.Configuration = Configuration;
/**
 * Lookup table translating environment variable names into configuration option keys
 * (for example `CRAWLEE_MEMORY_MBYTES` is translated to `memoryMbytes`).
 * `Configuration.get()` walks this table to find the env vars belonging to an option.
 */
Object.defineProperty(Configuration, 'ENV_MAP', {
    value: {
        CRAWLEE_AVAILABLE_MEMORY_RATIO: 'availableMemoryRatio',
        CRAWLEE_PURGE_ON_START: 'purgeOnStart',
        CRAWLEE_MEMORY_MBYTES: 'memoryMbytes',
        CRAWLEE_DEFAULT_DATASET_ID: 'defaultDatasetId',
        CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID: 'defaultKeyValueStoreId',
        CRAWLEE_DEFAULT_REQUEST_QUEUE_ID: 'defaultRequestQueueId',
        CRAWLEE_INPUT_KEY: 'inputKey',
        CRAWLEE_PERSIST_STATE_INTERVAL_MILLIS: 'persistStateIntervalMillis',
        CRAWLEE_HEADLESS: 'headless',
        CRAWLEE_XVFB: 'xvfb',
        CRAWLEE_CHROME_EXECUTABLE_PATH: 'chromeExecutablePath',
        CRAWLEE_DEFAULT_BROWSER_PATH: 'defaultBrowserPath',
        CRAWLEE_DISABLE_BROWSER_SANDBOX: 'disableBrowserSandbox',
        CRAWLEE_LOG_LEVEL: 'logLevel',
        CRAWLEE_PERSIST_STORAGE: 'persistStorage',
    },
    writable: true,
    enumerable: true,
    configurable: true,
});
/**
 * Option keys whose environment variable value is converted to a boolean
 * by `Configuration._castEnvValue()`.
 */
Object.defineProperty(Configuration, 'BOOLEAN_VARS', {
    value: [
        'purgeOnStart',
        'headless',
        'xvfb',
        'disableBrowserSandbox',
        'persistStorage',
    ],
    writable: true,
    enumerable: true,
    configurable: true,
});
/**
 * Option keys whose environment variable value is converted to a number
 * (via unary plus) by `Configuration._castEnvValue()`.
 */
Object.defineProperty(Configuration, 'INTEGER_VARS', {
    value: [
        'memoryMbytes',
        'persistStateIntervalMillis',
        'systemInfoIntervalMillis',
    ],
    writable: true,
    enumerable: true,
    configurable: true,
});
/**
 * Fallback values returned by `Configuration.get()` when an option is set neither through
 * an environment variable nor on the instance, and the caller passed no `defaultValue`.
 */
Object.defineProperty(Configuration, 'DEFAULTS', {
    value: {
        defaultKeyValueStoreId: 'default',
        defaultDatasetId: 'default',
        defaultRequestQueueId: 'default',
        inputKey: 'INPUT',
        maxUsedCpuRatio: 0.95,
        availableMemoryRatio: 0.25,
        storageClientOptions: {},
        purgeOnStart: true,
        headless: true,
        persistStateIntervalMillis: 60000,
        systemInfoIntervalMillis: 1000,
        persistStorage: true,
    },
    writable: true,
    enumerable: true,
    configurable: true,
});
/**
 * `AsyncLocalStorage` holding the Configuration scoped to the current async context, so it can be
 * reached without passing it around in parameters. `Configuration.getGlobalConfig()` returns the
 * stored instance when one is present.
 * @internal
 */
Object.defineProperty(Configuration, 'storage', {
    value: new node_async_hooks_1.AsyncLocalStorage(),
    writable: true,
    enumerable: true,
    configurable: true,
});
//# sourceMappingURL=configuration.js.map