Skip to content

Commit

Permalink
Merge pull request #21 from Bartozzz/development
Browse files Browse the repository at this point in the history
Bump version 1.4.1
  • Loading branch information
Bartozzz authored Apr 23, 2018
2 parents d9312c9 + 3206d37 commit 1efe17b
Show file tree
Hide file tree
Showing 9 changed files with 227 additions and 175 deletions.
54 changes: 47 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,21 @@ spider.get("/").then(({ req, res, uri }) => {
});
```

### Example 4: _Setting cookies_

```javascript
const url = "http://example.com/";
const spider = crawlerr(url);

spider.request.setCookie(spider.request.cookie("foobar=…"), url);
spider.request.setCookie(spider.request.cookie("session=…"), url);

spider.get("/profile").then(({ req, res, uri }) => {
//… spider.request.getCookieString(url);
//… spider.request.getCookies(url);
});
```

## API

### `crawlerr(base [, options])`
Expand All @@ -76,14 +91,18 @@ Creates a new `Crawlerr` instance for a specific website with custom `options`.

| Option | Default | Description |
|:-------------|:--------|:-----------------------------------------------|
| `concurrent` | `10` | How many requests can be sent at the same time |
| `concurrent` | `10` | How many requests can be run simultaneously |
| `interval` | `250` | How often a new request should be sent (in ms) |
| `…` | `null` | See [`request` defaults](https://github.com/request/request#requestdefaultsoptions) for more information |

<br />

#### **public** `.get(url)`

Requests `url`. Returns a `Promise` with `{ req, res, uri }` as response, where `req` is the [Request object](#request), `res` is the [Response object](#response) and `uri` is the absolute `url` (resolved from `base`).
Requests `url`. Returns a `Promise` which resolves with `{ req, res, uri }`, where:
- `req` is the [Request object](#request);
- `res` is the [Response object](#response);
- `uri` is the absolute `url` (resolved from `base`).

**Example:**

Expand All @@ -95,9 +114,9 @@ spider

<br />

#### **public** `.when(url)`
#### **public** `.when(pattern)`

Searches on the entire website (not just a single page) urls matching the `url` pattern. `url` can include named [wildcards](https://github.com/Bartozzz/wildcard-named) which can be then retrieved in the response with `res.param`.
Searches the entire website for URLs which match the specified `pattern`. `pattern` can include named [wildcards](https://github.com/Bartozzz/wildcard-named) whose values can then be retrieved in the response via `req.param`.

**Example:**

Expand All @@ -117,13 +136,12 @@ Executes a `callback` for a given `event`. For more informations about which eve

```javascript
spider.on("error", …);
spider.on("reject", …);
spider.on("resolve", …);
```

<br />

#### **public** `start()`/`stop()`
#### **public** `.start()`/`.stop()`

Starts/stops the crawler.

Expand All @@ -134,11 +152,33 @@ spider.start();
spider.stop();
```

<br />

#### **public** `.request`

A configured [`request`](https://github.com/request/request) object which is used by [`retry-request`](https://github.com/stephenplusplus/retry-request) when crawling webpages. Extends from `request.jar()`. Can be configured when initializing a new crawler instance through `options`. See [crawler options](https://github.com/Bartozzz/crawlerr#crawlerrbase--options) and the [`request` documentation](https://github.com/request/request) for more information.

**Example:**

```javascript
const url = "https://example.com";
const spider = crawlerr(url);
const request = spider.request;

request.post(`${url}/login`, (err, res, body) => {
request.setCookie(request.cookie("session=…"), url);
// Next requests will include this cookie

spider.get("/profile").then(…);
spider.get("/settings").then(…);
});
```

---

### Request

<sub>Extends the default `Node.js` [incoming message](https://nodejs.org/api/http.html#http_class_http_incomingmessage).</sub>
<sup>Extends the default `Node.js` [incoming message](https://nodejs.org/api/http.html#http_class_http_incomingmessage).</sup>

#### **public** `get(header)`

Expand Down
25 changes: 20 additions & 5 deletions dist/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,18 @@ var _mergeDescriptors = require("merge-descriptors");

var _mergeDescriptors2 = _interopRequireDefault(_mergeDescriptors);

var _request = require("request");

var _request2 = _interopRequireDefault(_request);

var _events = require("events");

var _events2 = _interopRequireDefault(_events);

var _setprototypeof = require("setprototypeof");

var _setprototypeof2 = _interopRequireDefault(_setprototypeof);

var _promise = require("./queue/promise");

var _promise2 = _interopRequireDefault(_promise);
Expand All @@ -20,9 +28,9 @@ var _router = require("./routing/router");

var _router2 = _interopRequireDefault(_router);

var _request = require("./routing/request");
var _request3 = require("./routing/request");

var _request2 = _interopRequireDefault(_request);
var _request4 = _interopRequireDefault(_request3);

var _response = require("./routing/response");

Expand All @@ -47,11 +55,18 @@ function createCrawler(base) {
concurrent: 10
}, options);

// Will be used by retry-request:
var requestJar = _request2.default.jar();
var requestObj = _request2.default.defaults(_extends({ jar: requestJar }, config));
(0, _setprototypeof2.default)(requestObj, requestJar);

// Crawler base:
var crawler = {
base: base,
opts: config,
req: _request2.default,
res: _response2.default
req: _request4.default,
res: _response2.default,
request: requestObj
};

// Glues all the components together:
Expand All @@ -63,5 +78,5 @@ function createCrawler(base) {
}

module.exports = createCrawler;
module.exports.request = _request2.default;
module.exports.request = _request4.default;
module.exports.response = _response2.default;
132 changes: 66 additions & 66 deletions dist/routing/request.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,71 +16,71 @@ var _typeIs2 = _interopRequireDefault(_typeIs);

function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { default: obj }; }

exports.default = Object.create(_http2.default.IncomingMessage.prototype, {
/**
* Returns request header. The `Referrer` header field is special-cased,
* both `Referrer` and `Referer` are interchangeable.
*
* @param {string} header
* @return {string}
* @access public
*/
get: function get(header) {
if (!header || typeof header !== "string") {
throw new TypeError("req.get takes a string as argument, got " + (typeof header === "undefined" ? "undefined" : _typeof(header)));
}

var lower = header.toLowerCase();

switch (lower) {
case "referer":
case "referrer":
return this.headers.referrer || this.headers.referer;

default:
return this.headers[lower];
}
},


/**
* Check if the incoming request contains the "Content-Type" header field,
* and it contains the give mime `type`.
*
* @param {string|Array} types...
* @return {string|false}
* @access public
*/
is: function is() {
for (var _len = arguments.length, types = Array(_len), _key = 0; _key < _len; _key++) {
types[_key] = arguments[_key];
}

return (0, _typeIs2.default)(this, types);
},


/**
* Return the value of param `name` when present or `defaultValue`:
* - checks route placeholders, ex: `user/[all:username]`;
* - checks body params, ex: `id=12, {"id":12}`;
* - checks query string params, ex: `?id=12`;
*
* @param {string} name
* @param {any} defaultValue
* @return {string}
* @access public
*/
param: function param(name, defaultValue) {
var params = this.params || {};
var query = this.query || {};
var body = this.body || {};

if (params[name] != null) return params[name];
if (query[name] != null) return query[name];
if (body[name] != null) return body[name];

return defaultValue;
var req = Object.create(_http2.default.IncomingMessage.prototype);

/**
* Returns the value of a request header, looked up by its lower-cased name in
* `this.headers`. The `Referrer` header field is special-cased: both `Referrer`
* and `Referer` are interchangeable, and `headers.referrer` is preferred over
* `headers.referer` when both are set.
*
* @param {string} header Header name (matched case-insensitively)
* @return {string|undefined} Header value, or `undefined` when not present
* @throws {TypeError} When `header` is empty or not a string
* @access public
*/
req.get = function (header) {
if (!header || typeof header !== "string") {
throw new TypeError("req.get takes a string as argument, got " + (typeof header === "undefined" ? "undefined" : _typeof(header)));
}
});

var lower = header.toLowerCase();

switch (lower) {
case "referer":
case "referrer":
return this.headers.referrer || this.headers.referer;

default:
return this.headers[lower];
}
};

/**
 * Checks the incoming request's "Content-Type" header field against the given
 * mime `types`. Every argument passed to this method is collected into a
 * single array and handed, together with the request itself, to `type-is`;
 * whatever `type-is` returns is passed back unchanged.
 *
 * @param {string|Array} types...
 * @return {string|false}
 * @access public
 */
req.is = function () {
  // Snapshot `arguments` into a real array (ES5 equivalent of rest parameters).
  var types = Array.prototype.slice.call(arguments);
  var typeIs = _typeIs2.default;

  return typeIs(this, types);
};

/**
 * Looks up the parameter `name` and returns the first value that is neither
 * `null` nor `undefined`, searching in this order:
 * 1. route placeholders (`this.params`), ex: `user/[all:username]`;
 * 2. query string params (`this.query`), ex: `?id=12`;
 * 3. body params (`this.body`), ex: `id=12, {"id":12}`.
 *
 * A missing source is treated as an empty object. When no source holds a
 * value, `defaultValue` is returned.
 *
 * @param {string} name
 * @param {any} defaultValue
 * @return {any}
 * @access public
 */
req.param = function (name, defaultValue) {
  var sources = [this.params || {}, this.query || {}, this.body || {}];

  for (var i = 0; i < sources.length; i++) {
    // Loose comparison on purpose: skips both `null` and `undefined`.
    if (sources[i][name] != null) {
      return sources[i][name];
    }
  }

  return defaultValue;
};

exports.default = req;
module.exports = exports["default"];
2 changes: 1 addition & 1 deletion dist/routing/router.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ exports.default = {
uri = this.normalizeUri(uri);

return new Promise(function (resolve, reject) {
(0, _retryRequest2.default)(uri, function (error, response) {
(0, _retryRequest2.default)(uri, { request: _this2.request }, function (error, response) {
if (error || response.statusCode !== 200) {
return reject(error || uri);
}
Expand Down
Loading

0 comments on commit 1efe17b

Please sign in to comment.