Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filepath and raw XML Text files are now supported by the fetch method #114

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ npm-debug.log
tmp
lib/tests/
lib/**/*.map
yarn.lock
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
## NPM : https://www.npmjs.com/package/@myagizmaktav/sitemapper

## Sitemap-parser
[![Code Scanning](https://github.com/seantomburke/sitemapper/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/seantomburke/sitemapper/actions/workflows/codeql-analysis.yml)
[![NPM Publish](https://github.com/seantomburke/sitemapper/actions/workflows/npm-publish.yml/badge.svg)](https://github.com/seantomburke/sitemapper/actions/workflows/npm-publish.yml)
Expand Down Expand Up @@ -35,7 +37,24 @@ sitemap.fetch('https://wp.seantburke.com/sitemap.xml').then(function(sites) {
console.log(sites);
});

// OR

sitemap.fetch("D:\\githubprojects\\src\\data\\sites-xmls\\Sitemap.xml").then(function(sites) {
console.log(sites);
}
);

// OR

sitemap.fetch('<?xml version="1.0" encoding="utf-8" standalone="yes" ?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://wp.seantburke.com/?p=231</loc></url></urlset>').then(function(sites) {
console.log(sites);
}
);



```

### Examples in ES6
```javascript
import Sitemapper from 'sitemapper';
Expand Down
24 changes: 8 additions & 16 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"name": "sitemapper",
"name": "@myagizmaktav/sitemapper",
myagizmaktav marked this conversation as resolved.
Show resolved Hide resolved
"version": "3.2.8",
"description": "Parser for XML Sitemaps to be used with Robots.txt and web crawlers",
"keywords": [
Expand All @@ -11,10 +11,10 @@
"crawlers",
"webcrawler"
],
"homepage": "http://github.com/seantomburke/sitemapper",
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR changes the main URLs of the underlying package, so I wouldn't be able to merge this in. The NPM package needs to reference this repository and not the fork.

"homepage": "https://github.com/myagizmaktav/sitemapper",
"tonicExampleFilename": "example.js",
"bugs": {
"url": "http://github.com/seantomburke/sitemapper/issues"
"url": "https://github.com/myagizmaktav/sitemapper/issues"
},
"license": "MIT",
"files": [
Expand All @@ -25,13 +25,9 @@
"types": "./sitemapper.d.ts",
"repository": {
"type": "git",
"url": "git://github.com/seantomburke/sitemapper.git"
},
"author": {
"name": "Sean Thomas Burke",
"email": "[email protected]",
"url": "http://www.seantburke.com"
"url": "git+https://github.com/myagizmaktav/sitemapper.git"
},
"author": "Mehmet Yağız Maktav <[email protected]>",
"scripts": {
"compile": "babel src -d lib -s && tsc --project ./src/tests/",
"build": "npm run clean && npm run compile",
Expand All @@ -42,15 +38,11 @@
"docs": "documentation build ./src/assets/sitemapper.js -f md > docs.md"
},
"maintainers": [
{
"name": "Sean Thomas Burke",
"email": "[email protected]",
"url": "http://www.seantburke.com"
}
"Sean Thomas Burke <[email protected]> (http://www.seantburke.com)",
"Mehmet Yağız Maktav <[email protected]>"
],
"directories": {
"lib": "./lib",
"test": "./test"
"lib": "lib"
},
"engines": {
"node": ">= 10.0.0"
Expand Down
67 changes: 49 additions & 18 deletions src/assets/sitemapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import got from "got";
import zlib from "zlib";
import pLimit from "p-limit";
import isGzip from "is-gzip";
import fs from "fs";

/**
* @typedef {Object} Sitemapper
Expand Down Expand Up @@ -186,33 +187,63 @@ export default class Sitemapper {
rejectUnauthorized: this.rejectUnauthorized,
},
};

const isUrlRegex =
// eslint-disable-next-line no-useless-escape
/^https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)/;
try {
// create a request Promise with the url and request options
const requester = got.get(url, requestOptions);
let data;

// initialize the timeout method based on the URL, and pass the request object.
this.initializeTimeout(url, requester);
// if the input is an http(s) URL
if (isUrlRegex.test(url)) {
// create a request Promise with the url and request options
const requester = got.get(url, requestOptions);

// get the response from the requester promise
const response = await requester;
// initialize the timeout method based on the URL, and pass the request object.
this.initializeTimeout(url, requester);

// if the response does not have a successful status code then clear the timeout for this url.
if (!response || response.statusCode !== 200) {
clearTimeout(this.timeoutTable[url]);
return { error: response.error, data: response };
}
// get the response from the requester promise
let response = await requester;
// console.log(String(response.body));

// if the response does not have a successful status code then clear the timeout for this url.
if (!response || response.statusCode !== 200) {
clearTimeout(this.timeoutTable[url]);
return { error: response.error, data: response };
}
let responseBody;

if (isGzip(response.rawBody)) {
responseBody = await this.decompressResponseBody(response.body);
} else {
responseBody = response.body;
}

let responseBody;
data = await parseStringPromise(responseBody);
}
// if the input is raw XML text
else if (url.includes("xmlns") && url.includes("sitemaps")) {
let responseBody;
if (isGzip(url)) {
responseBody = await this.decompressResponseBody(url);
} else {
responseBody = await url;
}

if (isGzip(response.rawBody)) {
responseBody = await this.decompressResponseBody(response.body);
} else {
responseBody = response.body;
data = await parseStringPromise(responseBody);
}
// if the input is a file path
else if (url.includes(":\\")) {
const rawBody = await fs.readFileSync(url, "utf-8");
let responseBody;
if (isGzip(rawBody)) {
responseBody = await this.decompressResponseBody(rawBody);
} else {
responseBody = await rawBody;
}

data = await parseStringPromise(responseBody);
} else throw new Error("Invalid url");
// otherwise parse the XML that was returned.
const data = await parseStringPromise(responseBody);

// return the results
return { error: null, data };
Expand Down
25 changes: 25 additions & 0 deletions src/tests/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -262,4 +262,29 @@ describe('Sitemapper', function () {
});
});
});

// Checks that getSites accepts a raw XML sitemap string in place of a URL.
describe("fetch readsRawData", function () {
it("fetch should be read raw data", function (done) {
// sets this test's timeout to 30000 (presumably milliseconds, Mocha-style — confirm)
this.timeout(30000);
// inline sitemap document containing two <url><loc> entries
const url =
'<?xml version="1.0" encoding="utf-8" standalone="yes" ?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://wp.seantburke.com/?p=234</loc></url><url><loc>https://wp.seantburke.com/?p=231</loc></url></urlset>';
// NOTE(review): `err` is received but never inspected; only `sites` is asserted on
sitemapper.getSites(url, (err, sites) => {
sites.should.be.Array;
// the first returned entry is expected to pass the isUrl check
isUrl(sites[0]).should.be.true;
done();
});
});
});
// Checks that getSites accepts a local file path in place of a URL.
describe("fetch readsFilePath", function () {
it("fetch should be read filePath", function (done) {
// sets this test's timeout to 30000 (presumably milliseconds, Mocha-style — confirm)
this.timeout(30000);
// NOTE(review): absolute path on a specific Windows drive; presumably this file
// only exists on the author's machine, so the test likely fails elsewhere —
// consider a fixture checked into the repository. TODO confirm.
const url =
"D:\\githubprojects\\ProjectSeriesBackend\\src\\data\\sites-xmls\\Sitemap.xml";
// NOTE(review): `err` is received but never inspected; only `sites` is asserted on
sitemapper.getSites(url, (err, sites) => {
sites.should.be.Array;
// the first returned entry is expected to pass the isUrl check
isUrl(sites[0]).should.be.true;
done();
});
});
});
});