Skip to content

Commit

Permalink
Exclusions option to filter urls (#148)
Browse files Browse the repository at this point in the history
  • Loading branch information
chapmandu authored Oct 24, 2024
1 parent 89048e9 commit 2b0894a
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 4 deletions.
1 change: 1 addition & 0 deletions sitemapper.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export interface SitemapperOptions {
timeout?: number;
url?: string;
fields?: {[name: string]: boolean};
exclusions?: RegExp[];
}

declare class Sitemapper {
Expand Down
27 changes: 23 additions & 4 deletions src/assets/sitemapper.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ export default class Sitemapper {
* @params {boolean} [options.rejectUnauthorized] - If true (default), it will throw on invalid certificates, such as expired or self-signed ones.
* @params {lastmod} [options.lastmod] - the minimum lastmod value for urls
* @params {hpagent.HttpProxyAgent|hpagent.HttpsProxyAgent} [options.proxyAgent] - instance of npm "hpagent" HttpProxyAgent or HttpsProxyAgent to be passed to npm "got"
* @params {Array<RegExp>} [options.exclusions] - Array of regex patterns to exclude URLs
*
* @example let sitemap = new Sitemapper({
* url: 'https://wp.seantburke.com/sitemap.xml',
* timeout: 15000,
* lastmod: 1630693759
* lastmod: 1630693759,
* exclusions: [/foo\.com/, /bar\.xml/] // Filters out URLs matching any of these patterns
* });
*/
constructor(options) {
Expand All @@ -49,6 +51,7 @@ export default class Sitemapper {
settings.rejectUnauthorized === false ? false : true;
this.fields = settings.fields || false;
this.proxyAgent = settings.proxyAgent || {};
this.exclusions = settings.exclusions || [];
}

/**
Expand Down Expand Up @@ -319,6 +322,9 @@ export default class Sitemapper {

return modified >= this.lastmod;
})
.filter((site) => {
return !this.isExcluded(site.loc[0])
})
.map((site) => {
if( !this.fields) {
return site.loc && site.loc[0];
Expand All @@ -343,9 +349,11 @@ export default class Sitemapper {
console.debug(`Additional sitemap found during "crawl('${url}')"`);
}
// Map each child url into a promise to create an array of promises
const sitemap = data.sitemapindex.sitemap.map(
(map) => map.loc && map.loc[0]
);
const sitemap = data.sitemapindex.sitemap
.map((map) => map.loc && map.loc[0])
.filter((url) => {
return !this.isExcluded(url)
});

// Parse all child urls within the concurrency limit in the settings
const limit = pLimit(this.concurrency);
Expand Down Expand Up @@ -446,6 +454,17 @@ export default class Sitemapper {
});
});
}

/**
* Checks if a urls is excluded based on the exclusion patterns.
*
* @param {string} url - The URL to check.
* @returns {boolean} Returns true if the urls is excluded, false otherwise.
*/
isExcluded(url) {
if (this.exclusions.length === 0) return false;
return this.exclusions.some((pattern) => pattern.test(url));
}
}

/**
Expand Down
78 changes: 78 additions & 0 deletions src/tests/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -263,4 +263,82 @@ describe('Sitemapper', function () {
});
});
});

describe('exclusions option', function () {
  // Baseline check: with unrelated exclusions the page_id url must still be
  // returned, so the next test cannot pass merely because the url is absent
  // from the sitemap in the first place.
  it('should prevent false positive', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude video and image sitemap index urls
    sitemapper.exclusions = [/video/, /image/];
    sitemapper.fetch(url)
      .then(data => {
        // The assertions are invoked as methods (as done with `.false()` /
        // `.true()` elsewhere in this file); a bare property access would
        // not assert anything.
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.true();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });

  it('should filter out page_id urls', function (done) {
    this.timeout(30000);
    const url = 'https://wp.seantburke.com/sitemap.xml';
    // exclude every url containing "page_id" (this covers page_id=2)
    sitemapper.exclusions = [/page_id/];
    sitemapper.fetch(url)
      .then(data => {
        data.sites.should.be.Array();
        data.sites.includes('https://wp.seantburke.com/?page_id=2').should.be.false();
        done();
      })
      .catch(error => {
        console.error('Test failed');
        done(error);
      });
  });
});

describe('isExcluded method', function () {
  it('should return false when no exclusions are set', function () {
    // Earlier tests assign `sitemapper.exclusions`; reset it here so this
    // test really runs with no patterns regardless of execution order.
    sitemapper.exclusions = [];
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  it('should return false when url does not match any exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/page1');
    result.should.be.false();
  });

  it('should return true when url matches an exclusion pattern', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/];
    const result = sitemapper.isExcluded('https://foo.com/document.pdf');
    result.should.be.true();
  });

  it('should return true when url matches any of multiple exclusion patterns', function () {
    sitemapper.exclusions = [/\.pdf$/, /private/, /temp/];
    const result = sitemapper.isExcluded('https://foo.com/private/temp.html');
    result.should.be.true();
  });

  it('should handle complex regex patterns correctly', function () {
    sitemapper.exclusions = [/^https:\/\/foo\.com\/([a-z]{2})\/private/];
    const result1 = sitemapper.isExcluded('https://foo.com/en/private/page');
    const result2 = sitemapper.isExcluded('https://foo.com/en/public/page');
    result1.should.be.true();
    result2.should.be.false();
  });

  it('should handle case sensitivity correctly', function () {
    // The `i` flag makes the pattern match regardless of letter case.
    sitemapper.exclusions = [/private/i];
    const result1 = sitemapper.isExcluded('https://foo.com/PRIVATE/page');
    const result2 = sitemapper.isExcluded('https://foo.com/Private/page');
    result1.should.be.true();
    result2.should.be.true();
  });
});
});

0 comments on commit 2b0894a

Please sign in to comment.