Skip to content

Commit

Permalink
Refactor: separate modules
Browse files Browse the repository at this point in the history
  • Loading branch information
SukkaW committed Jan 11, 2025
1 parent eca2949 commit 29410eb
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 179 deletions.
97 changes: 42 additions & 55 deletions Build/build-reject-domainset.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@
import path from 'node:path';
import process from 'node:process';

import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { processHosts } from './lib/parse-filter/hosts';
import { processDomainLists } from './lib/parse-filter/domainlists';
import { processFilterRules } from './lib/parse-filter/filters';

import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source';
import { compareAndWriteFile } from './lib/create-file';
Expand All @@ -18,6 +20,7 @@ import { addArrayElementsToSet } from 'foxts/add-array-elements-to-set';
import { appendArrayInPlace } from './lib/append-array-in-place';
import { OUTPUT_INTERNAL_DIR, SOURCE_DIR } from './constants/dir';
import { DomainsetOutput } from './lib/create-file';
import { foundDebugDomain } from './lib/parse-filter/shared';

// Start reading the local reject domainset sources as soon as this module is evaluated.
// Only the promises are stored here; they are consumed later via `.then(...)` inside the build task.
// NOTE(review): `readFileIntoProcessedArray` presumably resolves to the file's processed lines — confirm against its definition.
const readLocalRejectDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf'));
// Same as above, for the "extra" reject domainset source file.
const readLocalRejectExtraDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka_extra.conf'));
Expand Down Expand Up @@ -63,65 +66,49 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);

// Parse from AdGuard Filters
const shouldStop = await span
await span
.traceChild('download and process hosts / adblock filter rules')
.traceAsyncFn(async (childSpan) => {
// eslint-disable-next-line sukka/no-single-return -- not single return
let shouldStop = false;
await Promise.all([
// Parse from remote hosts & domain lists
HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),

DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),

ADGUARD_FILTERS.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true;
// we should not break here, as we want to see full matches from all data source
}
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectOutput(black);
})
),
ADGUARD_FILTERS_EXTRA.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black, foundDebugDomain }) => {
if (foundDebugDomain) {
// eslint-disable-next-line sukka/no-single-return -- not single return
shouldStop = true;
// we should not break here, as we want to see full matches from all data source
}
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectExtraOutput(black);
})
),
ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
})),
getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
// Dedupe domainSets
// span.traceChildAsync('collect black keywords/suffixes', async () =>
/**
.traceAsyncFn((childSpan) => Promise.all([
// Parse from remote hosts & domain lists
HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)),
HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),

DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)),
DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)),

ADGUARD_FILTERS.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectOutput(black);
})
),
ADGUARD_FILTERS_EXTRA.map(
entry => processFilterRules(childSpan, ...entry)
.then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
appendArrayToRejectExtraOutput(black);
})
),
ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => {
addArrayElementsToSet(filterRuleWhitelistDomainSets, white);
addArrayElementsToSet(filterRuleWhitelistDomainSets, black);
})),
getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput),
readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput),
readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput),
// Dedupe domainSets
// span.traceChildAsync('collect black keywords/suffixes', async () =>
/**
* Collect DOMAIN, DOMAIN-SUFFIX, and DOMAIN-KEYWORD from non_ip/reject.conf for deduplication
* DOMAIN-WILDCARD is not really useful for deduplication, it is only included in AdGuardHome output
*/
rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
].flat());
// eslint-disable-next-line sukka/no-single-return -- not single return
return shouldStop;
});
rejectOutput.addFromRuleset(readLocalRejectRulesetPromise),
rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise)
].flat()));

if (shouldStop) {
if (foundDebugDomain.value) {
process.exit(1);
}

Expand Down
4 changes: 3 additions & 1 deletion Build/lib/get-phishing-domains.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { processDomainLists, processHosts } from './parse-filter';
import { processHosts } from './parse-filter/hosts';
import { processDomainLists } from './parse-filter/domainlists';

import * as tldts from 'tldts-experimental';

import { dummySpan, printTraceResult } from '../trace';
Expand Down
7 changes: 3 additions & 4 deletions Build/lib/parse-filter.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { describe, it } from 'mocha';

import { parse, processFilterRules } from './parse-filter';
import type { ParseType } from './parse-filter';
import { parse, processFilterRules } from './parse-filter/filters';
import type { ParseType } from './parse-filter/filters';
import { createCacheKey } from './cache-filesystem';
import { createSpan } from '../trace';

Expand All @@ -20,8 +20,7 @@ describe.skip('processFilterRules', () => {
console.log(processFilterRules(
createSpan('noop'),
cacheKey('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt'),
[],
7_200_000
[]
));
});
});
51 changes: 51 additions & 0 deletions Build/lib/parse-filter/domainlists.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import picocolors from 'picocolors';
import { normalizeDomain } from '../normalize-domain';
import { processLine } from '../process-line';
import { onBlackFound } from './shared';
import { fetchAssetsWithout304 } from '../fetch-assets';
import type { Span } from '../../trace';

/**
 * Handles a single line of a plain domain list.
 *
 * The line is run through `processLine` (a falsy result means "skip"), lower-cased
 * and normalized. Only lines that are already in normalized form are accepted;
 * anything that normalization would alter is reported on the console and dropped.
 *
 * @param l - Raw line taken from the downloaded list.
 * @param set - Output array that accepted entries are appended to.
 * @param includeAllSubDomain - When `true`, the entry is pushed with a leading `.`.
 * @param meta - Label of the data source, used for debug matching and log output.
 */
function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const processed = processLine(l);
  if (!processed) return;

  const lowered = processed.toLowerCase();
  const normalized = normalizeDomain(lowered);
  if (!normalized) return;

  if (normalized === lowered) {
    // Let the debug hook see every accepted (blacklisted) domain.
    onBlackFound(normalized, meta);

    if (includeAllSubDomain) {
      set.push(`.${lowered}`);
    } else {
      set.push(lowered);
    }
    return;
  }

  // The line is not a clean domain (normalization changed it): report and skip it.
  console.log(
    picocolors.red('[process domain list]'),
    picocolors.gray(`line: ${lowered}`),
    picocolors.gray(`domain: ${normalized}`),
    picocolors.gray(meta)
  );
}

/**
 * Downloads a plain domain list and turns it into domainset entries.
 *
 * @param span - Parent trace span; all work is recorded under a
 *   `process domainlist: <url>` child span.
 * @param domainListsUrl - Primary URL of the list. Also handed to the line
 *   callback as the `meta` label of the data source.
 * @param mirrors - Fallback URLs forwarded to `fetchAssetsWithout304`, or `null`.
 * @param includeAllSubDomain - Forwarded to `domainListLineCb`; when `true`,
 *   accepted entries are pushed with a leading `.`.
 * @returns The result of `span.traceChildAsync` for a callback that resolves to
 *   the collected entries.
 */
export function processDomainLists(
  span: Span,
  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
    // The fetch step gets its own 'download' span (same name processHosts uses)
    // instead of repeating the parent's name, so the trace tree stays readable.
    const text = await span.traceChildAsync('download', () => fetchAssetsWithout304(
      domainListsUrl,
      mirrors
    ));
    const domainSets: string[] = [];
    const filterRules = text.split('\n');

    span.traceChildSync('parse domain list', () => {
      for (let i = 0, len = filterRules.length; i < len; i++) {
        domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
      }
    });

    return domainSets;
  });
}
128 changes: 9 additions & 119 deletions Build/lib/parse-filter.ts → Build/lib/parse-filter/filters.ts
Original file line number Diff line number Diff line change
@@ -1,121 +1,12 @@
import { NetworkFilter } from '@ghostery/adblocker';
import { processLine } from './process-line';
import tldts from 'tldts-experimental';

import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import type { Span } from '../trace';
import type { Span } from '../../trace';
import { fetchAssetsWithout304 } from '../fetch-assets';
import { onBlackFound, onWhiteFound } from './shared';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source';
import { noop } from 'foxts/noop';
import { fetchAssetsWithout304 } from './fetch-assets';

// Set to true once any processed line contains DEBUG_DOMAIN_TO_FIND.
let foundDebugDomain = false;

/**
 * Builds the debug hook used for one kind of match.
 *
 * When DEBUG_DOMAIN_TO_FIND is not set, the hook is `noop`. Otherwise the hook
 * prints every line containing the debug domain (with the domain in bold) and
 * records the hit in `foundDebugDomain`.
 *
 * @param label - Tag printed between the source label and the line.
 */
function createDebugDomainReporter(label: string) {
  const needle = DEBUG_DOMAIN_TO_FIND;
  if (!needle) {
    return noop;
  }

  return (line: string, meta: string) => {
    if (!line.includes(needle)) return;

    console.warn(picocolors.red(meta), label, line.replaceAll(needle, picocolors.bold(needle)));
    foundDebugDomain = true;
  };
}

// Hook invoked for every domain that ends up on the black (reject) side.
const onBlackFound = createDebugDomainReporter('(black)');

// Hook invoked for every domain that ends up on the white (allow) side.
const onWhiteFound = createDebugDomainReporter('(white)');

/**
 * Handles a single line of a plain domain list.
 *
 * The line is run through `processLine` (a falsy result means "skip"), lower-cased
 * and normalized. Only lines that are already in normalized form are accepted;
 * anything that normalization would alter is reported on the console and dropped.
 *
 * @param l - Raw line taken from the downloaded list.
 * @param set - Output array that accepted entries are appended to.
 * @param includeAllSubDomain - When `true`, the entry is pushed with a leading `.`.
 * @param meta - Label of the data source, used for debug matching and log output.
 */
function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const processed = processLine(l);
  if (!processed) return;

  const lowered = processed.toLowerCase();
  const normalized = normalizeDomain(lowered);
  if (!normalized) return;

  if (normalized === lowered) {
    // Let the debug hook see every accepted (blacklisted) domain.
    onBlackFound(normalized, meta);

    if (includeAllSubDomain) {
      set.push(`.${lowered}`);
    } else {
      set.push(lowered);
    }
    return;
  }

  // The line is not a clean domain (normalization changed it): report and skip it.
  console.log(
    picocolors.red('[process domain list]'),
    picocolors.gray(`line: ${lowered}`),
    picocolors.gray(`domain: ${normalized}`),
    picocolors.gray(meta)
  );
}

/**
 * Downloads a plain domain list and turns it into domainset entries.
 *
 * @param span - Parent trace span; all work is recorded under a
 *   `process domainlist: <url>` child span.
 * @param domainListsUrl - Primary URL of the list. Also handed to the line
 *   callback as the `meta` label of the data source.
 * @param mirrors - Fallback URLs forwarded to `fetchAssetsWithout304`, or `null`.
 * @param includeAllSubDomain - Forwarded to `domainListLineCb`; when `true`,
 *   accepted entries are pushed with a leading `.`.
 * @returns The result of `span.traceChildAsync` for a callback that resolves to
 *   the collected entries.
 */
export function processDomainLists(
  span: Span,
  domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
    // The fetch step gets its own 'download' span (same name processHosts uses)
    // instead of repeating the parent's name, so the trace tree stays readable.
    const text = await span.traceChildAsync('download', () => fetchAssetsWithout304(
      domainListsUrl,
      mirrors
    ));
    const domainSets: string[] = [];
    const filterRules = text.split('\n');

    span.traceChildSync('parse domain list', () => {
      for (let i = 0, len = filterRules.length; i < len; i++) {
        domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
      }
    });

    return domainSets;
  });
}

/**
 * Parses one hosts-file line and, if it carries a usable domain, appends it to `set`.
 *
 * The line is first run through `processLine` (a falsy result means "skip").
 * The second whitespace-separated field is taken as the hostname and passed
 * through `normalizeDomain`; lines without a second field, or whose hostname
 * does not normalize, are ignored.
 *
 * @param l - Raw line taken from the downloaded hosts file.
 * @param set - Output array that accepted entries are appended to.
 * @param includeAllSubDomain - When `true`, the entry is pushed with a leading `.`.
 * @param meta - Label of the data source, forwarded to the debug hook.
 */
function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const line = processLine(l);
  if (!line) {
    return;
  }

  // Fields in a hosts file may be separated by any number of blanks and/or tabs.
  // Splitting on a single whitespace character would produce an empty second
  // token for aligned files ("0.0.0.0    example.com") and drop the entry.
  const _domain = line.trim().split(/\s+/)[1];
  if (!_domain) {
    return;
  }
  const domain = normalizeDomain(_domain);
  if (!domain) {
    return;
  }

  // Let the debug hook see every accepted (blacklisted) domain.
  onBlackFound(domain, meta);

  set.push(includeAllSubDomain ? `.${domain}` : domain);
}

/**
 * Downloads a hosts file and extracts its domains as domainset entries.
 *
 * @param span - Parent trace span; all work is recorded under a
 *   `process hosts: <url>` child span, with `download` and `parse hosts` steps.
 * @param hostsUrl - Primary URL of the hosts file. Also handed to the line
 *   callback as the `meta` label of the data source.
 * @param mirrors - Fallback URLs forwarded to `fetchAssetsWithout304`, or `null`.
 * @param includeAllSubDomain - Forwarded to `hostsLineCb` for every line.
 * @returns The result of `span.traceChildAsync` for a callback that resolves to
 *   the collected entries.
 */
export function processHosts(
  span: Span,
  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (childSpan) => {
    const downloadSpan = childSpan.traceChild('download');
    const body = await downloadSpan.traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));

    const hostsLines = body.split('\n');
    const collected: string[] = [];

    // Parsing is synchronous; it is wrapped in its own span so it shows up in the trace.
    childSpan.traceChild('parse hosts').traceSyncFn(() => {
      for (const hostsLine of hostsLines) {
        hostsLineCb(hostsLine, collected, includeAllSubDomain, hostsUrl);
      }
    });

    return collected;
  });
}
import { normalizeDomain } from '../normalize-domain';
import { looseTldtsOpt } from '../../constants/loose-tldts-opt';
import tldts from 'tldts-experimental';
import { NetworkFilter } from '@ghostery/adblocker';

const enum ParseType {
WhiteIncludeSubdomain = 0,
Expand All @@ -134,7 +25,7 @@ export async function processFilterRules(
filterRulesUrl: string,
fallbackUrls?: string[] | null,
allowThirdParty = false
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
): Promise<{ white: string[], black: string[] }> {
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => {
const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls);

Expand Down Expand Up @@ -226,8 +117,7 @@ export async function processFilterRules(

return {
white,
black,
foundDebugDomain
black
};
}

Expand Down
46 changes: 46 additions & 0 deletions Build/lib/parse-filter/hosts.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import type { Span } from '../../trace';
import { fetchAssetsWithout304 } from '../fetch-assets';
import { normalizeDomain } from '../normalize-domain';
import { processLine } from '../process-line';
import { onBlackFound } from './shared';

/**
 * Parses one hosts-file line and, if it carries a usable domain, appends it to `set`.
 *
 * The line is first run through `processLine` (a falsy result means "skip").
 * The second whitespace-separated field is taken as the hostname and passed
 * through `normalizeDomain`; lines without a second field, or whose hostname
 * does not normalize, are ignored.
 *
 * @param l - Raw line taken from the downloaded hosts file.
 * @param set - Output array that accepted entries are appended to.
 * @param includeAllSubDomain - When `true`, the entry is pushed with a leading `.`.
 * @param meta - Label of the data source, forwarded to the debug hook.
 */
function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
  const line = processLine(l);
  if (!line) {
    return;
  }

  // Fields in a hosts file may be separated by any number of blanks and/or tabs.
  // Splitting on a single whitespace character would produce an empty second
  // token for aligned files ("0.0.0.0    example.com") and drop the entry.
  const _domain = line.trim().split(/\s+/)[1];
  if (!_domain) {
    return;
  }
  const domain = normalizeDomain(_domain);
  if (!domain) {
    return;
  }

  // Let the debug hook see every accepted (blacklisted) domain.
  onBlackFound(domain, meta);

  set.push(includeAllSubDomain ? `.${domain}` : domain);
}

/**
 * Downloads a hosts file and extracts its domains as domainset entries.
 *
 * @param span - Parent trace span; all work is recorded under a
 *   `process hosts: <url>` child span, with `download` and `parse hosts` steps.
 * @param hostsUrl - Primary URL of the hosts file. Also handed to the line
 *   callback as the `meta` label of the data source.
 * @param mirrors - Fallback URLs forwarded to `fetchAssetsWithout304`, or `null`.
 * @param includeAllSubDomain - Forwarded to `hostsLineCb` for every line.
 * @returns The result of `span.traceChildAsync` for a callback that resolves to
 *   the collected entries.
 */
export function processHosts(
  span: Span,
  hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
  return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (childSpan) => {
    const downloadSpan = childSpan.traceChild('download');
    const body = await downloadSpan.traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));

    const hostsLines = body.split('\n');
    const collected: string[] = [];

    // Parsing is synchronous; it is wrapped in its own span so it shows up in the trace.
    childSpan.traceChild('parse hosts').traceSyncFn(() => {
      for (const hostsLine of hostsLines) {
        hostsLineCb(hostsLine, collected, includeAllSubDomain, hostsUrl);
      }
    });

    return collected;
  });
}
Loading

0 comments on commit 29410eb

Please sign in to comment.