diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 677585e93..f837c3dcc 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -2,7 +2,9 @@ import path from 'node:path'; import process from 'node:process'; -import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter'; +import { processHosts } from './lib/parse-filter/hosts'; +import { processDomainLists } from './lib/parse-filter/domainlists'; +import { processFilterRules } from './lib/parse-filter/filters'; import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source'; import { compareAndWriteFile } from './lib/create-file'; @@ -18,6 +20,7 @@ import { addArrayElementsToSet } from 'foxts/add-array-elements-to-set'; import { appendArrayInPlace } from './lib/append-array-in-place'; import { OUTPUT_INTERNAL_DIR, SOURCE_DIR } from './constants/dir'; import { DomainsetOutput } from './lib/create-file'; +import { foundDebugDomain } from './lib/parse-filter/shared'; const readLocalRejectDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')); const readLocalRejectExtraDomainsetPromise = readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka_extra.conf')); @@ -63,65 +66,49 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); // Parse from AdGuard Filters - const shouldStop = await span + await span .traceChild('download and process hosts / adblock filter rules') - .traceAsyncFn(async (childSpan) => { - // eslint-disable-next-line sukka/no-single-return -- not single return - let shouldStop = false; - await Promise.all([ - // Parse from remote hosts & domain lists - HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)), - HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)), - - DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)), - DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)), - - ADGUARD_FILTERS.map( - entry => processFilterRules(childSpan, ...entry) - .then(({ white, black, foundDebugDomain }) => { - if (foundDebugDomain) { - // eslint-disable-next-line sukka/no-single-return -- not single return - shouldStop = true; - // we should not break here, as we want to see full matches from all data source - } - addArrayElementsToSet(filterRuleWhitelistDomainSets, white); - appendArrayToRejectOutput(black); - }) - ), - ADGUARD_FILTERS_EXTRA.map( - entry => processFilterRules(childSpan, ...entry) - .then(({ white, black, foundDebugDomain }) => { - if (foundDebugDomain) { - // eslint-disable-next-line sukka/no-single-return -- not single return - shouldStop = true; - // we should not break here, as we want to see full matches from all data source - } - addArrayElementsToSet(filterRuleWhitelistDomainSets, white); - appendArrayToRejectExtraOutput(black); - }) - ), - ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => { - addArrayElementsToSet(filterRuleWhitelistDomainSets, white); - addArrayElementsToSet(filterRuleWhitelistDomainSets, black); - })), - getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput), - readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput), - readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput), - readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput), - // Dedupe domainSets - // span.traceChildAsync('collect black keywords/suffixes', async () => - /** + .traceAsyncFn((childSpan) => Promise.all([ + // Parse from remote hosts & domain lists + HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectOutput)), + HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToRejectExtraOutput)), + + DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectOutput)), + DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToRejectExtraOutput)), + + ADGUARD_FILTERS.map( + entry => processFilterRules(childSpan, ...entry) + .then(({ white, black }) => { + addArrayElementsToSet(filterRuleWhitelistDomainSets, white); + appendArrayToRejectOutput(black); + }) + ), + ADGUARD_FILTERS_EXTRA.map( + entry => processFilterRules(childSpan, ...entry) + .then(({ white, black }) => { + addArrayElementsToSet(filterRuleWhitelistDomainSets, white); + appendArrayToRejectExtraOutput(black); + }) + ), + ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => { + addArrayElementsToSet(filterRuleWhitelistDomainSets, white); + addArrayElementsToSet(filterRuleWhitelistDomainSets, black); + })), + getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput), + readLocalRejectDomainsetPromise.then(appendArrayToRejectOutput), + readLocalRejectDomainsetPromise.then(appendArrayToRejectExtraOutput), + readLocalRejectExtraDomainsetPromise.then(appendArrayToRejectExtraOutput), + // Dedupe domainSets + // span.traceChildAsync('collect black keywords/suffixes', async () => + /** * Collect DOMAIN, DOMAIN-SUFFIX, and DOMAIN-KEYWORD from non_ip/reject.conf for deduplication * DOMAIN-WILDCARD is not really useful for deduplication, it is only included in AdGuardHome output */ - rejectOutput.addFromRuleset(readLocalRejectRulesetPromise), - rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise) - ].flat()); - // eslint-disable-next-line sukka/no-single-return -- not single return - return shouldStop; - }); + rejectOutput.addFromRuleset(readLocalRejectRulesetPromise), + rejectExtraOutput.addFromRuleset(readLocalRejectRulesetPromise) + ].flat())); - if (shouldStop) { + if (foundDebugDomain.value) { process.exit(1); } diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 989dec96b..f25b37643 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,4 +1,6 @@ -import { processDomainLists, processHosts } from './parse-filter'; +import { processHosts } from './parse-filter/hosts'; +import { processDomainLists } from './parse-filter/domainlists'; + import * as tldts from 'tldts-experimental'; import { dummySpan, printTraceResult } from '../trace'; diff --git a/Build/lib/parse-filter.test.ts b/Build/lib/parse-filter.test.ts index 7314cbb6f..043571bb8 100644 --- a/Build/lib/parse-filter.test.ts +++ b/Build/lib/parse-filter.test.ts @@ -1,7 +1,7 @@ import { describe, it } from 'mocha'; -import { parse, processFilterRules } from './parse-filter'; -import type { ParseType } from './parse-filter'; +import { parse, processFilterRules } from './parse-filter/filters'; +import type { ParseType } from './parse-filter/filters'; import { createCacheKey } from './cache-filesystem'; import { createSpan } from '../trace'; @@ -20,8 +20,7 @@ describe.skip('processFilterRules', () => { console.log(processFilterRules( createSpan('noop'), cacheKey('https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt'), - [], - 7_200_000 + [] )); }); }); diff --git a/Build/lib/parse-filter/domainlists.ts b/Build/lib/parse-filter/domainlists.ts new file mode 100644 index 000000000..9eca7ed01 --- /dev/null +++ b/Build/lib/parse-filter/domainlists.ts @@ -0,0 +1,51 @@ +import picocolors from 'picocolors'; +import { normalizeDomain } from '../normalize-domain'; +import { processLine } from '../process-line'; +import { onBlackFound } from './shared'; +import { fetchAssetsWithout304 } from '../fetch-assets'; +import type { Span } from '../../trace'; + +function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { + let line = processLine(l); + if (!line) return; + line = line.toLowerCase(); + + const domain = normalizeDomain(line); + if (!domain) return; + if (domain !== line) { + console.log( + picocolors.red('[process domain list]'), + picocolors.gray(`line: ${line}`), + picocolors.gray(`domain: ${domain}`), + picocolors.gray(meta) + ); + + return; + } + + onBlackFound(domain, meta); + + set.push(includeAllSubDomain ? `.${line}` : line); +} + +export function processDomainLists( + span: Span, + domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false +) { + return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { + const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304( + domainListsUrl, + mirrors + )); + const domainSets: string[] = []; + const filterRules = text.split('\n'); + + span.traceChildSync('parse domain list', () => { + for (let i = 0, len = filterRules.length; i < len; i++) { + domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); + } + }); + + return domainSets; + }); +} diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter/filters.ts similarity index 78% rename from Build/lib/parse-filter.ts rename to Build/lib/parse-filter/filters.ts index 78006bb76..8731a3b91 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter/filters.ts @@ -1,121 +1,12 @@ -import { NetworkFilter } from '@ghostery/adblocker'; -import { processLine } from './process-line'; -import tldts from 'tldts-experimental'; - import picocolors from 'picocolors'; -import { normalizeDomain } from './normalize-domain'; -import type { Span } from '../trace'; +import type { Span } from '../../trace'; +import { fetchAssetsWithout304 } from '../fetch-assets'; +import { onBlackFound, onWhiteFound } from './shared'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; -import { looseTldtsOpt } from '../constants/loose-tldts-opt'; -import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source'; -import { noop } from 'foxts/noop'; -import { fetchAssetsWithout304 } from './fetch-assets'; - -let foundDebugDomain = false; - -const onBlackFound = DEBUG_DOMAIN_TO_FIND - ? (line: string, meta: string) => { - if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { - console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); - foundDebugDomain = true; - } - } - : noop; - -const onWhiteFound = DEBUG_DOMAIN_TO_FIND - ? (line: string, meta: string) => { - if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { - console.warn(picocolors.red(meta), '(white)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); - foundDebugDomain = true; - } - } - : noop; - -function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { - let line = processLine(l); - if (!line) return; - line = line.toLowerCase(); - - const domain = normalizeDomain(line); - if (!domain) return; - if (domain !== line) { - console.log( - picocolors.red('[process domain list]'), - picocolors.gray(`line: ${line}`), - picocolors.gray(`domain: ${domain}`), - picocolors.gray(meta) - ); - - return; - } - - onBlackFound(domain, meta); - - set.push(includeAllSubDomain ? `.${line}` : line); -} - -export function processDomainLists( - span: Span, - domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false -) { - return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { - const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304( - domainListsUrl, - mirrors - )); - const domainSets: string[] = []; - const filterRules = text.split('\n'); - - span.traceChildSync('parse domain list', () => { - for (let i = 0, len = filterRules.length; i < len; i++) { - domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); - } - }); - - return domainSets; - }); -} - -function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { - const line = processLine(l); - if (!line) { - return; - } - - const _domain = line.split(/\s/)[1]?.trim(); - if (!_domain) { - return; - } - const domain = normalizeDomain(_domain); - if (!domain) { - return; - } - - onBlackFound(domain, meta); - - set.push(includeAllSubDomain ? `.${domain}` : domain); -} - -export function processHosts( - span: Span, - hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false -) { - return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => { - const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors)); - - const domainSets: string[] = []; - - const filterRules = text.split('\n'); - - span.traceChild('parse hosts').traceSyncFn(() => { - for (let i = 0, len = filterRules.length; i < len; i++) { - hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl); - } - }); - - return domainSets; - }); -} +import { normalizeDomain } from '../normalize-domain'; +import { looseTldtsOpt } from '../../constants/loose-tldts-opt'; +import tldts from 'tldts-experimental'; +import { NetworkFilter } from '@ghostery/adblocker'; const enum ParseType { WhiteIncludeSubdomain = 0, @@ -134,7 +25,7 @@ export async function processFilterRules( filterRulesUrl: string, fallbackUrls?: string[] | null, allowThirdParty = false -): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> { +): Promise<{ white: string[], black: string[] }> { const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => { const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls); @@ -226,8 +117,7 @@ export async function processFilterRules( return { white, - black, - foundDebugDomain + black }; } diff --git a/Build/lib/parse-filter/hosts.ts b/Build/lib/parse-filter/hosts.ts new file mode 100644 index 000000000..8f0857b92 --- /dev/null +++ b/Build/lib/parse-filter/hosts.ts @@ -0,0 +1,46 @@ +import type { Span } from '../../trace'; +import { fetchAssetsWithout304 } from '../fetch-assets'; +import { normalizeDomain } from '../normalize-domain'; +import { processLine } from '../process-line'; +import { onBlackFound } from './shared'; + +function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { + const line = processLine(l); + if (!line) { + return; + } + + const _domain = line.split(/\s/)[1]?.trim(); + if (!_domain) { + return; + } + const domain = normalizeDomain(_domain); + if (!domain) { + return; + } + + onBlackFound(domain, meta); + + set.push(includeAllSubDomain ? `.${domain}` : domain); +} + +export function processHosts( + span: Span, + hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false +) { + return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => { + const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors)); + + const domainSets: string[] = []; + + const filterRules = text.split('\n'); + + span.traceChild('parse hosts').traceSyncFn(() => { + for (let i = 0, len = filterRules.length; i < len; i++) { + hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl); + } + }); + + return domainSets; + }); +} diff --git a/Build/lib/parse-filter/shared.ts b/Build/lib/parse-filter/shared.ts new file mode 100644 index 000000000..435cd9212 --- /dev/null +++ b/Build/lib/parse-filter/shared.ts @@ -0,0 +1,23 @@ +import picocolors from 'picocolors'; +import { DEBUG_DOMAIN_TO_FIND } from '../../constants/reject-data-source'; +import { noop } from 'foxts/noop'; + +export const foundDebugDomain = { value: false }; + +export const onBlackFound = DEBUG_DOMAIN_TO_FIND + ? (line: string, meta: string) => { + if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { + console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); + foundDebugDomain.value = true; + } + } + : noop; + +export const onWhiteFound = DEBUG_DOMAIN_TO_FIND + ? (line: string, meta: string) => { + if (line.includes(DEBUG_DOMAIN_TO_FIND!)) { + console.warn(picocolors.red(meta), '(white)', line.replaceAll(DEBUG_DOMAIN_TO_FIND!, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); + foundDebugDomain.value = true; + } + } + : noop;