Skip to content

Commit

Permalink
remove debug prints
Browse files Browse the repository at this point in the history
  • Loading branch information
andrefs committed Aug 17, 2023
1 parent a165df3 commit bd15616
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 82 deletions.
11 changes: 0 additions & 11 deletions src/manager/lib/Manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ export default class Manager {
}

async saveCrawl2(jobResult: CrawlResourceResult) {
console.log('XXXXXXXXXXXXXXXX saveCrawl2 0', { jobResult });
if (jobResult.status === 'not_ok') {
return await Resource.markAsCrawled(
jobResult.url,
Expand All @@ -143,31 +142,26 @@ export default class Manager {
predicate: t.predicate.value,
object: t.object.value,
}));
console.log('XXXXXXXXXXXXXXXX saveCrawl2 1', { triples });

if (triples.length) {
const source = (await Resource.findOne({
url: jobResult.url,
})) as IResource;
console.log('XXXXXXXXXXXXXXXX saveCrawl2 2', { source });

// add new resources
await Resource.addFromTriples(triples);

// add new triples
const res = await Triple.upsertMany(source, triples);
console.log('XXXXXXXXXXXXXXXX saveCrawl2 3', { res });

if (res.upsertedCount) {
const tids = Object.values(res.upsertedIds).map((i) => new ObjectId(i));
console.log('XXXXXXXXXXXXXXXX saveCrawl2 4', { tids });
// filter out reflexive triples and triples not referring to head resource
const tObjs = (await Triple.find({ _id: { $in: tids } })).filter(
(t) =>
t.subject !== t.object &&
(t.subject == source.url || t.object == source.url)
);
console.log('XXXXXXXXXXXXXXXX saveCrawl2 5', { tObjs });

// TODO convert to TripleDocument
const triplesByNode: { [url: string]: HydratedDocument<ITriple>[] } =
Expand All @@ -179,7 +173,6 @@ export default class Manager {
}
triplesByNode[source.url].push(t);
}
console.log('XXXXXXXXXXXXXXXX saveCrawl2 6', { triplesByNode });
await this.updatePaths(source.url, triplesByNode);
}
}
Expand Down Expand Up @@ -226,14 +219,11 @@ export default class Manager {
sourceUrl: string,
triplesByNode: { [url: string]: HydratedDocument<ITriple>[] }
) {
console.log('XXXXXXXXXXXXXXXX updatePaths 0', { sourceUrl, triplesByNode });
const pids = await Path.distinct('processId', {
'head.url': sourceUrl,
});
console.log('XXXXXXXXXXXXXXXX updatePaths 1', { pids });
for (const pid of pids) {
const proc = await Process.findOne({ pid });
console.log('XXXXXXXXXXXXXXXX updatePaths 2', { pid, proc });
await proc?.extendPaths(triplesByNode);
}
}
Expand Down Expand Up @@ -408,7 +398,6 @@ export default class Manager {
workerId: string,
workerAvail: JobCapacity
): AsyncIterable<Exclude<JobRequest, ResourceCrawlJobRequest>> {
console.log('XXXXXXXXXXXXXX assignJobs 0', { workerId, workerAvail });
if (this.jobs.beingSaved.count() > 2) {
log.warn(
`Too many jobs (${this.jobs.beingSaved.count()}) being saved, waiting for them to reduce before assigning new jobs`
Expand Down
2 changes: 0 additions & 2 deletions src/manager/lib/ProcessManager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -205,15 +205,13 @@ app.post('/processes/:pid/edit', async (req, res) => {
});

app.post('/processes', async (req, res) => {
console.log('XXXXXXXXXXXXXXXXx /processes', { body: req.body });
const seeds: string[] = req.body.seeds
.split(/\s*[\n,]\s*/)
.filter((s: string) => !s.match(/^\s*$/));
const uniqueSeeds = [...new Set(seeds)];

const pathHeads: Map<string, number> = new Map();
for (const s of seeds) {
console.log('XXXXXXXXXXXXXXXXx /processes', { s });
const domain = new URL(s).origin;
if (!pathHeads.get(domain)) {
pathHeads.set(domain, 0);
Expand Down
15 changes: 0 additions & 15 deletions src/models/Domain.ts
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,6 @@ schema.statics.lockForCrawl = async function (wId: string, origins: [string]) {
};

schema.statics.domainsToCheck = async function* (wId, limit) {
console.log('XXXXXXXXXXXXXX domainsToCheck 0', { wId, limit });
let domainsFound = 0;
let procSkip = 0;
let pathLimit = 20;
Expand Down Expand Up @@ -459,14 +458,12 @@ schema.statics.domainsToCheck = async function* (wId, limit) {
};

schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
console.log('XXXXXXXXXX domainsToCrawl2 0', { wId, domLimit, resLimit });
let domainsFound = 0;
let procSkip = 0;
let pathLimit = 20; // TODO get from config

PROCESS_LOOP: while (domainsFound < domLimit) {
const proc = await Process.getOneRunning(procSkip);
console.log('XXXXXXXXXX domainsToCrawl2 1', { proc });
if (!proc) {
return;
}
Expand All @@ -475,7 +472,6 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
let pathSkip = 0;
PATHS_LOOP: while (domainsFound < domLimit) {
const paths = await proc.getPaths(pathSkip, pathLimit);
console.log('XXXXXXXXXX domainsToCrawl2 2', { paths });

// if this process has no more available paths, skip it
if (!paths.length) {
Expand All @@ -499,7 +495,6 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
wId,
Array.from(origins).slice(0, 20)
);
console.log('XXXXXXXXXX domainsToCrawl2 3', { domains });

// these paths returned no available domains, skip them
if (!domains.length) {
Expand All @@ -514,24 +509,16 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
for (const d of domains) {
domainInfo[d.origin] = { domain: d, resources: [] };
}
console.log('XXXXXXXXXX domainsToCrawl2 4', { domainInfo });
for (const h of unvisHeads) {
if (h.domain in domainInfo) {
domainInfo[h.domain].resources!.push({ url: h.url });
}
}
console.log('XXXXXXXXXX domainsToCrawl2 5', { domainInfo });

for (const d in domainInfo) {
const dPathHeads = domainInfo[d].resources!;
const limit = Math.max(resLimit - dPathHeads.length, 0);

console.log('XXXXXXXXXX domainsToCrawl2 6', {
d,
resLimit,
dPathHeads,
limit,
});
const additionalResources = limit
? await Resource.find({
origin: d,
Expand All @@ -546,7 +533,6 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
0,
resLimit
);
console.log('XXXXXXXXXX domainsToCrawl2 7', { allResources });

await Resource.updateMany(
{ url: { $in: allResources.map((r) => r.url) } },
Expand All @@ -562,7 +548,6 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
resources: allResources,
};
domainsFound++;
console.log('XXXXXXXXXX domainsToCrawl2 8', { res, domainsFound });
yield res;
}
}
Expand Down
24 changes: 0 additions & 24 deletions src/models/Path.ts
Original file line number Diff line number Diff line change
Expand Up @@ -178,101 +178,77 @@ schema.method('copy', function () {
});

/**
 * Tries to extend this path using triples that are already stored.
 *
 * Two cases are handled:
 *
 * 1. The path carries an `outOfBounds` triple id. That triple and the owning
 *    process are loaded; if the triple is no longer out of bounds (per
 *    `this.tripleIsOutOfBounds`) and the process' white/black lists allow it,
 *    a copy of this path is created with the triple's other end as its new
 *    head, the process-triple association is upserted, the copy is persisted,
 *    this path is deleted, and extension continues recursively from the newly
 *    created path.
 * 2. Otherwise, every stored triple that includes the current head but whose
 *    nodes are not yet part of the path is handed to `this.extend`.
 *
 * @returns The result of `extend` (either on this path or, after recursion,
 *          on the path that replaced it).
 */
schema.method('extendWithExistingTriples', async function () {
  // if path has outOfBounds triple, try to extend with that
  if (this.outOfBounds) {
    const t = await Triple.findById(this.outOfBounds);
    const process = await Process.findOne({ pid: this.processId });

    if (
      !this.tripleIsOutOfBounds(t, process) &&
      process?.whiteBlackListsAllow(t!)
    ) {
      // the new head is whichever end of the triple is not the current head
      const newHeadUrl: string =
        t!.subject === this.head.url ? t!.object : t!.subject;
      const prop = t!.predicate;

      const np = this.copy();
      np.head.url = newHeadUrl;
      // predicates are kept unique, nodes are appended in traversal order
      np.predicates.elems = Array.from(
        new Set([...this.predicates.elems, prop])
      );
      np.nodes.elems.push(newHeadUrl);

      // empty update + upsert: only makes sure the association exists
      await ProcessTriple.findOneAndUpdate(
        { processId: this.processId, triple: t },
        {},
        { upsert: true }
      );

      // persist the extended path before removing the one it replaces
      const path = await Path.create(np);
      await Path.deleteOne({ _id: this._id });

      return path.extendWithExistingTriples();
    }
  }

  // find triples which include the head but don't belong to the path yet
  const triples: HydratedDocument<ITriple>[] = await Triple.find({
    nodes: { $eq: this.head.url, $nin: this.nodes.elems },
  });
  return this.extend(triples);
});

/**
 * Computes the path skeletons obtained by extending this path with the given
 * triples.
 *
 * Only triples accepted by `this.shouldCreateNewPath` and by the owning
 * process' white/black lists are considered (if the process cannot be found,
 * no triple passes the filter). For each accepted triple the new head is the
 * end of the triple that is not the current head. At most one skeleton is
 * produced per (predicate, new head) pair.
 *
 * - If the triple is out of bounds (per `this.tripleIsOutOfBounds`), the
 *   skeleton only records the triple id in `outOfBounds`; its predicates and
 *   nodes are left as copied.
 * - Otherwise the triple id is collected in `procTriples`, the predicate is
 *   added to the skeleton's (de-duplicated) predicates and the new head is
 *   appended to its nodes.
 *
 * The code visible here only builds skeletons; it does not create or delete
 * any `Path` document.
 *
 * @param triples Candidate triples to extend the path with.
 * @returns `newPaths`: the flat list of skeletons; `procTriples`: ids of the
 *          in-bounds triples that were used.
 */
schema.method('extend', async function (triples: HydratedDocument<ITriple>[]) {
  // skeletons indexed by predicate, then by new head url
  const newPaths: { [prop: string]: { [newHead: string]: PathSkeleton } } = {};
  const procTriples: Types.ObjectId[] = [];
  const process = await Process.findOne({ pid: this.processId });

  const candidates = triples.filter(
    (t) => this.shouldCreateNewPath(t) && process?.whiteBlackListsAllow(t)
  );

  for (const t of candidates) {
    // the new head is whichever end of the triple is not the current head
    const newHeadUrl: string =
      t.subject === this.head.url ? t.object : t.subject;
    const prop = t.predicate;

    newPaths[prop] = newPaths[prop] ?? {};
    // avoid extending the same path twice with the same triple
    if (newPaths[prop][newHeadUrl]) {
      continue;
    }

    const np = this.copy();
    np.head.url = newHeadUrl;

    if (this.tripleIsOutOfBounds(t, process)) {
      // remember the triple so the extension can be retried later
      np.outOfBounds = t._id;
    } else {
      procTriples.push(t._id);
      np.predicates.elems = Array.from(
        new Set([...this.predicates.elems, prop])
      );
      np.nodes.elems.push(newHeadUrl);
    }
    newPaths[prop][newHeadUrl] = np;
  }

  // flatten the two-level index into a plain list
  const nps: PathSkeleton[] = [];
  for (const byHead of Object.values(newPaths)) {
    nps.push(...Object.values(byHead));
  }

  return { newPaths: nps, procTriples };
});
Expand Down
37 changes: 7 additions & 30 deletions src/models/Process.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,43 +209,30 @@ schema.method('getPaths', async function (skip = 0, limit = 20) {
schema.method(
'extendPathsWithExistingTriples',
async function (paths: PathDocument[]) {
console.log('XXXXXXXXXXXXXX extendPathsWithExistingTriples 0', { paths });
for (const path of paths) {
console.log('XXXXXXXXXXXXXX extendPathsWithExistingTriples 1', { path });
const newPathObjs = [];
const toDelete = new Set();
const procTriples = new Set();

const { newPaths: nps, procTriples: pts } =
await path.extendWithExistingTriples();
console.log('XXXXXXXXXXXXXX extendPathsWithExistingTriples 2', {
nps,
pts,
});

// if new paths were created
if (nps.length) {
toDelete.add(path._id);
newPathObjs.push(...nps);
for (const pt of pts) {
procTriples.add(pt);
}
}
console.log('XXXXXXXXXXXXXX extendPathsWithExistingTriples 3', {
toDelete,
newPathObjs,
procTriples,
});

// create new paths
const newPaths = await Path.create(newPathObjs);
console.log('XXXXXXXXXXXXXX extendPathsWithExistingTriples 4', {
newPaths,
});
// create new paths
const newPaths = await Path.create(newPathObjs);

// delete old paths
await Path.deleteMany({ _id: { $in: Array.from(toDelete) } });
// delete old paths
await Path.deleteMany({ _id: { $in: Array.from(toDelete) } });

await this.extendPathsWithExistingTriples(newPaths);
await this.extendPathsWithExistingTriples(newPaths);
}
}
}
);
Expand All @@ -255,25 +242,21 @@ schema.method(
async function (triplesByNode: {
[url: string]: HydratedDocument<ITriple>[];
}) {
console.log('XXXXXXXXXXXXXX extendPaths 0', { triplesByNode });
const newHeads = Object.keys(triplesByNode);
console.log('XXXXXXXXXXXXXX extendPaths 1', { newHeads });
const paths = await Path.find({
processId: this.pid,
'head.url':
newHeads.length === 1
? newHeads[0]
: { $in: Object.keys(triplesByNode) },
});
console.log('XXXXXXXXXXXXXX extendPaths 2', { paths });

const pathsToDelete = new Set();
const newPathObjs = [];
const toDelete = new Set();
const procTriples = new Set();

for (const path of paths) {
console.log('XXXXXXXXXXXXXX extendPaths 3', { path });
const { newPaths: nps, procTriples: pts } = await path.extend(
triplesByNode[path.head.url]
);
Expand All @@ -285,11 +268,6 @@ schema.method(
}
}
}
console.log('XXXXXXXXXXXXXX extendPaths 4', {
newPathObjs,
toDelete,
procTriples,
});

// add proc-triple associations
await ProcessTriple.insertMany(
Expand All @@ -298,7 +276,6 @@ schema.method(

// create new paths
const newPaths = await Path.create(newPathObjs);
console.log('XXXXXXXXXXXXXX extendPaths 5', { newPaths });

// delete old paths
await Path.deleteMany({ _id: { $in: Array.from(toDelete) } });
Expand Down

0 comments on commit bd15616

Please sign in to comment.