diff --git a/src/common/lib/mongoose-validators.ts b/src/common/lib/mongoose-validators.ts index 3d32884..6cd9d45 100644 --- a/src/common/lib/mongoose-validators.ts +++ b/src/common/lib/mongoose-validators.ts @@ -5,4 +5,9 @@ const urlValidator = { message: (props: any) => `${props.value} is not a valid URL!`, }; -export { urlValidator }; +const urlListValidator = { + validator: (v: string[]) => v.every(isValid), + message: (props: any) => `One of ${props} is not a valid URL!`, +}; + +export { urlValidator, urlListValidator }; diff --git a/src/manager/lib/ProcessManager.ts b/src/manager/lib/ProcessManager.ts index e066392..594d338 100644 --- a/src/manager/lib/ProcessManager.ts +++ b/src/manager/lib/ProcessManager.ts @@ -1,4 +1,9 @@ -import { Process, ProcessClass, Resource } from '@derzis/models'; +import { + Process, + ProcessClass, + ProcessDocument, + Resource, +} from '@derzis/models'; import express from 'express'; import { create } from 'express-handlebars'; import path from 'path'; @@ -239,7 +244,7 @@ app.post('/processes', async (req, res) => { pathHeads.set(domain, pathHeads.get(domain)! + 1); } - const p = await Process.create({ + const p = { params: { maxPathLength: req.body.maxPathLength, maxPathProps: req.body.maxPathProps, @@ -256,9 +261,10 @@ app.post('/processes', async (req, res) => { }, seeds: uniqueSeeds, pathHeads, - }); + }; + const process = await Process.create(p); await Process.startNext(); - res.redirect(303, '/processes/' + p.pid); + res.redirect(303, '/processes/' + process.pid); }); app.get('/processes/last/triples', async (req, res) => { diff --git a/src/models/Domain.ts b/src/models/Domain.ts index 055913e..6ad773b 100644 --- a/src/models/Domain.ts +++ b/src/models/Domain.ts @@ -19,20 +19,6 @@ import { } from '@typegoose/typegoose'; const log = createLogger('Domain'); -@index({ - status: 1, - 'crawl.pathHeads': 1, - 'crawl.nextAllowed': -1, -}) -@index({ - 'crawl.nextAllowed': -1, -}) -@index({ - 'robots.status': 1, -}) -@index({ - jobId: 1, -}) class LastWarningClass { @prop() public errType!: @@ -92,6 +78,21 @@ class CrawlClass { @prop() public nextAllowed?: Date; } + +@index({ + status: 1, + 'crawl.pathHeads': 1, + 'crawl.nextAllowed': -1, +}) +@index({ + 'crawl.nextAllowed': -1, +}) +@index({ + 'robots.status': 1, +}) +@index({ + jobId: 1, +}) class DomainClass { @prop({ required: true, index: true, unique: true }) public origin!: string; diff --git a/src/models/Path.ts b/src/models/Path.ts index 64eb130..6a1701f 100644 --- a/src/models/Path.ts +++ b/src/models/Path.ts @@ -1,5 +1,5 @@ import { Types, Document } from 'mongoose'; -import { urlValidator } from '@derzis/common'; +import { urlListValidator, urlValidator } from '@derzis/common'; import { prop, index, @@ -17,37 +17,22 @@ import { TripleDocument, } from '@derzis/models'; -@pre('save', function () { - this.nodes.count = this.nodes.elems.length; - this.predicates.count = this.predicates.elems.length; - if (this.predicates.count) { - this.lastPredicate = this.predicates.elems[this.predicates.count - 1]; - } - const origin = new URL(this.head.url).origin; - this.head.domain = origin; -}) -@index({ processId: 1 }) -@index({ - 'seed.url': 1, - 'head.url': 1, - 'predicates.count': 1, -}) -@index({ - 'head.url': 1, - 'nodes.count': 1, -}) class ResourceCount { @prop({ default: 0 }) public count!: number; - @prop({ default: [], validate: urlValidator }) + @prop({ default: [], validate: urlListValidator }) public elems!: string[]; } +class SeedClass { + @prop({ required: true, validate: urlValidator }) + public url!: string; +} class HeadClass { @prop({ required: true, validate: urlValidator }) public url!: string; - @prop({ required: true }) + @prop() public domain!: string; } @@ -65,6 +50,26 @@ type PathSkeleton = Pick & nodes: Pick; }; +@pre('save', function () { + this.nodes.count = this.nodes.elems.length; + this.predicates.count = this.predicates.elems.length; + if (this.predicates.count) { + this.lastPredicate = this.predicates.elems[this.predicates.count - 1]; + } + + const origin = new URL(this.head.url).origin; + this.head.domain = origin; +}) +@index({ processId: 1 }) +@index({ + 'seed.url': 1, + 'head.url': 1, + 'predicates.count': 1, +}) +@index({ + 'head.url': 1, + 'nodes.count': 1, +}) class PathClass { _id!: Types.ObjectId; createdAt!: Date; @@ -73,8 +78,8 @@ class PathClass { @prop({ required: true }) public processId!: string; - @prop({ required: true, validate: urlValidator }) - public seed!: string; + @prop({ required: true }) + public seed!: SeedClass; @prop({ required: true }) public head!: HeadClass; diff --git a/src/models/Process.ts b/src/models/Process.ts index 66d167b..fb51f04 100644 --- a/src/models/Process.ts +++ b/src/models/Process.ts @@ -15,24 +15,6 @@ import { Severity, } from '@typegoose/typegoose'; -@index({ status: 1 }) -@index({ createdAt: 1 }) -@pre('save', async function () { - const today = new Date(new Date().setUTCHours(0, 0, 0, 0)); - - const count = await Process.countDocuments({ - createdAt: { $gt: today }, - }); - if (!this.pid) { - const date = today.toISOString().split('T')[0] + '-' + count; - const word = humanize(date); - this.pid = `${word}-${date}`; - } - if (!this.notification) { - this.notification = {}; - } - this.notification.ssePath = `/processes/${this.pid}/events`; -}) class NotificationClass { @prop() public email?: string; @@ -57,7 +39,26 @@ class ParamsClass { public blackList?: string[]; } +@index({ status: 1 }) +@index({ createdAt: 1 }) +@pre('save', async function () { + const today = new Date(new Date().setUTCHours(0, 0, 0, 0)); + const count = await Process.countDocuments({ + createdAt: { $gt: today }, + }); + if (!this.pid) { + const date = today.toISOString().split('T')[0] + '-' + count; + const word = humanize(date); + this.pid = `${word}-${date}`; + } + if (!this.notification) { + this.notification = {}; + } + const ssePath = `/processes/${this.pid}/events`; + this.notification.ssePath = ssePath; +}) class ProcessClass { + _id!: Types.ObjectId; createdAt!: Date; updatedAt!: Date; @@ -312,6 +313,7 @@ class ProcessClass { // TODO configurable number of simultaneous processes public static async startNext(this: ReturnModelType) { const runningProcs = await this.countDocuments({ status: 'running' }); + if (!runningProcs) { const process = await this.findOneAndUpdate( { status: 'queued' }, diff --git a/src/models/Resource.ts b/src/models/Resource.ts index 8411d03..18c5b72 100644 --- a/src/models/Resource.ts +++ b/src/models/Resource.ts @@ -13,8 +13,6 @@ import { Severity, } from '@typegoose/typegoose'; -@index({ url: 1, status: 1 }) -@index({ domain: 1, status: 1 }) class CrawlId { @prop() public domainTs!: Date; @@ -23,6 +21,8 @@ class CrawlId { public counter!: number; } +@index({ url: 1, status: 1 }) +@index({ domain: 1, status: 1 }) class ResourceClass { createdAt!: Date; updatedAt!: Date;