diff --git a/src/common/lib/mongoose-types.ts b/src/common/lib/mongoose-types.ts index 97980e8..3a624bd 100644 --- a/src/common/lib/mongoose-types.ts +++ b/src/common/lib/mongoose-types.ts @@ -12,6 +12,10 @@ class UrlType extends mongoose.SchemaType { } return url; } + + public get(fn: Function) { + return fn(this); + } } // @ts-ignore diff --git a/src/models/Path.ts b/src/models/Path.ts index 1fdf3aa..575cacf 100644 --- a/src/models/Path.ts +++ b/src/models/Path.ts @@ -1,274 +1,215 @@ -import { HydratedDocument, Model, model, Schema, Types } from 'mongoose'; -import { urlType } from '@derzis/common'; -import config from '@derzis/config'; +import { Types } from 'mongoose'; +import { UrlType } from '@derzis/common'; +import { prop, index, pre, getModelForClass } from '@typegoose/typegoose'; import { - Resource, - Domain, - SimpleTriple, - ITriple, + TripleClass, + ProcessClass, Triple, Process, - IProcessDocument, ProcessTriple, } from '@derzis/models'; -export interface PathSkeleton { - seed: { url: string }; - head: { url: string }; - predicates: { elems: string[] }; - nodes: { elems: string[] }; - outOfBounds?: Schema.Types.ObjectId; - processId: string; -} +@pre('save', function () { + this.nodes.count = this.nodes.elems.length; + this.predicates.count = this.predicates.elems.length; + if (this.predicates.count) { + this.lastPredicate = this.predicates.elems[this.predicates.count - 1]; + } + const origin = new URL(this.head.url).origin; + this.head.domain = origin; +}) +@index({ processId: 1 }) +@index({ + 'seed.url': 1, + 'head.url': 1, + 'predicates.count': 1, +}) +@index({ + 'head.url': 1, + 'nodes.count': 1, +}) +class ResourceCount { + @prop({ default: 0 }) + public count!: number; -export interface IPath { - processId: string; - seed: { - url: string; - }; - predicates: { - elems: string[]; - count: number; - }; - lastPredicate: string; - nodes: { - elems: string[]; - count: number; - }; - outOfBounds?: Schema.Types.ObjectId; - head: { - url: string; - domain: string; - }; + @prop({ default: [] }) + public elems!: UrlType[]; } +class HeadClass { + @prop({ required: true }) + public url!: UrlType; -export interface IPathMethods { - markDisabled(): Promise; - markFinished(): Promise; - shouldCreateNewPath(triple: SimpleTriple): boolean; - tripleIsOutOfBounds(triple: SimpleTriple, process: IProcessDocument): boolean; - extendWithExistingTriples(): Promise<{ - newPaths: PathDocument[]; - procTriples: string[]; - }>; - extend( - triples: HydratedDocument[] - ): Promise<{ newPaths: PathDocument[]; procTriples: string[] }>; + @prop({ required: true }) + public domain!: string; } -export type PathDocument = HydratedDocument; -export interface PathModel extends Model {} +type RecursivePartial = { + [P in keyof T]?: T[P] extends (infer U)[] + ? RecursivePartial[] + : T[P] extends object | undefined + ? RecursivePartial + : T[P]; +}; -const schema = new Schema( - { - processId: { - type: String, - required: true, - }, - seed: { - url: { ...urlType, required: true }, - }, - predicates: { - elems: [urlType], - count: Number, - }, - lastPredicate: urlType, - nodes: { - elems: [urlType], - count: Number, - }, - outOfBounds: { - type: Schema.Types.ObjectId, - ref: 'Triple', - }, - head: { - url: { ...urlType, required: true }, - domain: urlType, - }, - }, - { timestamps: true } -); +class PathClass { + @prop({ required: true }) + public processId!: string; -schema.index({ processId: 1 }); + @prop({ required: true }) + public seed!: UrlType; -schema.index({ - 'seed.url': 1, - 'head.url': 1, - 'predicates.count': 1, -}); + @prop({ required: true }) + public head!: HeadClass; -schema.index({ - 'head.url': 1, - 'nodes.count': 1, -}); + @prop({ default: [] }) + public predicates!: ResourceCount; -schema.pre('save', async function () { - this.nodes.count = this.nodes.elems.length; - this.predicates.count = this.predicates.elems.length; - if (this.predicates.count) { - this.lastPredicate = this.predicates.elems[this.predicates.count - 1]; - } - const origin = new URL(this.head.url).origin; - this.head.domain = origin; -}); + @prop() + public lastPredicate?: UrlType; -schema.method('markDisabled', async function () { - this.status = 'disabled'; - await this.save(); - await Resource.rmPath(this); - return; -}); + @prop({ default: [] }) + public nodes!: ResourceCount; -schema.method('markFinished', async function () { - this.status = 'finished'; - await this.save(); - await Resource.rmPath(this); - return; -}); + @prop({ ref: 'Triple' }) + public outOfBounds?: Types.ObjectId; -schema.method('shouldCreateNewPath', function (t: ITriple) { - //console.log('XXXXXXXXXXXXXX shouldCreateNewPath', { t, _this: this }); - // triple is reflexive - if (t.subject === t.object) { - return false; - } + public shouldCreateNewPath(this: PathClass, t: TripleClass): boolean { + //console.log('XXXXXXXXXXXXXX shouldCreateNewPath', { t, _this: this }); + // triple is reflexive + if (t.subject === t.object) { + return false; + } - // head appears in triple predicate - if (t.predicate === this.head.url) { - return false; - } + // head appears in triple predicate + if (t.predicate === this.head.url) { + return false; + } - const newHeadUrl: string = t.subject === this.head.url ? t.object : t.subject; + const newHeadUrl: string = + t.subject === this.head.url ? t.object : t.subject; - // path already has outOfBounds triple - if (!!this.outOfBounds) { - return false; - } + // path already has outOfBounds triple + if (!!this.outOfBounds) { + return false; + } - // new head already contained in path - if (this.nodes.elems.includes(newHeadUrl)) { - return false; - } - //console.log('XXXXXXXXXXXXXX shouldCreateNewPath TRUE'); + // new head already contained in path + if (this.nodes.elems.includes(newHeadUrl)) { + return false; + } + //console.log('XXXXXXXXXXXXXX shouldCreateNewPath TRUE'); - return true; -}); + return true; + } -schema.method( - 'tripleIsOutOfBounds', - function (t: ITriple, process: IProcessDocument) { - const pathPreds: Set = new Set(this.predicates.elems); + public tripleIsOutOfBounds( + t: TripleClass, + process: ProcessDocumentClass + ): boolean { + const pathPreds: Set = new Set(this.predicates.elems); return ( this.nodes.count >= process.params.maxPathLength || (!pathPreds.has(t.predicate) && this.predicates.count >= process.params.maxPathProps) ); } -); - -schema.method('copy', function () { - const copy: PathSkeleton = { - processId: this.processId, - seed: this.seed, - head: this.head, - predicates: { elems: [...this.predicates.elems] }, - nodes: { elems: [...this.nodes.elems] }, - }; - return copy; -}); -schema.method('extendWithExistingTriples', async function () { - // if path has outOfBounds triple, try to extend with that - if (!!this.outOfBounds) { - const t = await Triple.findById(this.outOfBounds); - const process = await Process.findOne({ pid: this.processId }); - if ( - !this.tripleIsOutOfBounds(t, process) && - process?.whiteBlackListsAllow(t!) - ) { - const newHeadUrl: string = - t!.subject === this.head.url ? t!.object : t!.subject; - const prop = t!.predicate; - - const np = this.copy(); - np.head.url = newHeadUrl; - np.predicates.elems = Array.from( - new Set([...this.predicates.elems, prop]) - ); - np.nodes.elems.push(newHeadUrl); + public extendWithExistingTriples(): Promise<{ + newPaths: PathClass[]; + procTriples: string[]; + }> { + // if path has outOfBounds triple, try to extend with that + if (!!this.outOfBounds) { + const t: TripleClass | null = await Triple.findById(this.outOfBounds); + const process = await Process.findOne({ pid: this.processId }); + if ( + t && + !this.tripleIsOutOfBounds(t, process) && + process?.whiteBlackListsAllow(t!) + ) { + const newHeadUrl: string = + t!.subject === this.head.url ? t!.object : t!.subject; + const prop = t!.predicate; + + const np = this.copy(); + np.head.url = newHeadUrl; + np.predicates.elems = Array.from( + new Set([...this.predicates.elems, prop]) + ); + np.nodes.elems.push(newHeadUrl); - await ProcessTriple.findOneAndUpdate( - { processId: this.processId, triple: t }, - {}, - { upsert: true } - ); - const path = await Path.create(np); - await Path.deleteOne({ _id: this._id }); + await ProcessTriple.findOneAndUpdate( + { processId: this.processId, triple: t }, + {}, + { upsert: true } + ); + const path = await Path.create(np); + await Path.deleteOne({ _id: this._id }); - return path.extendWithExistingTriples(); + return path.extendWithExistingTriples(); + } } + // find triples which include the head but dont belong to the path yet + let triples: TripleClass[] = await Triple.find({ + nodes: { $eq: this.head.url, $nin: this.nodes.elems }, + }); + return this.extend(triples); } - // find triples which include the head but dont belong to the path yet - let triples: HydratedDocument[] = await Triple.find({ - nodes: { $eq: this.head.url, $nin: this.nodes.elems }, - }); - return this.extend(triples); -}); -schema.method('extend', async function (triples: HydratedDocument[]) { - let newPaths: { [prop: string]: { [newHead: string]: PathSkeleton } } = {}; - let procTriples: Types.ObjectId[] = []; - const process = await Process.findOne({ pid: this.processId }); - - for (const t of triples.filter( - (t) => this.shouldCreateNewPath(t) && process?.whiteBlackListsAllow(t) - )) { - const newHeadUrl: string = - t.subject === this.head.url ? t.object : t.subject; - const prop = t.predicate; + public copy(): RecursivePartial { + const copy = { + processId: this.processId, + seed: this.seed, + head: this.head, + predicates: { elems: [...this.predicates.elems] }, + nodes: { elems: [...this.nodes.elems] }, + }; + return copy; + } - newPaths[prop] = newPaths[prop] || {}; - // avoid extending the same path twice with the same triple - if (!newPaths[prop][newHeadUrl]) { - const np = this.copy(); - np.head.url = newHeadUrl; + public extend( + triples: TripleClass[] + ): Promise<{ newPaths: PathClass[]; procTriples: string[] }> { + let newPaths: { [prop: string]: { [newHead: string]: PathClass } } = {}; + let procTriples: Types.ObjectId[] = []; + const process = await Process.findOne({ pid: this.processId }); - if (this.tripleIsOutOfBounds(t, process)) { - np.outOfBounds = t._id; - } else { - procTriples.push(t._id); - np.predicates.elems = Array.from( - new Set([...this.predicates.elems, prop]) - ); - np.nodes.elems.push(newHeadUrl); + for (const t of triples.filter( + (t) => this.shouldCreateNewPath(t) && process?.whiteBlackListsAllow(t) + )) { + const newHeadUrl: string = + t.subject === this.head.url ? t.object : t.subject; + const prop = t.predicate; + + newPaths[prop] = newPaths[prop] || {}; + // avoid extending the same path twice with the same triple + if (!newPaths[prop][newHeadUrl]) { + const np = this.copy(); + np.head.url = newHeadUrl; + + if (this.tripleIsOutOfBounds(t, process)) { + np.outOfBounds = t._id; + } else { + procTriples.push(t._id); + np.predicates.elems = Array.from( + new Set([...this.predicates.elems, prop]) + ); + np.nodes.elems.push(newHeadUrl); + } + newPaths[prop][newHeadUrl] = np; } - newPaths[prop][newHeadUrl] = np; } + const nps: RecursivePartial[] = []; + Object.values(newPaths).forEach((x) => + Object.values(x).forEach((y) => nps.push(y)) + ); + + return { newPaths: nps, procTriples }; } - const nps: PathSkeleton[] = []; - Object.values(newPaths).forEach((x) => - Object.values(x).forEach((y) => nps.push(y)) - ); +} - return { newPaths: nps, procTriples }; +const Path = getModelForClass(PathClass, { + schemaOptions: { timestamps: true }, }); -//schema.post('save', async function(doc){ -// const resUpdate = doc.status === 'active' ? -// {'$addToSet': {paths: doc._id}} : -// {'$pull': {paths: doc._id}}; -// -// const domUpdate = doc.status === 'active' ? -// {'$inc': {'crawl.pathHeads': 1}} : -// {'$inc': {'crawl.pathHeads': -1}}; -// -// const Resource = require('./Resource'); -// await Resource.updateOne({url: doc.head.url}, resUpdate); -// const r = await Resource.findOne({url: doc.head.url}); -// r.headCount = r.paths?.length || 0; -// await r.save(); -// -// await require('./Domain').updateOne({origin: new URL(doc.head.url).origin}, domUpdate); -//}); - -export const Path = model('Path', schema); +export { Path, PathClass }; diff --git a/src/models/Triple.ts b/src/models/Triple.ts index abc0850..ca3fd5f 100644 --- a/src/models/Triple.ts +++ b/src/models/Triple.ts @@ -10,6 +10,7 @@ import { getModelForClass, ReturnModelType, } from '@typegoose/typegoose'; +import { Document } from 'cheerio'; @ModelOptions({ schemaOptions: { @@ -64,5 +65,5 @@ class TripleClass { const Triple = getModelForClass(TripleClass, { schemaOptions: { timestamps: true }, }); - -export { Triple, TripleClass }; +type TripleDocument = TripleClass & Document; +export { Triple, TripleClass, TripleDocument };