Skip to content

Commit

Permalink
Merge pull request evt-project#119 from evt-project/feature/file-desc…
Browse files Browse the repository at this point in the history
…ription-parser

Feature/file description parser
  • Loading branch information
szenzaro authored Feb 1, 2021
2 parents da86999 + 4097ba2 commit 455b13a
Show file tree
Hide file tree
Showing 6 changed files with 289 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updated to Angular 9

### Added
- File description data extraction
- Critical text pages division
- Xi:include support for edition text
- Manuscript description data extraction
Expand Down
77 changes: 77 additions & 0 deletions src/app/models/evt-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -772,3 +772,80 @@ export class Dim extends GenericElement {
max?: number;
gEl?: G[];
}

/**
 * Data model returned by `FileDescParser` for a `fileDesc` element.
 *
 * `titleStmt`, `publicationStmt` and `sourceDesc` are declared as required,
 * while `editionStmt`, `extent`, `seriesStmt` and `notesStmt` are optional.
 */
export class FileDesc extends GenericElement {
titleStmt: TitleStmt;
publicationStmt: PublicationStmt;
sourceDesc: SourceDesc;
editionStmt?: EditionStmt;
extent?: Extent;
seriesStmt?: SeriesStmt;
notesStmt?: NotesStmt;
}

/**
 * Data model returned by `TitleStmtParser` for a `titleStmt` element.
 *
 * Every field except `respStmts` currently holds generically parsed elements;
 * the TODO markers track where a dedicated type is still missing.
 */
export class TitleStmt extends GenericElement {
titles: Array<ParseResult<GenericElement>>; // TODO: Add specific type when title is handled
subtitles: Array<ParseResult<GenericElement>>; // TODO: Add specific type when subtitle is handled
authors: Array<ParseResult<GenericElement>>; // TODO: Add specific type when author is handled
principals: Array<ParseResult<GenericElement>>; // TODO: Add specific type when principal is handled
respStmts: RespStmt[];
editors: Array<ParseResult<GenericElement>>; // TODO: Add specific type when editor is handled
sponsors: Array<ParseResult<GenericElement>>; // TODO: Add specific type when sponsor is handled
funders: Array<ParseResult<GenericElement>>; // TODO: Add specific type when funder is handled
}

/**
 * Data model returned by `RespStmtParser` for a `respStmt` element.
 *
 * NOTE(review): `people` is typed as named-entity references, but the parser
 * falls back to the generic parse function for plain `name` children, so the
 * array may also contain generically parsed elements - confirm the intended type.
 */
export class RespStmt extends GenericElement {
responsibility: Resp;
people: Array<ParseResult<NamedEntityRef>>;
notes: Note[];
}

/**
 * Data model returned by `RespParser` for a `resp` element.
 *
 * `normalizedResp` is derived from the element's `ref` attribute and `date`
 * from its `when` attribute; both are empty strings when the attribute is absent.
 */
export class Resp extends GenericElement {
normalizedResp: string;
date: string;
}

/**
 * Data model returned by `EditionStmtParser` for an `editionStmt` element.
 *
 * `structuredData` is set by the parser to true when at least one child
 * element is something other than `p`.
 */
export class EditionStmt extends GenericElement {
structuredData: boolean;
edition: Array<ParseResult<GenericElement>>; // TODO: Add specific type when edition is handled
respStmt: RespStmt[];
}

/**
 * Data model returned by `PublicationStmtParser` for a `publicationStmt` element.
 *
 * `structuredData` is set by the parser to true when at least one child
 * element is something other than `p`. All other fields hold generically
 * parsed elements selected by the tag name matching the field name.
 */
export class PublicationStmt extends GenericElement {
structuredData: boolean;
publisher: Array<ParseResult<GenericElement>>; // TODO: Add specific type when publisher is handled
distributor: Array<ParseResult<GenericElement>>; // TODO: Add specific type when distributor is handled
authority: Array<ParseResult<GenericElement>>; // TODO: Add specific type when authority is handled
pubPlace: Array<ParseResult<GenericElement>>; // TODO: Add specific type when pubPlace is handled
address: Array<ParseResult<GenericElement>>; // TODO: Add specific type when address is handled
idno: Array<ParseResult<GenericElement>>; // TODO: Add specific type when idno is handled
availability: Array<ParseResult<GenericElement>>; // TODO: Add specific type when availability is handled
date: Array<ParseResult<GenericElement>>; // TODO: Add specific type when date is handled
licence: Array<ParseResult<GenericElement>>; // TODO: Add specific type when licence is handled
}

/**
 * Data model returned by `SeriesStmtParser` for a `seriesStmt` element.
 *
 * `structuredData` is set by the parser to true when the element has no
 * direct `p` child.
 */
export class SeriesStmt extends GenericElement {
structuredData: boolean;
title: Array<ParseResult<GenericElement>>; // TODO: Add specific type when title is handled
idno: Array<ParseResult<GenericElement>>; // TODO: Add specific type when idno is handled
respStmt: RespStmt[];
biblScope: Array<ParseResult<GenericElement>>; // TODO: Add specific type when biblScope is handled
editor: Array<ParseResult<GenericElement>>; // TODO: Add specific type when editor is handled
}

/**
 * Data model returned by `NotesStmtParser` for a `notesStmt` element:
 * the parsed `note` elements plus generically parsed `relatedItem` elements.
 */
export class NotesStmt extends GenericElement {
notes: Note[];
relatedItems: Array<ParseResult<GenericElement>>; // TODO: Add specific type when relatedItem is handled
}

/**
 * Data model returned by `SourceDescParser` for a `sourceDesc` element.
 *
 * `structuredData` is set by the parser to true when at least one child
 * element is something other than `p`. `msDesc` holds the output of
 * `MsDescParser`; the remaining fields hold generically parsed elements.
 */
export class SourceDesc extends GenericElement {
structuredData: boolean;
msDesc: MsDesc;
bibl: Array<ParseResult<GenericElement>>; // TODO: Add specific type when bibl is handled
biblFull: Array<ParseResult<GenericElement>>; // TODO: Add specific type when biblFull is handled
biblStruct: Array<ParseResult<GenericElement>>; // TODO: Add specific type when biblStruct is handled
recordingStmt: Array<ParseResult<GenericElement>>; // TODO: Add specific type when recordingStmt is handled
scriptStmt: Array<ParseResult<GenericElement>>; // TODO: Add specific type when scriptStmt is handled
}

/** Data model returned by `ExtentParser`; declares no fields beyond those of `GenericElement`. */
export class Extent extends GenericElement { }
4 changes: 4 additions & 0 deletions src/app/services/xml-parsers/basic-parsers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ export class GenericElemParser extends AttrParser implements Parser<XMLElement>
}
}

/**
 * Base class for parsers that need a ready-made `GenericElemParser` instance
 * to parse selected child elements generically.
 */
export class GenericParser extends GenericElemParser {
// Built from this parser's own `genericParse` function via `createParser`.
protected genericElemParser = createParser(GenericElemParser, this.genericParse);
}

export class AttributeParser extends EmptyParser implements Parser<XMLElement> {
parse(data: HTMLElement): Attributes {
return Array.from(data.attributes)
Expand Down
179 changes: 179 additions & 0 deletions src/app/services/xml-parsers/header-parser.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import { isNestedInElem } from 'src/app/utils/dom-utils';
import {
EditionStmt, Extent, FileDesc, GenericElement, MsDesc, NamedEntityRef, Note,
NotesStmt, PublicationStmt, Resp, RespStmt, SeriesStmt, SourceDesc, TitleStmt, XMLElement,
} from '../../models/evt-models';
import { GenericElemParser, GenericParser, NoteParser, queryAndParseElement, queryAndParseElements } from './basic-parsers';
import { MsDescParser } from './msdesc-parser';
import { NamedEntityRefParser } from './named-entity-parsers';
import { createParser, Parser } from './parser-models';

/**
 * Parser for `resp` elements.
 *
 * Besides the generic element data it exposes:
 * - `normalizedResp`: the `ref` attribute as an absolute URL. A value that does
 *   not already start with `http://` or `https://` is prefixed with `http://`;
 *   a missing or empty `ref` yields an empty string.
 * - `date`: the `when` attribute, or an empty string when absent.
 */
export class RespParser extends GenericElemParser implements Parser<XMLElement> {
  parse(xml: XMLElement): Resp {
    const { ref, when } = this.attributeParser.parse(xml);

    // Only the beginning of the value decides whether a protocol is present:
    // a URL embedded later in the string (e.g. in a query parameter) must not
    // suppress the prefix, and an empty reference must not become "http://".
    let normalizedResp = '';
    if (ref) {
      const hasProtocol = ref.startsWith('http://') || ref.startsWith('https://');
      normalizedResp = hasProtocol ? ref : `http://${ref}`;
    }

    return {
      ...super.parse(xml),
      type: Resp,
      normalizedResp,
      date: when || '',
    };
  }
}

/**
 * Parser for `respStmt` elements.
 *
 * Collects the direct `name`, `orgName` and `persName` children as `people`,
 * the `resp` element as `responsibility` and the `note` elements as `notes`.
 */
export class RespStmtParser extends GenericElemParser implements Parser<XMLElement> {
  private namedEntityRefParser = createParser(NamedEntityRefParser, this.genericParse);

  parse(xml: XMLElement): RespStmt {
    const peopleElements = xml.querySelectorAll<XMLElement>(':scope > name, :scope > orgName, :scope > persName');
    // `orgName` / `persName` go through the named-entity parser; plain `name` is parsed generically.
    const people = Array.from(peopleElements).map((element) =>
      ['orgName', 'persName'].includes(element.tagName)
        ? this.namedEntityRefParser.parse(element) as NamedEntityRef
        : this.genericParse(element) as GenericElement);

    const base = super.parse(xml);
    const responsibility = queryAndParseElement<Resp>(xml, 'resp', createParser(RespParser, this.genericParse));
    const notes = queryAndParseElements<Note>(xml, 'note', createParser(NoteParser, this.genericParse));

    return {
      ...base,
      type: RespStmt,
      responsibility,
      notes,
      people,
    };
  }
}

/**
 * Parser for `titleStmt` elements.
 *
 * `titles` holds the `title[type="main"]` matches when there are any, otherwise
 * every `title` not typed as `sub`. The remaining fields are the generically
 * parsed elements matching each selector, plus the parsed `respStmt` elements.
 */
export class TitleStmtParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): TitleStmt {
    const collect = (selector: string) => queryAndParseElements<GenericElement>(xml, selector, this.genericElemParser);
    const mainTitles = collect('title[type="main"]');

    return {
      ...super.parse(xml),
      type: TitleStmt,
      titles: mainTitles.length > 0 ? mainTitles : collect('title:not([type="sub"])'),
      subtitles: collect('title[type="sub"]'),
      authors: collect('author'),
      editors: collect('editor'),
      sponsors: collect('sponsor'),
      funders: collect('funder'),
      principals: collect('principal'),
      respStmts: queryAndParseElements<RespStmt>(xml, 'respStmt', createParser(RespStmtParser, this.genericParse)),
    };
  }
}

/**
 * Parser for `editionStmt` elements.
 *
 * `structuredData` is true when at least one child element is not a `p`.
 */
export class EditionStmtParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): EditionStmt {
    const respStmtParser = createParser(RespStmtParser, this.genericParse);

    return {
      ...super.parse(xml),
      type: EditionStmt,
      edition: queryAndParseElements<GenericElement>(xml, 'edition', this.genericElemParser),
      respStmt: queryAndParseElements<RespStmt>(xml, 'respStmt', respStmtParser),
      // Any non-`p` child means the statement is not plain prose.
      structuredData: Array.from(xml.children).some((child) => child.tagName !== 'p'),
    };
  }
}

/**
 * Parser for `publicationStmt` elements.
 *
 * `structuredData` is true when at least one child element is not a `p`;
 * every other field holds the generically parsed elements with that tag name.
 */
export class PublicationStmtParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): PublicationStmt {
    const collect = (tagName: string) => queryAndParseElements<GenericElement>(xml, tagName, this.genericElemParser);

    return {
      ...super.parse(xml),
      type: PublicationStmt,
      structuredData: Array.from(xml.children).some((child) => child.tagName !== 'p'),
      publisher: collect('publisher'),
      distributor: collect('distributor'),
      authority: collect('authority'),
      pubPlace: collect('pubPlace'),
      address: collect('address'),
      idno: collect('idno'),
      availability: collect('availability'),
      date: collect('date'),
      licence: collect('licence'),
    };
  }
}

/**
 * Parser for `seriesStmt` elements.
 *
 * `structuredData` is true when the element has no direct `p` child.
 */
export class SeriesStmtParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): SeriesStmt {
    const collect = (tagName: string) => queryAndParseElements<GenericElement>(xml, tagName, this.genericElemParser);

    return {
      ...super.parse(xml),
      type: SeriesStmt,
      structuredData: xml.querySelector(':scope > p') === null,
      title: collect('title'),
      idno: collect('idno'),
      respStmt: queryAndParseElements<RespStmt>(xml, 'respStmt', createParser(RespStmtParser, this.genericParse)),
      editor: collect('editor'),
      biblScope: collect('biblScope'),
    };
  }
}

/**
 * Parser for `notesStmt` elements: parses the contained `note` elements with
 * `NoteParser` and the `relatedItem` elements generically.
 */
export class NotesStmtParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): NotesStmt {
    const base = super.parse(xml);
    const notes = queryAndParseElements<Note>(xml, 'note', createParser(NoteParser, this.genericParse));
    const relatedItems = queryAndParseElements<GenericElement>(xml, 'relatedItem', this.genericElemParser);

    return {
      ...base,
      type: NotesStmt,
      notes,
      relatedItems,
    };
  }
}

/**
 * Parser for `sourceDesc` elements.
 *
 * - `structuredData` is true when at least one child element is not a `p`.
 * - `msDesc` is the `msDesc` element parsed with `MsDescParser`.
 * - `bibl`, `biblFull`, `biblStruct`, `recordingStmt` and `scriptStmt` hold
 *   the generically parsed elements with the corresponding tag name.
 */
export class SourceDescParser extends GenericParser implements Parser<XMLElement> {
  parse(xml: XMLElement): SourceDesc {
    return {
      ...super.parse(xml),
      type: SourceDesc,
      structuredData: Array.from(xml.children).filter(el => el.tagName === 'p').length !== xml.children.length,
      // The manuscript description lives in `msDesc`; selecting `note` here would
      // hand an unrelated element to the manuscript description parser.
      msDesc: queryAndParseElement<MsDesc>(xml, 'msDesc', createParser(MsDescParser, this.genericParse)),
      bibl: queryAndParseElements<GenericElement>(xml, 'bibl', this.genericElemParser),
      biblFull: queryAndParseElements<GenericElement>(xml, 'biblFull', this.genericElemParser),
      biblStruct: queryAndParseElements<GenericElement>(xml, 'biblStruct', this.genericElemParser),
      recordingStmt: queryAndParseElements<GenericElement>(xml, 'recordingStmt', this.genericElemParser),
      scriptStmt: queryAndParseElements<GenericElement>(xml, 'scriptStmt', this.genericElemParser),
    };
  }
}

/** Parser for `extent` elements: the generic element data tagged with the `Extent` type. */
export class ExtentParser extends GenericElemParser implements Parser<XMLElement> {
  parse(xml: XMLElement): Extent {
    const base = super.parse(xml);

    return { ...base, type: Extent };
  }
}

/**
 * Parser for `fileDesc` elements.
 *
 * Works on a deep copy of the input so the original document is left untouched.
 * The elements matched by `excludeFromParsing` are removed from that copy
 * before the individual statements are parsed.
 */
export class FileDescParser extends GenericElemParser implements Parser<XMLElement> {
  private excludeFromParsing = [
    'listBibl',
    'listEvent',
    'listOrg',
    'listPerson',
    'listPlace',
    'listWit',
    'sourceDesc list',
  ];

  parse(xml: XMLElement): FileDesc {
    const copy = xml.cloneNode(true) as XMLElement;
    const exclusionSelector = this.excludeFromParsing.join(',');

    // Decide what to drop on the intact copy first, then detach the matches.
    const toRemove = Array.from(copy.querySelectorAll<XMLElement>(exclusionSelector))
      .filter((candidate) => !isNestedInElem(candidate, candidate.tagName));
    toRemove.forEach((candidate) => candidate.remove());

    const parseOne = this.genericParse;

    return {
      ...super.parse(copy),
      type: FileDesc,
      titleStmt: queryAndParseElement<TitleStmt>(copy, 'titleStmt', createParser(TitleStmtParser, parseOne)),
      editionStmt: queryAndParseElement<EditionStmt>(copy, 'editionStmt', createParser(EditionStmtParser, parseOne)),
      publicationStmt: queryAndParseElement<PublicationStmt>(copy, 'publicationStmt', createParser(PublicationStmtParser, parseOne)),
      sourceDesc: queryAndParseElement<SourceDesc>(copy, 'sourceDesc', createParser(SourceDescParser, parseOne)),
      extent: queryAndParseElement<Extent>(copy, 'extent', createParser(ExtentParser, parseOne)),
      notesStmt: queryAndParseElement<NotesStmt>(copy, 'notesStmt', createParser(NotesStmtParser, parseOne)),
      seriesStmt: queryAndParseElement<SeriesStmt>(copy, 'seriesStmt', createParser(SeriesStmtParser, parseOne)),
    };
  }
}
23 changes: 21 additions & 2 deletions src/app/services/xml-parsers/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ import { CharParser, GlyphParser, GParser } from './character-declarations-parse
import { ChoiceParser } from './choice-parser';
import { SicParser, SurplusParser } from './editorial-parsers';
import { GraphicParser, SurfaceParser, ZoneParser } from './facsimile-parser';
import {
EditionStmtParser, ExtentParser, FileDescParser, NotesStmtParser, PublicationStmtParser, RespParser, RespStmtParser,
SeriesStmtParser, SourceDescParser, TitleStmtParser,
} from './header-parser';
import {
AccMatParser, AcquisitionParser, AdditionalParser, AdditionsParser, AdminInfoParser, AltIdentifierParser, BindingDescParser,
BindingParser, CollationParser, CollectionParser, ConditionParser, CustEventParser, CustodialHistParser, DecoDescParser, DecoNoteParser,
Expand All @@ -26,8 +30,9 @@ import {
import { createParser, Parser, ParseResult } from './parser-models';

type AnalysisTags = 'w';
type CoreTags = 'add' | 'choice' | 'del' | 'gap' | 'graphic' | 'head' | 'l' | 'lb' | 'lg' | 'note' | 'p' | 'ptr' | 'sic';
type CoreTags = 'add' | 'choice' | 'del' | 'gap' | 'graphic' | 'head' | 'l' | 'lb' | 'lg' | 'note' | 'p' | 'ptr' | 'resp' | 'respStmt' | 'sic';
type GaijiTags = 'char' | 'g' | 'glyph';
// Tag names of the header elements that have a dedicated parser registered in `headerParseF`.
type HeaderTags = 'editionStmt' | 'extent' | 'fileDesc' | 'notesStmt' | 'publicationStmt' | 'seriesStmt' | 'sourceDesc' | 'titleStmt';
type MsDescriptionTags = 'accMat' | 'acquisition' | 'additional' | 'additions' | 'adminInfo' | 'altIdentifier' |
'binding' | 'bindingDesc' | 'collation' | 'collection' | 'condition' | 'custEvent' | 'custodialHist' |
'decoDesc' | 'decoNote' | 'depth' | 'dim' | 'dimensions' | 'explicit' | 'filiation' | 'finalRubric' | 'foliation' |
Expand All @@ -40,7 +45,7 @@ type NamesDatesTags = 'event' | 'geogname' | 'org' | 'orgname' | 'persname' | 'p
type TextCritTags = 'app' | 'lem' | 'rdg';
type TranscrTags = 'damage' | 'supplied' | 'surface' | 'surplus' | 'zone';

type SupportedTagNames = AnalysisTags | CoreTags | GaijiTags | MsDescriptionTags | TextCritTags | TranscrTags | NamesDatesTags;
type SupportedTagNames = AnalysisTags | CoreTags | GaijiTags | HeaderTags | MsDescriptionTags | TextCritTags | TranscrTags | NamesDatesTags;

const analysisParseF: { [T in AnalysisTags]: Parser<XMLElement> } = {
w: createParser(WordParser, parse),
Expand All @@ -59,6 +64,8 @@ const coreParseF: { [T in CoreTags]: Parser<XMLElement> } = {
note: createParser(NoteParser, parse),
p: createParser(ParagraphParser, parse),
ptr: createParser(PtrParser, parse),
resp: createParser(RespParser, parse),
respStmt: createParser(RespStmtParser, parse),
sic: createParser(SicParser, parse),
};

Expand All @@ -68,6 +75,17 @@ const gaijiParseF: { [T in GaijiTags]: Parser<XMLElement> } = {
glyph: createParser(GlyphParser, parse),
};

// Registration table for header elements: one parser per `HeaderTags` name,
// each created with `createParser` and the shared `parse` function.
const headerParseF: { [T in HeaderTags]: Parser<XMLElement> } = {
editionStmt: createParser(EditionStmtParser, parse),
extent: createParser(ExtentParser, parse),
fileDesc: createParser(FileDescParser, parse),
notesStmt: createParser(NotesStmtParser, parse),
publicationStmt: createParser(PublicationStmtParser, parse),
seriesStmt: createParser(SeriesStmtParser, parse),
sourceDesc: createParser(SourceDescParser, parse),
titleStmt: createParser(TitleStmtParser, parse),
};

const msDescriptionParseF: { [T in MsDescriptionTags]: Parser<XMLElement> } = {
accMat: createParser(AccMatParser, parse),
acquisition: createParser(AcquisitionParser, parse),
Expand Down Expand Up @@ -162,6 +180,7 @@ export const parseF: { [T in SupportedTagNames]: Parser<XMLElement> } = {
...analysisParseF,
...coreParseF,
...gaijiParseF,
...headerParseF,
...namesDatesParseF,
...textCritParseF,
...transcrParseF,
Expand Down
10 changes: 7 additions & 3 deletions src/app/services/xml-parsers/parser-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ export function createParser<U, T extends Parser<U>>(c: new (raw: ParseFn) => T,

/**
 * Returns the element's `xml:id` attribute when it is set and non-empty;
 * otherwise returns `prefix` followed by the element's xpath.
 */
export function getID(xml: XMLElement, prefix: string = '') {
  const declaredId = xml.getAttribute('xml:id');
  if (declaredId) {
    return declaredId;
  }

  return prefix + xpath(xml);
}
/** Returns the element's tag name in lower case, or an empty string when it has no tag name. */
export function getClass(xml: XMLElement) {
  if (!xml.tagName) {
    return '';
  }

  return xml.tagName.toLowerCase();
}
export function parseChildren(xml: XMLElement, parseFn: ParseFn) {
return complexElements(xml.childNodes).map(child => parseFn(child as XMLElement));
/**
 * Parses the child nodes of `xml` with `parseFn`.
 *
 * Comment nodes are always skipped; when `excludeEmptyText` is truthy,
 * whitespace-only text nodes are skipped as well.
 */
export function parseChildren(xml: XMLElement, parseFn: ParseFn, excludeEmptyText?: boolean) {
  const relevantChildren = complexElements(xml.childNodes, excludeEmptyText);

  return relevantChildren.map((node) => parseFn(node as XMLElement));
}
/** Returns `n` when it is a non-empty string, otherwise an empty string. */
export function getDefaultN(n: string) {
  return n ? n : '';
}
/** Returns `attr` when it is a non-empty string, otherwise an empty string. */
export function getDefaultAttr(attr: string) {
  return attr ? attr : '';
}
Expand All @@ -24,4 +24,8 @@ export function unhandledElement(xml: XMLElement, name: string, parseFn: ParseFn
return flat(Array.from(xml.querySelectorAll<XMLElement>(`:scope > ${name}`)).map(e => parseChildren(e, parseFn)));
}

function complexElements(nodes: NodeListOf<ChildNode>): ChildNode[] { return Array.from(nodes).filter((n) => n.nodeType !== 8); }
/**
 * Filters a list of child nodes down to the ones worth parsing.
 *
 * Comment nodes (`nodeType` 8) are always dropped. When `excludeEmptyText` is
 * truthy, text nodes (`nodeType` 3) whose content is only whitespace are
 * dropped too. The relative order of the remaining nodes is preserved.
 */
export function complexElements(nodes: NodeListOf<ChildNode>, excludeEmptyText?: boolean): ChildNode[] {
  const commentNodeType = 8;
  const textNodeType = 3;

  return Array.from(nodes).filter((node) => {
    if (node.nodeType === commentNodeType) {
      return false;
    }
    if (excludeEmptyText && node.nodeType === textNodeType) {
      return node.textContent.trim().length > 0;
    }

    return true;
  });
}

0 comments on commit 455b13a

Please sign in to comment.