Skip to content

Commit

Permalink
Merge pull request evt-project#110 from evt-project/feature/page-divi…
Browse files Browse the repository at this point in the history
…sion-range

Feature/page division range
  • Loading branch information
ChiaraDipi authored Jan 13, 2021
2 parents 74c9be4 + 80253e4 commit bb91fcd
Show file tree
Hide file tree
Showing 9 changed files with 22,933 additions and 170 deletions.
22,805 changes: 22,777 additions & 28 deletions package-lock.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@
<evt-icon
[iconInfo]="{icon: 'file-alt', additionalClasses: 'mr-2'}"
label-left></evt-icon>
{{item.label}}
{{ item.label | translate: {value: (pages$ | async).indexOf(item) + 1} }}
</ng-template>
</ng-select>
5 changes: 2 additions & 3 deletions src/app/models/evt-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ export type XMLElement = HTMLElement;
export type OriginalEncodingNodeType = XMLElement;

/**
 * Shape returned by the structure parser's `parsePages`: the pages an edition is divided into.
 */
export interface EditionStructure {
// NOTE(review): `pages` is declared twice below (once as `Map<Page>` paired with `pagesIndexes`,
// once as `Page[]`). These look like two alternative versions of the same member; confirm which
// one is meant to stay.
pages: Map<Page>;
pagesIndexes: string[];
pages: Page[];
}

export type ViewModeId = 'readingText' | 'imageText' | 'textText' | 'collation' | 'textSources' | 'textVersions';
Expand Down Expand Up @@ -146,7 +145,7 @@ export interface Witnesses {

export interface Witness {
id: string;
name: GenericElement[];
name: string | Array<ParseResult<GenericElement>> | XMLElement;
attributes: Attributes;
content: Array<ParseResult<GenericElement>>;
groupId: string;
Expand Down
5 changes: 2 additions & 3 deletions src/app/services/evt-model.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ export class EVTModelService {
);

// Pages of the edition, derived from `editionSource$` by running
// `editionStructureParser.parsePages` on every emitted source.
// NOTE(review): the pipe lists a `parsePages(source)` step followed by a `pagesIndexes` lookup,
// and then another step that calls `parsePages(source).pages` directly. They look like two
// alternative versions of the same mapping; confirm which one is meant to stay.
public readonly pages$: Observable<Page[]> = this.editionSource$.pipe(
map((source) => this.editionStructureParser.parsePages(source)),
map(editionStructure => editionStructure.pagesIndexes.map(pageId => editionStructure.pages[pageId])),
map((source) => this.editionStructureParser.parsePages(source).pages),
// Share a single upstream subscription and replay the latest value to late subscribers.
shareReplay(1),
);

Expand Down Expand Up @@ -102,7 +101,7 @@ export class EVTModelService {
);

// Exposes the `witnesses` member of every value emitted by `witnessesData$`.
// NOTE(review): the same destructuring `map` appears twice, differing only in spacing;
// presumably only one of the two is meant to stay. Confirm.
public readonly witnesses$ = this.witnessesData$.pipe(
map(({witnesses}) => witnesses),
map(({ witnesses }) => witnesses),
// Share a single upstream subscription and replay the latest value to late subscribers.
shareReplay(1),
);

Expand Down
2 changes: 1 addition & 1 deletion src/app/services/xml-parsers/parser-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ export interface Parser<T> { parse(data: T): ParseResult<GenericElement>; }
export type ParseFn = (d: XMLElement) => ParseResult<GenericElement>;
/**
 * Instantiates the parser class `c`, forwarding `data` to its constructor.
 *
 * @param c Constructor of the parser to create; it receives a `ParseFn`.
 * @param data Parse function handed to the constructor.
 * @returns The newly created parser instance.
 */
export function createParser<U, T extends Parser<U>>(c: new (raw: ParseFn) => T, data: ParseFn): T {
  const parser: T = new c(data);

  return parser;
}

export function getID(xml: XMLElement) { return xml.getAttribute('xml:id') || xpath(xml); }
/**
 * Returns an identifier for an XML element.
 *
 * A non-empty `xml:id` attribute is used as is; otherwise the identifier is
 * `prefix` followed by the value returned by `xpath(xml)`.
 *
 * @param xml Element to identify.
 * @param prefix Text prepended to the xpath-based fallback (empty by default).
 * @returns The `xml:id` value or the prefixed xpath-based fallback.
 */
export function getID(xml: XMLElement, prefix: string = '') {
  const declaredId = xml.getAttribute('xml:id');
  if (declaredId) {
    return declaredId;
  }

  return prefix + xpath(xml);
}
/**
 * Returns the lower-cased tag name of `xml`, or an empty string when the node
 * has no (or an empty) `tagName`.
 *
 * @param xml Node whose tag name is read.
 */
export function getClass(xml: XMLElement) {
  if (!xml.tagName) {
    return '';
  }

  return xml.tagName.toLowerCase();
}
export function parseChildren(xml: XMLElement, parseFn: ParseFn) {
return complexElements(xml.childNodes).map(child => parseFn(child as XMLElement));
Expand Down
174 changes: 109 additions & 65 deletions src/app/services/xml-parsers/structure-xml-parser.service.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import { Injectable } from '@angular/core';

import { Page, XMLElement } from '../../models/evt-models';
import { getElementsAfterTreeNode, getElementsBetweenTreeNode } from '../../utils/dom-utils';
import { Map } from '../../utils/js-utils';
import { EditionStructure, GenericElement, OriginalEncodingNodeType, Page, XMLElement } from '../../models/evt-models';
import { createNsResolver, getElementsBetweenTreeNode, isNestedInElem } from '../../utils/dom-utils';
import { GenericParserService } from './generic-parser.service';
import { getID, ParseResult } from './parser-models';

@Injectable({
providedIn: 'root',
Expand All @@ -14,70 +13,115 @@ export class StructureXmlParserService {
) {
}

parsePages(document: XMLElement) {
const pages: Map<Page> = {};
const pagesIndexes: string[] = [];
const pageTagName = 'pb';

if (document) {
const pageElements = document.querySelectorAll(pageTagName);
const l = pageElements.length;
if (l > 0) {
for (let i = 0; i < l; i++) {
const element = pageElements[i];
let pageContent: XMLElement[] = [];
if (i < l - 1) { // TODO: handle last page
if (i === 0) {
pageContent = getElementsBetweenTreeNode(element.closest('body'), pageElements[i + 1]);
} else {
pageContent = getElementsBetweenTreeNode(element, pageElements[i + 1]);
}
} else {
pageContent = getElementsAfterTreeNode(element);
}
// Exclude nodes in <front>
pageContent = pageContent.filter(el => {
if (el.nodeType === 3) {
return !el.parentElement.closest('front') && el.parentElement.tagName !== 'front';
}
if (el.nodeType === 1) {
return !el.closest('front') && el.tagName !== 'front';
}

return false;
});

const page: Page = {
id: element.getAttribute('xml:id') || 'page_' + (pagesIndexes.length + 1),
label: element.getAttribute('n') || 'Page ' + (pagesIndexes.length + 1),
originalContent: pageContent,
parsedContent: pageContent.map(child => this.genericParserService.parse(child as XMLElement)),
};
pages[page.id] = page;
pagesIndexes.push(page.id);
}
} else {
// No <pb> used => TODO: Decide how to handle text division
console.warn('TODO: Decide how to handle text division when there are no <pb>s');
const mainText = document.querySelector('text');
const content = Array.from(mainText.childNodes);
const page: Page = {
id: `page_${new Date().getTime()}`,
label: 'Main Text',
originalContent: content as XMLElement[],
parsedContent: content.map(child => this.genericParserService.parse(child as XMLElement))
// tslint:disable-next-line: no-string-literal
.filter(c => !!c['content']), // TODO: FIXME: fix property access
};
pages[page.id] = page;
pagesIndexes.push(page.id);
}
console.log(pages);
// Value of the `type` attribute that `hasOriginalContent` / `isMarkedAsOrigContent` look for.
private frontOrigContentAttr = 'document_front';
// Tag names used to locate the front matter, the page breaks and the body of the document.
readonly frontTagName = 'front';
readonly pageTagName = 'pb';
readonly bodyTagName = 'body';

/**
 * Divides an edition document into pages, using its page-break (`pb`) elements.
 *
 * - When page breaks exist both in the front matter and in the body, every page
 *   break of the document starts a page (range limited by the `text` element).
 * - Otherwise front and body are handled separately: a section with page breaks
 *   gets one page per page break; a body without page breaks becomes one single
 *   page; a front without page breaks becomes one single page only when it is
 *   marked as original content (see `isMarkedAsOrigContent`).
 * - A section that is missing from the document contributes no page.
 *
 * @param el Root element of the edition source; a falsy value yields no pages.
 * @returns The edition structure holding the pages in document order (front first).
 */
parsePages(el: XMLElement): EditionStructure {
  if (!el) { return { pages: [] }; }

  // querySelector returns null when the document has no <front> / <body>.
  const front: XMLElement = el.querySelector(this.frontTagName);
  const body: XMLElement = el.querySelector(this.bodyTagName);

  const pbs = Array.from(el.querySelectorAll(this.pageTagName));
  const frontPbs = pbs.filter((p) => isNestedInElem(p, this.frontTagName));
  const bodyPbs = pbs.filter((p) => isNestedInElem(p, this.bodyTagName));
  // firstElementChild is null when `el` has no element children: fall back to `el` itself.
  const doc = el.firstElementChild ? el.firstElementChild.ownerDocument : el.ownerDocument;

  if (frontPbs.length > 0 && bodyPbs.length > 0) {
    return {
      pages: pbs.map((pb: XMLElement, idx, arr: XMLElement[]) => this.parseDocumentPage(doc, pb, arr[idx + 1], 'text')),
    };
  }

  let frontPages: Page[] = [];
  if (frontPbs.length > 0) {
    frontPages = frontPbs.map((pb, idx, arr) =>
      this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.frontTagName));
  } else if (front && this.isMarkedAsOrigContent(front)) {
    // Guarded on `front`: isMarkedAsOrigContent dereferences its argument.
    frontPages = [this.parseSinglePage(doc, front, 'page_front', this.frontTagName)];
  }

  let bodyPages: Page[] = [];
  if (bodyPbs.length > 0) {
    bodyPages = bodyPbs.map((pb, idx, arr) =>
      this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.bodyTagName));
  } else if (body) {
    // Guarded on `body`: parseSinglePage dereferences its element argument.
    bodyPages = [this.parseSinglePage(doc, body, 'page1', 'mainText')]; // TODO: translate mainText
  }

  return {
    pages: [...frontPages, ...bodyPages],
  };
}

/**
 * Builds the page that starts at the page-break node `pb`.
 *
 * The page content is what `getElementsBetweenTreeNode` returns from `pb` up to
 * `nextPb`. When there is no next page break, the range ends at the last child of
 * the last `ancestorTagName` element found in `doc`. Page-break elements,
 * CDATA sections, processing instructions and comments are left out.
 *
 * @param doc Document the page break belongs to.
 * @param pb Page break that opens the page.
 * @param nextPb Page break that opens the following page, if any.
 * @param ancestorTagName Tag name of the element that bounds the last page.
 * @returns The page; its id comes from `getID(pb, 'page')` and its label from the
 * `n` attribute of `pb` (falling back to `'page'`).
 */
parseDocumentPage(doc: Document, pb: XMLElement, nextPb: XMLElement, ancestorTagName: string): Page {
  // TODO: check if querySelectorAll can return an empty array in this case
  const lastNodeOf = (tagName: string) => {
    const matches = doc.querySelectorAll(tagName);

    return matches[matches.length - 1].lastChild;
  };
  // Only look for the closing node when there is no following page break.
  const rangeEnd = nextPb || lastNodeOf(ancestorTagName);

  // Node types 4, 7 and 8: CDATA sections, processing instructions and comments.
  const skippedNodeTypes = [4, 7, 8];
  const originalContent = getElementsBetweenTreeNode(pb, rangeEnd).filter(
    (node) => node.tagName !== this.pageTagName && !skippedNodeTypes.includes(node.nodeType),
  );

  const id = getID(pb, 'page');
  const label = pb.getAttribute('n') || 'page';
  const parsedContent = this.parsePageContent(doc, originalContent);

  return { id, label, originalContent, parsedContent };
}

/**
 * Builds one page out of the content of `el`, i.e. the nodes that
 * `getElementsBetweenTreeNode` returns between its first and its last child.
 *
 * @param doc Document `el` belongs to; forwarded to `parsePageContent`.
 * @param el Element whose content becomes the page.
 * @param id Identifier given to the page.
 * @param label Label given to the page.
 * @returns The page with both its original and its parsed content.
 */
private parseSinglePage(doc: Document, el: XMLElement, id: string, label: string): Page {
const originalContent: XMLElement[] = getElementsBetweenTreeNode(el.firstChild, el.lastChild);

return {
// NOTE(review): `pages` and `pagesIndexes` are not declared anywhere in this method;
// these two entries look like leftovers of a previous return value. Confirm and remove.
pages,
pagesIndexes,
id,
label,
originalContent,
parsedContent: this.parsePageContent(doc, originalContent),
};
}

/**
 * Parses the nodes of a page and returns the flattened list of results.
 *
 * Each node is first resolved through `getEditionOrigNode`, then:
 * - if the resolved node is a `front` element or is nested in one, only content
 *   flagged with the front original-content `type` is parsed (the flagged
 *   descendants when there are any, the node itself when it is marked as original
 *   content, nothing otherwise);
 * - if the resolved node is a `text` element containing a `front`, its children
 *   are processed recursively;
 * - any other node is parsed as is.
 *
 * @param doc Document used to resolve the nodes.
 * @param pageContent Nodes that make up the page.
 * @returns The parsed elements, in the order of the input nodes.
 */
parsePageContent(doc: Document, pageContent: OriginalEncodingNodeType[]): Array<ParseResult<GenericElement>> {
  const parseNode = (node: OriginalEncodingNodeType): Array<ParseResult<GenericElement>> => {
    const origEl = getEditionOrigNode(node, doc);
    const belongsToFront = origEl.nodeName === this.frontTagName || isNestedInElem(origEl, this.frontTagName);

    if (belongsToFront) {
      if (this.hasOriginalContent(origEl)) {
        const flagged = Array.from(node.querySelectorAll(`[type=${this.frontOrigContentAttr}]`));

        return flagged.map((c) => this.genericParserService.parse(c as XMLElement));
      }

      return this.isMarkedAsOrigContent(origEl)
        ? [this.genericParserService.parse(node)]
        : [] as Array<ParseResult<GenericElement>>;
    }

    const isTextWithFront = origEl.tagName === 'text'
      && origEl.querySelectorAll
      && origEl.querySelectorAll(this.frontTagName).length > 0;
    if (isTextWithFront) {
      return this.parsePageContent(doc, Array.from(node.children) as HTMLElement[]);
    }

    return [this.genericParserService.parse(node)];
  };

  let parsed: Array<ParseResult<GenericElement>> = [];
  pageContent.forEach((node) => {
    parsed = parsed.concat(parseNode(node));
  });

  return parsed;
}

/**
 * Tells whether `el` has at least one descendant whose `type` attribute equals
 * the front original-content marker.
 *
 * @param el Element whose descendants are inspected.
 */
hasOriginalContent(el: XMLElement): boolean {
  const flagged = el.querySelectorAll(`[type=${this.frontOrigContentAttr}]`);

  return flagged.length > 0;
}

/**
 * Tells whether `el` counts as original content of the front matter.
 *
 * Text nodes never do. Any other node does when its own `type` attribute equals
 * the front original-content marker, when it contains such an element, or when
 * `isNestedInElem` reports it as nested in an element carrying that `type`.
 *
 * @param el Node to check.
 */
isMarkedAsOrigContent(el: XMLElement): boolean {
  // nodeType 3 is a text node: it has no attributes and no descendants to inspect.
  if (el.nodeType === 3) {
    return false;
  }
  if (el.getAttribute('type') === this.frontOrigContentAttr) {
    return true;
  }
  if (this.hasOriginalContent(el)) {
    return true;
  }

  return isNestedInElem(el, '', [{ key: 'type', value: this.frontOrigContentAttr }]);
}
}

/**
 * Resolves the node of `doc` that `el` points to through its `xpath` attribute.
 *
 * Nodes without `getAttribute`, or without a non-empty `xpath` attribute, are
 * returned unchanged. When the document element has a namespace URI, every `/`
 * of the path is rewritten to `/ns:` before evaluation (presumably the prefix
 * handled by the resolver from `createNsResolver`; confirm against that helper).
 *
 * @param el Node to resolve.
 * @param doc Document the XPath expression is evaluated against.
 * @returns The first node matched by the expression, or `el` when it carries no xpath.
 */
function getEditionOrigNode(el: XMLElement, doc: Document) {
  const xpathAttr = el.getAttribute ? el.getAttribute('xpath') : undefined;
  if (!xpathAttr) {
    return el;
  }

  const path = doc.documentElement.namespaceURI ? xpathAttr.replace(/\//g, '/ns:') : xpathAttr;
  const matches = doc.evaluate(path, doc, createNsResolver(doc), XPathResult.ANY_TYPE, undefined);

  return matches.iterateNext() as XMLElement;
}
Loading

0 comments on commit bb91fcd

Please sign in to comment.