Skip to content

Commit

Permalink
Change way of looking for body contents in analyzer (#87)
Browse files Browse the repository at this point in the history
* Change way of looking for body contents

* Add comments

* Add title attachments fallback

* CC

---------

Co-authored-by: akulbii <[email protected]>
Co-authored-by: dvogel <[email protected]>
  • Loading branch information
3 people authored Jan 10, 2024
1 parent 550dfcb commit 55dc56d
Showing 1 changed file with 102 additions and 5 deletions.
107 changes: 102 additions & 5 deletions src/Analyzer/ConfluenceAnalyzer.php
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,8 @@ public function __construct( $config, Workspace $workspace, DataBuckets $buckets
'users',
'title-files',
'additional-files',
'attachment-orig-filename-target-filename-map'
'attachment-orig-filename-target-filename-map',
'title-attachments'
] );
$this->logger = new NullLogger();

Expand Down Expand Up @@ -182,6 +183,7 @@ protected function doAnalyze( SplFileInfo $file ): bool {
$this->userMap();
$this->makeSpacesMap();
$this->makePagenamesMap();
$this->addTitleAttachmentsFallback();
$this->addAdditionalFiles();

return true;
Expand Down Expand Up @@ -304,16 +306,40 @@ private function makePagenamesMap() {
$revisionTimestamp = $this->buildRevisionTimestamp( $pageNode );
$bodyContentIds = $this->getBodyContentIds( $pageNode );

foreach ( $bodyContentIds as $bodyContentId ) {
// TODO: Add UserImpl-key or directly MediaWiki username
// (could also be done in `extract` as "metadata" )
$this->customBuckets->addData( 'body-contents-to-pages-map', $bodyContentId, $pageId, false, true );
if ( !empty( $bodyContentIds ) ) {
foreach ( $bodyContentIds as $bodyContentId ) {
// TODO: Add UserImpl-key or directly MediaWiki username
// (could also be done in `extract` as "metadata" )
$this->customBuckets->addData( 'body-contents-to-pages-map', $bodyContentId, $pageId, false, true );
}
} else {
$bodyContentIds = [];

$bodyContents = $this->helper->getObjectNodes( 'BodyContent' );
foreach ( $bodyContents as $bodyContent ) {
$bodyContentId = $this->helper->getIDNodeValue( $bodyContent );
$contentPageId = $this->helper->getPropertyValue( 'content', $bodyContent );

if ( $pageId === $contentPageId ) {
$bodyContentIds[] = $bodyContentId;

$this->customBuckets->addData(
'body-contents-to-pages-map',
$bodyContentId,
$pageId,
false,
true
);
}
}
}

$version = $this->helper->getPropertyValue( 'version', $pageNode );

$this->addTitleRevision( $targetTitle, implode( '/', $bodyContentIds ) . "@$version-$revisionTimestamp" );

// In case of ERM34465 this seems to be empty because
// title-attachments and missing-attachment-id-to-filename are empty
$attachmentRefs = $this->helper->getElementsFromCollection( 'attachments', $pageNode );
foreach ( $attachmentRefs as $attachmentRef ) {
$attachmentId = $this->helper->getIDNodeValue( $attachmentRef );
Expand All @@ -334,6 +360,7 @@ private function makePagenamesMap() {
);
continue;
}
// In case of ERM34465 no files are added to title-attachments
$this->addTitleAttachment( $targetTitle, $attachmentTargetFilename );
$this->addFile( $attachmentTargetFilename, $attachmentReference );
$this->customBuckets->addData( 'title-files', $targetTitle, $attachmentTargetFilename, false, true );
Expand All @@ -352,6 +379,76 @@ private function makePagenamesMap() {
}
}

/**
 * Populate the attachment buckets directly from the `Attachment` objects
 * when the 'title-attachments' bucket is still empty.
 *
 * Does nothing if 'title-attachments' already holds data. Otherwise every
 * `Attachment` object is inspected. An attachment is registered when its
 * `containerContent` resolves to a `Page` object, its `contentStatus` is
 * "current" (compared case-insensitively) and a title can be built for the
 * page. Registration goes through addTitleAttachment(), addFile(), the
 * 'title-files' bucket and the 'attachment-orig-filename-target-filename-map'
 * bucket. Attachments for which makeAttachmentReference() returns an empty
 * value are written to 'missing-attachment-id-to-filename' instead.
 *
 * @return void
 */
private function addTitleAttachmentsFallback() {
	$currentTitleAttachments = $this->customBuckets->getBucketData( 'title-attachments' );
	if ( !empty( $currentTitleAttachments ) ) {
		// Bucket is already populated, so no fallback is required.
		return;
	}

	$this->output->writeln( "\nFinding title attachments fallback" );

	$spaceIdPrefixMap = $this->customBuckets->getBucketData( 'space-id-to-prefix-map' );
	$spaceIdHomepages = $this->customBuckets->getBucketData( 'space-id-homepages' );
	$titleBuilder = new TitleBuilder( $spaceIdPrefixMap, $spaceIdHomepages, $this->helper, $this->mainpage );

	$attachmentObjs = $this->helper->getObjectNodes( 'Attachment' );
	foreach ( $attachmentObjs as $attachmentObj ) {
		// Resolve the page that contains this attachment; skip anything
		// that is not attached to a `Page` object.
		$containerContent = $this->helper->getPropertyNode( 'containerContent', $attachmentObj );
		$containerContentId = $this->helper->getIDNodeValue( $containerContent );
		$pageObj = $this->helper->getObjectNodeById( $containerContentId, 'Page' );
		if ( $pageObj instanceof DOMElement === false ) {
			continue;
		}

		// NOTE(review): presumably guards against getObjectNodeById() handing
		// back a node whose ID differs from the requested one - confirm.
		if ( $containerContentId !== $this->helper->getIDNodeValue( $pageObj ) ) {
			continue;
		}

		// getPropertyValue() may return null; cast so that strtolower() is
		// never called with null (deprecated since PHP 8.1). An empty string
		// is not "current", so such attachments are still skipped.
		$attachmentObjContentStatus = $this->helper->getPropertyValue( 'contentStatus', $attachmentObj );
		if ( strtolower( (string)$attachmentObjContentStatus ) !== 'current' ) {
			continue;
		}

		try {
			$targetTitle = $titleBuilder->buildTitle( $pageObj );
		} catch ( InvalidTitleException $ex ) {
			// No valid target title for the page: skip the attachment.
			continue;
		}

		$attachmentId = $this->helper->getIDNodeValue( $attachmentObj );
		$attachmentTargetFilename = $this->makeAttachmentTargetFilename( $attachmentObj, $targetTitle );
		$attachmentReference = $this->makeAttachmentReference( $attachmentObj );
		if ( empty( $attachmentReference ) ) {
			// Report the missing file and remember it for later inspection.
			$this->output->writeln(
				//phpcs:ignore Generic.Files.LineLength.TooLong
				"\033[31m\t- File '$attachmentId' ($attachmentTargetFilename) not found\033[39m"
			);
			$this->customBuckets->addData(
				'missing-attachment-id-to-filename',
				$attachmentId,
				$attachmentTargetFilename,
				false,
				true
			);
			continue;
		}

		$this->output->writeln( "- $attachmentTargetFilename" );
		$this->addTitleAttachment( $targetTitle, $attachmentTargetFilename );
		$this->addFile( $attachmentTargetFilename, $attachmentReference );
		$this->customBuckets->addData( 'title-files', $targetTitle, $attachmentTargetFilename, false, true );
		$this->addedAttachmentIds[$attachmentId] = true;

		// Use the `fileName` property as the original name and fall back to
		// `title` when `fileName` is not set.
		$fileName = $this->helper->getPropertyValue( 'fileName', $attachmentObj );
		if ( $fileName === null ) {
			$fileName = $this->helper->getPropertyValue( 'title', $attachmentObj );
		}
		$this->customBuckets->addData(
			'attachment-orig-filename-target-filename-map',
			$fileName,
			$attachmentTargetFilename
		);
	}
}

/**
*
* @param DOMElement $attachment
Expand Down

0 comments on commit 55dc56d

Please sign in to comment.