Skip to content

Commit

Permalink
Make entity data extraction more robust
Browse files Browse the repository at this point in the history
 - it now ignores the query string on passed-in URLs
 - add a way to force files to be updated even if md5s match
 - minor cleanups and bump version
  • Loading branch information
mikesname committed Dec 9, 2024
1 parent 65bb3ae commit c013292
Show file tree
Hide file tree
Showing 10 changed files with 302 additions and 588 deletions.
2 changes: 1 addition & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"ext-dom": "*"
},
"require-dev": {
"phpunit/phpunit": "^7"
"phpunit/phpunit": "^8"
},
"license": "EUPL-1.2",
"scripts": {
Expand Down
795 changes: 251 additions & 544 deletions composer.lock

Large diffs are not rendered by default.

8 changes: 1 addition & 7 deletions controllers/FilesController.php
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ public function importAction()
$mime,
$form->getElement('create_exhibit')->isChecked(),
$form->getElement('enhance')->isChecked(),
$form->getElement('force_refresh')->isChecked(),
$created,
$updated,
function () {
Expand Down Expand Up @@ -213,13 +214,6 @@ public function archiveAction()
$zip = new ZipStream\ZipStream($filename, $options);

foreach (get_db()->getTable('Item')->findAll() as $item) {
$files = $associated
? tei_editions_get_associated_files($item)
: (tei_editions_get_main_tei($item)
? [tei_editions_get_main_tei($item)]
: []
);

$files = [];
switch ($name) {
case 'tei':
Expand Down
9 changes: 8 additions & 1 deletion forms/TeiEditions_Form_Import.php
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,13 @@ public function init()
if they contain \'ref\' attributes that point to the Geonames or EHRI data sources.')
]);

$this->addElement('checkbox', 'force_refresh', [
'id' => 'tei-editions-upload-force-refresh',
'label' => __('Force Metadata Refresh'),
'class' => 'checkbox',
'description' => __('Force the importer to update existing items, even if they have already been imported.')
]);

$this->addElement('file', 'enhance_dict', [
'id' => 'tei-editions-enhance-dict',
'required' => false,
Expand All @@ -69,7 +76,7 @@ public function init()
'id' => 'tei-editions-submit'
]);

$this->addDisplayGroup(['file', 'create_exhibit', 'enhance'], 'tei-editions-ingest-opts');
$this->addDisplayGroup(['file', 'create_exhibit', 'enhance', 'force_refresh'], 'tei-editions-ingest-opts');
$this->addDisplayGroup(['enhance_dict', 'enhance_lang'], 'tei-editions-enhance-opts');
$this->addDisplayGroup(['submit'], 'tei-editions_submit');
}
Expand Down
22 changes: 12 additions & 10 deletions helpers/TeiEditions_Helpers_DataFetcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ function __construct($dict_path = null, $lang = 'eng', $opts = array())
/**
* Fetch info about places given an array of URL references.
*
* @param array $nametourl an array mapping place names to URLs
* @param array $names_to_urls an array mapping place names to URLs
*/
public function fetchPlaces($nametourl)
public function fetchPlaces($names_to_urls)
{
$entities = [];
foreach ($nametourl as $name => $url) {
foreach ($names_to_urls as $name => $url) {
// HACK! if we can't get a place, try to look up the
// item as a concept
if (!($e = $this->_getPlace($url, $this->lang))
Expand All @@ -65,12 +65,12 @@ public function fetchPlaces($nametourl)
* Fetch info about people, corporate bodies, and families
* given an array of URL references.
*
* @param array $nametourl an array mapping agent names to URLs
* @param array $names_to_urls an array mapping agent names to URLs
*/
public function fetchHistoricalAgents($nametourl)
public function fetchHistoricalAgents($names_to_urls)
{
$entities = [];
foreach ($nametourl as $name => $url) {
foreach ($names_to_urls as $name => $url) {
if (!($e = $this->_getHistoricalAgent($url, $this->lang))) {
$e = TeiEditionsEntity::create($name, $url);
}
Expand All @@ -82,12 +82,12 @@ public function fetchHistoricalAgents($nametourl)
/**
* Fetch info about keywords/concepts given an array of URL references.
*
* @param array $nametourl an array mapping concept names to URLs
* @param array $names_to_urls an array mapping concept names to URLs
*/
public function fetchConcepts($nametourl)
public function fetchConcepts($names_to_urls)
{
$entities = [];
foreach ($nametourl as $name => $url) {
foreach ($names_to_urls as $name => $url) {
if (!($e = $this->_getConcept($url, $this->lang))) {
$e = TeiEditionsEntity::create($name, $url);
}
Expand Down Expand Up @@ -306,7 +306,9 @@ private function _getConcept($url, $lang = null)
}';

// execute query and extract JSON
$id = basename($url);
// get the last path component, minus the trailing slash and any query string
$url_parts = parse_url($url);
$id = isset($url_parts['path']) ? basename($url_parts['path']) : $url;
$result = $this->_makeGraphQLRequest($req, array("id" => $id, "lang" => $lang));
if (!isset($result['data']['CvocConcept']['description'])) {
// if $lang is set, try with default language
Expand Down
22 changes: 13 additions & 9 deletions helpers/TeiEditions_Helpers_DataImporter.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,19 @@ public function addReferences(TeiEditions_Helpers_DocumentProxy $tei)
* @param string $mime the mime type of the import file. Either text/xml or application/zip are supported
* @param boolean $neatline whether or not to create a Neatline item from the TEI data
* @param boolean $enhance whether or not to run enhancement on the input file by looking up entity metadata
* @param boolean $force whether or not to force the import, refreshing existing items
* @param int $created out-param for number of items created
* @param int $updated out-param for number of items updated
* @param callable $onDone function to call on completion
* @throws Omeka_Record_Exception
*/
public function importData($path, $mime, $neatline, $enhance, &$created, &$updated, $onDone)
public function importData($path, $mime, $neatline, $enhance, $force, &$created, &$updated, $onDone)
{
_log("Performing data import: " . json_encode([
"path" => $path,
"neatline" => $neatline,
"enhance" => $enhance,
"force" => $force,
"mime" => $mime
]));

Expand All @@ -64,10 +66,10 @@ public function importData($path, $mime, $neatline, $enhance, &$created, &$updat
switch ($mime) {
case "text/xml":
case "application/xml":
$this->updateItem($path, $name, $neatline, $enhance, $created, $updated);
$this->updateItem($path, $name, $neatline, $enhance, $force, $created, $updated);
break;
case "application/zip":
$this->readImportZip($path, $neatline, $enhance, $created, $updated);
$this->readImportZip($path, $neatline, $enhance, $force, $created, $updated);
break;
default:
throw new Exception("Unhandled file extension: $mime");
Expand Down Expand Up @@ -186,13 +188,14 @@ private function readImportZip($zipPath, $neatline, $enhance, &$created = 0, &$u
*
* @param string $path the path to the XML file
* @param string $name the item name
* @param $neatline
* @param $enhance
* @param $neatline boolean whether or not to create a Neatline exhibit
* @param $enhance boolean whether or not to enhance the TEI
* @param $force boolean whether or not to force the update
* @param int $created out-param for the number of created items
* @param int $updated out-param for the number of updated items
* @throws Omeka_Record_Exception
*/
private function updateItem($path, $name, $neatline, $enhance, &$created, &$updated)
private function updateItem($path, $name, $neatline, $enhance, $force, &$created, &$updated)
{
_log("Importing file: $path");
$create = false;
Expand Down Expand Up @@ -224,7 +227,7 @@ private function updateItem($path, $name, $neatline, $enhance, &$created, &$upda
$item = $this->getOrCreateItem($doc, $create);
$this->updateItemFromTEI($item, $doc, $neatline);

$this->addOrUpdateItemFile($item, $path, $name, true);
$this->addOrUpdateItemFile($item, $path, $name, true, $force);
if ($create) {
$created++;
} else {
Expand Down Expand Up @@ -301,8 +304,9 @@ private function updateItemFromTEI(Item $item, TeiEditions_Helpers_DocumentProxy
* @param string $path the file path
* @param string $name the file name
* @param bool $is_primary if this file is the primary TEI
* @param bool $force whether or not to force the update
*/
private function addOrUpdateItemFile(Item $item, $path, $name, $is_primary = false)
private function addOrUpdateItemFile(Item $item, $path, $name, $is_primary = false, $force = false)
{
$primaryXml = $is_primary ? $name : null;
$md5 = md5_file($path);
Expand All @@ -312,7 +316,7 @@ private function addOrUpdateItemFile(Item $item, $path, $name, $is_primary = fal
$primaryXml = $file->original_filename;
}
if ($file->original_filename == $name) {
if ($file->authentication == $md5) {
if ($force === false && $file->authentication == $md5) {
// We've already got the same md5 with the same
// name so no need to update it.
error_log("Not refreshing $name, file exists with the same md5");
Expand Down
12 changes: 6 additions & 6 deletions helpers/TeiEditions_Helpers_TeiEnhancer.php
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,15 @@ public function addReferences(TeiEditions_Helpers_DocumentProxy $tei)
foreach ($this::$TYPES as $typeSpec) {
list($listTag, $itemTag, $nameTag, $srcTag, $fetcherFunc) = $typeSpec;

$existing = $tei->getEntities($listTag, $itemTag, $nameTag);
$existing_entities = $tei->getEntities($listTag, $itemTag, $nameTag);
$refs = $tei->entityReferences($srcTag, $idx, $addRefs = true);
foreach ($this->dataSrc->{$fetcherFunc}($refs) as $ref) {
if (!in_array($ref, $existing)) {
error_log("Found $srcTag " . $ref->name);
$tei->addEntity($listTag, $itemTag, $nameTag, $ref);
foreach ($this->dataSrc->{$fetcherFunc}($refs) as $entity) {
if (!in_array($entity, $existing_entities)) {
error_log("Found $srcTag " . $entity->name . "\n");
$tei->addEntity($listTag, $itemTag, $nameTag, $entity);
$added++;
} else {
error_log("Not updating existing $srcTag " . $ref->name);
error_log("Not updating existing $srcTag " . $entity->name);
}
}
}
Expand Down
16 changes: 8 additions & 8 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "tei-editions",
"description": "An Omeka plugin for importing and rendering structured TEI files for EHRI digital editions.",
"version": "1.0.0-pre11",
"version": "1.0.0-pre12",
"devDependencies": {
"grunt": "^1.5.3",
"grunt-cli": "^1.4.3",
Expand Down
2 changes: 1 addition & 1 deletion plugin.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ link="https://github.com/EHRI/TeiEditions"
support_link="https://github.com/EHRI/TeiEditions/issues"
omeka_minimum_version="2.6"
omeka_target_version="2.6"
version="1.0.0-pre11"
version="1.0.0-pre12"

0 comments on commit c013292

Please sign in to comment.