Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Implement XML fetcher / PastPerfect mapper #468

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions metadata_fetcher/fetchers/xml_file_fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import json
from .Fetcher import Fetcher, FetchError
import requests
from xml.etree import ElementTree
import settings
import math

class XmlFileFetcher(Fetcher):
    """Download one XML file and write its <record> nodes out in pages."""

    def __init__(self, params: dict):
        """
        Parameters:
            params: dict
                "collection_id" and "harvest_data" (whose "url" entry is
                the XML file to download) are read here; the whole dict is
                also handed to the base Fetcher.
        """
        super().__init__(params)

        self.collection_id = params.get("collection_id")
        self.url = params.get("harvest_data").get("url")
        # Number of <record> nodes bundled into each written page.
        self.per_page = 100

    def fetch_page(self) -> int:
        """
        Download the XML file at self.url and write out all of its records.

        Returns:
            int
                Number of <record> nodes found in the file.

        Raises:
            FetchError: the server answered with an HTTP error status.
        """
        print(
            f"[{self.collection_id}]: Fetching {self.url}"
        )
        try:
            response = requests.get(self.url)
            response.raise_for_status()
        except requests.exceptions.HTTPError as err:
            # Name the URL and keep the original error chained so the
            # failure can be diagnosed from the message alone.
            raise FetchError(
                f"[{self.collection_id}]: unable to fetch {self.url}"
            ) from err

        return self.fetch_all_pages(response)

    def fetch_all_pages(self, response) -> int:
        """
        Split the downloaded document into pages of self.per_page records
        and write each page to the configured destination.

        Parameters:
            response: Requests.response

        Returns:
            int
                Number of <record> nodes found in the document.
        """
        xml = ElementTree.fromstring(response.text)
        record_nodes = xml.findall(".//record")
        pages = math.ceil(len(record_nodes) / self.per_page)

        for page in range(pages):
            # Slice by the loop index, not self.write_page: write_page is
            # not set in this class (presumably initialised by the base
            # Fetcher and used to number output pages - confirm), so a
            # non-zero starting value must not cause records to be skipped.
            start = page * self.per_page
            items = record_nodes[start:start + self.per_page]
            content = "<records>" + "".join(
                ElementTree.tostring(item, encoding="unicode")
                for item in items
            ) + "</records>"
            if settings.DATA_DEST == 'local':
                self.fetchtolocal(content)
            else:
                self.fetchtos3(content)
            self.write_page += 1
        return len(record_nodes)

    def json(self) -> str:
        """
        This fetcher is run once, then done

        Returns: str
        """
        return json.dumps({"finished": True})

Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import re

Check failure on line 1 in metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (F401)

metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py:1:8: F401 `re` imported but unused
from xml.etree import ElementTree
from collections import defaultdict

Check failure on line 3 in metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (F401)

metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py:3:25: F401 `collections.defaultdict` imported but unused
from ..mapper import Vernacular, Record


class PastperfectXmlRecord(Record):
    """Field mapping from a PastPerfect XML record to UCLDC fields."""

    def UCLDC_map(self):
        """
        Build the UCLDC field mapping for this record.

        Returns:
            dict
                Keys are UCLDC field names. Values are either read directly
                from self.source_metadata or are callables (bound methods,
                or the result of self.collate_fields) - presumably invoked
                later by the Record base class; confirm against the base.
        """
        return {
            "calisphere-id": self.legacy_couch_db_id.split('--')[1],
            # NOTE(review): this entry maps a literal string to a literal
            # string and looks like a leftover - confirm it is intended.
            "metadata/identifier": "identifier",
            "isShownAt": self.map_is_shown_at,
            "isShownBy": self.map_is_shown_by,
            "title": self.source_metadata.get("title"),
            "date": self.source_metadata.get("date"),
            # NOTE(review): description is filled from "title", the same
            # source key as "title" above - confirm this is not a
            # copy-paste slip for a "description" source field.
            "description": self.source_metadata.get("title"),
            "subject": self.map_subject,
            "spatial": self.source_metadata.get("place"),
            "temporal": self.source_metadata.get("coverage"),
            "format": self.collate_fields(["medium", "material"]),
            "creator": self.collate_fields(["creator", "author", "artist",
                                            "photographer"]),
            "identifier": self.collate_fields(["identifier", "objectid",
                                               "arkid"]),
            "type": self.source_metadata.get("objectname"),
            "relation": self.source_metadata.get("collection"),
            "rights": self.source_metadata.get("rights")
        }

    def map_is_shown_at(self):
        """Return the record's "url" value, or None when it is absent."""
        return self.source_metadata.get("url", None)

    def map_is_shown_by(self):
        """
        Return the record's thumbnail with ".tif" replaced by ".jpg".

        A missing or empty thumbnail is returned as-is instead of raising
        TypeError on the substring test.
        """
        thumbnail = self.source_metadata.get("thumbnail")
        if not thumbnail:
            return thumbnail
        # str.replace is a no-op when ".tif" does not occur.
        return thumbnail.replace(".tif", ".jpg")

    def map_subject(self):
        """
        Return the collated "subject", "people" and "searchterms" values,
        each wrapped as {"name": value}.
        """
        # collate_fields returns a callable; call it to get the values.
        values = self.collate_fields(["subject", "people", "searchterms"])()
        # Treat a None result as "no subjects" rather than failing.
        return [{"name": value} for value in values or []]


class PastperfectXmlVernacular(Vernacular):
    """Parse fetched PastPerfect XML pages into PastperfectXmlRecord data."""

    record_cls = PastperfectXmlRecord

    def skip(self, record):
        """Return True when the record has no truthy "thumbnail" value."""
        has_thumbnail = bool(record.get("thumbnail", False))
        return not has_thumbnail

    def parse(self, api_response):
        """
        Turn an XML page into records.

        Every record/metadata/PPO-Data node becomes one dict keyed by
        element tag (the PPO-Data element itself included, since
        Element.iter() yields the starting element first), holding each
        element's text. The dicts are then passed to self.get_records.
        """
        root = ElementTree.fromstring(api_response)

        parsed = []
        for data_node in root.findall('.//record/metadata/PPO-Data'):
            # Seed entry goes in first so that a same-named tag overrides it.
            fields = {"metadata/identifier": "identifier"}
            fields.update((element.tag, element.text)
                          for element in data_node.iter())
            parsed.append(fields)

        return self.get_records(parsed)
Loading