From f4f1e1e240543d181e95a36feb99fc9bd25622a8 Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Wed, 19 Jul 2023 10:52:56 -0700
Subject: [PATCH 1/2] Implement xml_file fetcher

---
Review changes:
 * raise FetchError for any requests failure, name the URL, chain the cause
 * parse response.content so the declared XML encoding is honored
 * compute page offsets from the loop index instead of self.write_page
 * wrap every written page in a single <records> root element

 metadata_fetcher/fetchers/xml_file_fetcher.py | 94 +++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 metadata_fetcher/fetchers/xml_file_fetcher.py

diff --git a/metadata_fetcher/fetchers/xml_file_fetcher.py b/metadata_fetcher/fetchers/xml_file_fetcher.py
new file mode 100644
index 000000000..b797323d0
--- /dev/null
+++ b/metadata_fetcher/fetchers/xml_file_fetcher.py
@@ -0,0 +1,94 @@
+import json
+from .Fetcher import Fetcher, FetchError
+import requests
+from xml.etree import ElementTree
+import settings
+import math
+
+
+class XmlFileFetcher(Fetcher):
+    """
+    Fetches one XML file and writes its <record> elements out in pages
+    of `per_page` records each.
+    """
+
+    def __init__(self, params: dict[str]):
+        """
+        Parameters:
+            params: dict[str]
+                "collection_id" and "harvest_data" -> "url" are read here;
+                the full dict is also passed to Fetcher.__init__
+        """
+        super(XmlFileFetcher, self).__init__(params)
+
+        self.collection_id = params.get("collection_id")
+        self.url = params.get("harvest_data").get("url")
+        self.per_page = 100
+
+    def fetch_page(self) -> int:
+        """
+        Requests self.url and hands the response to fetch_all_pages.
+
+        Returns:
+            int: the number of records found in the file
+
+        Raises:
+            FetchError: if the request fails or returns an error status
+        """
+        page = {"url": self.url}
+        print(
+            f"[{self.collection_id}]: Fetching {page.get('url')}"
+        )
+        try:
+            response = requests.get(**page)
+            response.raise_for_status()
+        except requests.exceptions.RequestException as err:
+            # RequestException is the base class of HTTPError and also
+            # covers connection failures and timeouts
+            raise FetchError(
+                f"[{self.collection_id}]: unable to fetch "
+                f"{page.get('url')}") from err
+
+        return self.fetch_all_pages(response)
+
+    def fetch_all_pages(self, response) -> int:
+        """
+        Splits the records in the response into pages and hands each
+        page to fetchtolocal or fetchtos3, depending on settings.DATA_DEST.
+
+        Parameters:
+            response: Requests.response
+
+        Returns:
+            int: the number of records found in the response
+        """
+        # parse the raw bytes so the XML parser applies the encoding the
+        # document declares, not the charset requests guessed for .text
+        xml = ElementTree.fromstring(response.content)
+        record_nodes = xml.findall(".//record")
+        pages = math.ceil(len(record_nodes) / self.per_page)
+
+        for page in range(pages):
+            # slice by the loop index so the offset does not depend on
+            # the value self.write_page held before this call
+            skip = page * self.per_page
+            items = record_nodes[skip:(skip + self.per_page)]
+            # a page holds several <record> elements, so give it a
+            # single root element to keep it a well-formed document
+            content = "<records>" + \
+                "".join([ElementTree.tostring(item, encoding="unicode")
+                         for item in items]) + "</records>"
+            if settings.DATA_DEST == 'local':
+                self.fetchtolocal(content)
+            else:
+                self.fetchtos3(content)
+            self.write_page += 1
+        return len(record_nodes)
+
+    def json(self) -> str:
+        """
+        This fetcher is run once, then done
+
+        Returns: str
+        """
+        return json.dumps({"finished": True})

From 8c7baaf462c2b4f2c2f435de1a16a4d3c037bd59 Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Thu, 20 Jul 2023 14:06:57 -0700
Subject: [PATCH 2/2] Implement pastperfect mapper

---
Review changes:
 * map_is_shown_by no longer raises TypeError when thumbnail is missing
 * parse no longer stores the PPO-Data wrapper element as a record field
 * removed commented-out code from map_subject
 * flagged the "description" <- "title" mapping for confirmation

 .../pastperfect_xml/pastperfect_xml_mapper.py | 106 +++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py

diff --git a/metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py b/metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py
new file mode 100644
index 000000000..968b3a076
--- /dev/null
+++ b/metadata_mapper/mappers/pastperfect_xml/pastperfect_xml_mapper.py
@@ -0,0 +1,106 @@
+import re
+from xml.etree import ElementTree
+from collections import defaultdict
+from ..mapper import Vernacular, Record
+
+
+class PastperfectXmlRecord(Record):
+    """
+    Maps a record parsed from PastPerfect XML into the UCLDC schema.
+    """
+
+    def UCLDC_map(self):
+        """
+        Returns:
+            dict: target field names mapped to either a value read from
+                self.source_metadata or a callable that produces one
+        """
+        return {
+            "calisphere-id": self.legacy_couch_db_id.split('--')[1],
+            "metadata/identifier": "identifier",
+            "isShownAt": self.map_is_shown_at,
+            "isShownBy": self.map_is_shown_by,
+            "title": self.source_metadata.get("title"),
+            "date": self.source_metadata.get("date"),
+            # NOTE(review): this reads "title", the same source field as
+            # the "title" entry above -- confirm "description" was not
+            # intended
+            "description": self.source_metadata.get("title"),
+            "subject": self.map_subject,
+            "spatial": self.source_metadata.get("place"),
+            "temporal": self.source_metadata.get("coverage"),
+            "format": self.collate_fields(["medium", "material"]),
+            "creator": self.collate_fields(["creator", "author", "artist",
+                                            "photographer"]),
+            "identifier": self.collate_fields(["identifier", "objectid",
+                                               "arkid"]),
+            "type": self.source_metadata.get("objectname"),
+            "relation": self.source_metadata.get("collection"),
+            "rights": self.source_metadata.get("rights")
+        }
+
+    def map_is_shown_at(self):
+        """
+        Returns:
+            the record's "url" value, or None when it has none
+        """
+        return self.source_metadata.get("url", None)
+
+    def map_is_shown_by(self):
+        """
+        Returns:
+            the record's "thumbnail" value with ".tif" replaced by ".jpg";
+            a missing or empty thumbnail is returned unchanged
+        """
+        thumbnail = self.source_metadata.get("thumbnail")
+        if not thumbnail:
+            # nothing to rewrite; `".tif" in None` would raise TypeError
+            return thumbnail
+        return thumbnail.replace(".tif", ".jpg")
+
+    def map_subject(self):
+        """
+        Returns:
+            list[dict]: one {"name": value} entry per collated value of
+                the "subject", "people" and "searchterms" fields
+        """
+        values = self.collate_fields(["subject", "people", "searchterms"])()
+        return [{"name": value} for value in values]
+
+
+class PastperfectXmlVernacular(Vernacular):
+    """
+    Vernacular for PastPerfect XML pages; uses PastperfectXmlRecord.
+    """
+    record_cls = PastperfectXmlRecord
+
+    def skip(self, record):
+        """
+        Returns:
+            bool: True when the record has no (or an empty) "thumbnail"
+        """
+        return not record.get("thumbnail", False)
+
+    def parse(self, api_response):
+        """
+        Parameters:
+            api_response: XML document holding the page's records
+
+        Returns:
+            the result of self.get_records for the parsed records
+        """
+        xml = ElementTree.fromstring(api_response)
+        data_nodes = xml.findall('.//record/metadata/PPO-Data')
+
+        records = []
+        for node in data_nodes:
+            record = {"metadata/identifier": "identifier"}
+            for tag in node.iter():
+                if tag is node:
+                    # iter() yields the PPO-Data element itself first;
+                    # only its descendants are record fields
+                    continue
+                record[tag.tag] = tag.text
+            records.append(record)
+
+        return self.get_records(records)