diff --git a/ngr_spider/cli.py b/ngr_spider/cli.py index aa68ca4..c48e9c1 100755 --- a/ngr_spider/cli.py +++ b/ngr_spider/cli.py @@ -70,6 +70,7 @@ def main_services(args): if protocols: protocol_list = protocols.split(",") + LOGGER.info("main_services start.") if not show_warnings: cm = warnings.catch_warnings() warnings.simplefilter("ignore") @@ -119,6 +120,8 @@ def main_services(args): content = get_output(pretty, yaml_output, config, no_updated, jq_filter) write_output(output_file, az_conn_string, az_container, yaml_output, content) + LOGGER.info("main_services end.") + LOGGER.info(f"output written to {output_file}") @@ -143,6 +146,8 @@ def main_layers(args): setup_logger(log_level) protocol_list = PROTOCOLS + LOGGER.info("main_layers start.") + csw_client = CSWClient(csw_url) if protocols: @@ -244,6 +249,8 @@ def main_layers(args): LOGGER.info(f"failed to index {len(service_errors)} services") message = "\n".join(service_errors_string) LOGGER.info(f"failed service urls:\n{message}") + + LOGGER.info("main_layers end.") LOGGER.info(f"output written to {output_file}") diff --git a/ngr_spider/csw_client.py b/ngr_spider/csw_client.py index 89e14fe..e688030 100644 --- a/ngr_spider/csw_client.py +++ b/ngr_spider/csw_client.py @@ -9,7 +9,6 @@ LOGGER = logging.getLogger(__name__) - class CSWClient: def __init__(self, csw_url): self.csw_url = csw_url @@ -33,32 +32,41 @@ def _get_csw_records( self, query: str, maxresults: int = 0, no_filter: bool = False ) -> list[CswServiceRecord]: csw = CatalogueServiceWeb(self.csw_url) - result: list[CswServiceRecord] = [] - start = 1 - maxrecord = maxresults if (maxresults < 100 and maxresults != 0) else 100 while True: - csw.getrecords2( - maxrecords=maxrecord, - cql=query, - startposition=start, - esn="full", - outputschema="http://www.isotc211.org/2005/gmd", - ) - records = [CswServiceRecord(rec[1].xml) for rec in csw.records.items()] - result.extend(records) - if ( - maxresults != 0 and len(result) >= maxresults - ): # break only early when maxresults set - break - if csw.results["nextrecord"] != 0: - start = csw.results["nextrecord"] - continue - break - result_out: list[CswServiceRecord] = result - if not no_filter: - result_out = self._filter_service_records(result) - return sorted(result_out, key=lambda x: x.title) + result: list[CswServiceRecord] = [] + start = 1 + maxrecord = maxresults if (maxresults < 100 and maxresults != 0) else 100 + matched = 0 + while True: + csw.getrecords2( + maxrecords=maxrecord, + cql=query, + startposition=start, + esn="full", + outputschema="http://www.isotc211.org/2005/gmd", + sortby="CreationDate:A" + ) + if start == 1: + matched = csw.results["matches"] + LOGGER.info("Number of matched servcies before filtering: " + str(matched)) + elif matched != csw.results["matches"]: + LOGGER.info("Number of matched servcies has been changed: old = " + str(matched) + ", new = " + str(csw.results["matches"])) + break # inner loop + + records = [CswServiceRecord(rec[1].xml) for rec in csw.records.items()] + result.extend(records) + if ( + maxresults != 0 and len(result) >= maxresults + ): # break only early when maxresults set + break + if csw.results["nextrecord"] != 0: + start = csw.results["nextrecord"] + continue + result_out: list[CswServiceRecord] = result + if not no_filter: + result_out = self._filter_service_records(result) + return sorted(result_out, key=lambda x: x.title) def _get_csw_records_by_protocol( self,