From 765244e60d3fa6dc6c4c05a88a259d960d7e6a5f Mon Sep 17 00:00:00 2001
From: Marty Bode
Date: Wed, 31 Jan 2024 13:56:48 -0500
Subject: [PATCH 1/2] use multithreading, scan all urls, change url_status when broken url found

---
 .github/workflows/update.yml |   4 +-
 cache_url.py                 | 193 ++++++++++++++++++++---------
 2 files changed, 115 insertions(+), 82 deletions(-)

diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml
index f5dcd2b..55bff79 100644
--- a/.github/workflows/update.yml
+++ b/.github/workflows/update.yml
@@ -30,7 +30,7 @@ jobs:
           pip install -r requirements.txt
 
       - name: Extract data
-        run: python cache_url.py cache_info.json
+        run: python cache_url.py
         env:
-          INPUT_FILE: cache_info.json
+          VUE_APP_PDAP_API_KEY: ${{ secrets.VUE_APP_PDAP_API_KEY }}
 
diff --git a/cache_url.py b/cache_url.py
index f6cd1b3..44c4336 100644
--- a/cache_url.py
+++ b/cache_url.py
@@ -1,104 +1,137 @@
 import json
 from datetime import datetime, timedelta
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from tqdm import tqdm
 import requests
 import os
+import time
 
-def match_freq(update_frequency):
+API_KEY = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY")
+UPDATE_FREQUENCY_MAPPING = {
+    "Incident-based": 7,
+    "< Hourly": 1 / 24,
+    "Hourly": 1 / 24,
+    "Daily": 1,
+    "Weekly": 7,
+    "Bi-weekly": 14,
+    "Monthly": 30,
+    "Quarterly": 90,
+    "Annually": 365,
+    "> Annually": 730,
+    "On request": None,
+    "No updates / rarely updated": None,
+    "Other": None,
+}
 
-    update_frequency_mapping = {
-        "Incident-based": 7,
-        "< Hourly": 1/24,
-        "Hourly": 1/24,
-        "Daily": 1,
-        "Weekly": 7,
-        "Bi-weekly": 14,
-        "Monthly": 30,
-        "Quarterly": 90,
-        "Annually": 365,
-        "> Annually": 730,
-        "On request": None,
-        "No updates / rarely updated": None,
-        "Other": None,
-    }
-    update_delta = update_frequency_mapping.get(update_frequency)
-
-    return update_delta
-
-
-api_key = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY")
-response = requests.get("https://data-sources.pdap.io/api/archives", headers={"Authorization": api_key})
-data = response.json()
 
+def cache_url(entry):
+    entry["broken_source_url_as_of"] = None
+    source_url = entry.get("source_url")
+    if source_url is None:
+        entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d")
+        # try:
+        entry_json = json.dumps(entry)
+        requests.put(
+            f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+            json=entry_json,
+            headers={"Authorization": API_KEY},
+        )
+        raise Exception("No source_url")
+        # except Exception as error:
+        #     print(str(error))
+        #     exceptions.append({
+        #         "source_url": source_url,
+        #         "exception": str(error)})
 
-# Extract url info and cache if needed
-exceptions = []
-if data is not str:
-    for entry in data:
-        entry["broken_source_url_as_of"] = None
-        source_url = entry.get("source_url")
-        if source_url is None:
-            entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d")
-            try:
-                entry_json = json.dumps(entry)
-                response = requests.put("https://data-sources.pdap.io/api/archives", json=entry_json, headers={"Authorization": api_key})
-                raise Exception("No source_url")
-            except Exception as error:
-                print(str(error))
-                exceptions.append({"agency_name": entry.get("agency_name"),
-                                   "source_url": source_url,
-                                   "exception": str(error)})
-                continue
-        update_delta = match_freq(entry.get("update_frequency"))
-        agency_name = entry.get("agency_name")
-        if update_delta is None:
-            update_delta = datetime.max - datetime.today()
-        else:
-            update_delta = timedelta(days=int(update_delta))
+    update_delta = (
+        UPDATE_FREQUENCY_MAPPING[entry.get("update_frequency")]
+        if entry.get("update_frequency") is not None
+        else None
+    )
+    if update_delta is None:
+        update_delta = datetime.max - datetime.today()
+    else:
+        update_delta = timedelta(days=int(update_delta))
 
-        last_cached = entry.get("last_cached")
-        if last_cached is not None:
-            last_cached = datetime.strptime(last_cached, "%Y-%m-%d")
-        else:
-            last_cached = datetime.min
-
+    last_cached = entry.get("last_cached")
+    if last_cached is not None:
+        last_cached = datetime.strptime(last_cached, "%Y-%m-%d")
+    else:
+        last_cached = datetime.min
 
     # Check if website exists in archive and compare archived website to current site
     website_info_data = None
     try:
-        website_info = requests.get(f"https://archive.org/wayback/available?url={source_url}")
+        website_info = requests.get(
+            f"https://archive.org/wayback/available?url={source_url}"
+        )
         website_info_data = website_info.json()
         if website_info_data["archived_snapshots"]:
-            website_info_data_last_cached = datetime.strptime(website_info_data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S")
-            website_info_data_source_url = website_info_data["archived_snapshots"]["closest"]["url"]
+            website_info_data_last_cached = datetime.strptime(
+                website_info_data["archived_snapshots"]["closest"]["timestamp"],
+                "%Y%m%d%H%M%S",
+            )
             if website_info_data_last_cached > last_cached:
                 last_cached = website_info_data_last_cached
     except Exception as error:
-        print(str(error))
+        # print(str(error))
         website_info_data = {"archived_snapshots": None}
-
-        # Cache if never cached or more than update_delta days have passed since last_cache
-        if not website_info_data["archived_snapshots"] or last_cached + update_delta < datetime.today():
+
+    # Cache if never cached or more than update_delta days have passed since last_cache
+    if last_cached + update_delta < datetime.today():
+        try:
+            time.sleep(1)
+            api_url = f"http://web.archive.org/save/{source_url}"
+            requests.post(api_url)
+            # Update the last_cached date if cache is successful
+            entry["last_cached"] = datetime.now().strftime("%Y-%m-%d")
+        except Exception as error:
             try:
-                api_url = "http://web.archive.org/save/{}".format(source_url)
-                archive = requests.post(api_url)
+                time.sleep(3)
+                requests.post(api_url)
                 # Update the last_cached date if cache is successful
                 entry["last_cached"] = datetime.now().strftime("%Y-%m-%d")
-            except Exception as error:
+            except:
                 print(str(error))
-                exceptions.append({"agency_name": entry.get("agency_name"),
-                                   "source_url": source_url,
-                                   "exception": str(error)})
-        else:
-            entry["last_cached"] = last_cached.strftime("%Y-%m-%d")
-
-        # Send updated data to Data Sources
-        entry_json = json.dumps(entry)
-        response = requests.put("https://data-sources.pdap.io/api/archives", json=entry_json, headers={"Authorization": api_key})
+                # exceptions.append({
+                #     "source_url": source_url,
+                #     "exception": str(error)})
+    else:
+        entry["last_cached"] = last_cached.strftime("%Y-%m-%d")
+
+    # Send updated data to Data Sources
+    entry_json = json.dumps(entry)
+    requests.put(
+        f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+        json=entry_json,
+        headers={"Authorization": API_KEY},
+    )
+
+
+def main():
+    response = requests.get(
+        f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+        headers={"Authorization": API_KEY},
+    )
+    data = response.json()
+
+    # Extract url info and cache if needed
+    exceptions = []
+    if data is not str:
+        with ThreadPoolExecutor(max_workers=100) as executor:
+            print("Caching urls...")
+            future_cache = [executor.submit(cache_url, entry) for entry in data]
+
+            for future in tqdm(as_completed(future_cache), total=len(future_cache)):
+                future.result()
+
+    # Write any exceptions to a daily error log
+    # file_name = "ErrorLogs/" + datetime.now().strftime("%Y-%m-%d") + "_errorlog.txt"
+    # with open(file_name, "w") as error_log:
+    #     if len(exceptions) > 0:
+    #         json.dump(exceptions, fp=error_log, indent=4)
+    #     else:
+    #         error_log.write("no exceptions thrown")
+
 
-# Write any exceptions to a daily error log
-file_name = "ErrorLogs/" + datetime.now().strftime("%Y-%m-%d") + "_errorlog.txt"
-with open(file_name, "w") as error_log:
-    if len(exceptions) > 0:
-        json.dump(exceptions, fp=error_log, indent=4)
-    else:
-        error_log.write("no exceptions thrown")
+
+
+if __name__ == "__main__":
+    main()

From dc38d012910120cd8fb79d57a87220db87d27f8e Mon Sep 17 00:00:00 2001
From: Marty Bode
Date: Thu, 1 Feb 2024 15:27:11 -0500
Subject: [PATCH 2/2] missing variable

---
 .github/workflows/update.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml
index 55bff79..d032cc6 100644
--- a/.github/workflows/update.yml
+++ b/.github/workflows/update.yml
@@ -33,4 +33,4 @@
         run: python cache_url.py
         env:
           VUE_APP_PDAP_API_KEY: ${{ secrets.VUE_APP_PDAP_API_KEY }}
-
+          VITE_VUE_APP_BASE_URL: ${{ secrets.VITE_VUE_APP_BASE_URL }}
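
Note on the concurrency change: the heart of patch 1 is swapping the sequential for-loop over entries for a thread pool that fans cache_url out across every entry, draining the futures through tqdm for a progress bar. A minimal, self-contained sketch of that pattern follows; process_entry and the sample entries are illustrative stand-ins, not code from the patch.

from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm import tqdm


def process_entry(entry):
    # Stand-in for the patch's cache_url(entry) worker.
    return entry.get("source_url")


def run_all(entries, max_workers=100):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit one job per entry, then drain completions with a progress bar.
        futures = [executor.submit(process_entry, e) for e in entries]
        for future in tqdm(as_completed(futures), total=len(futures)):
            # result() re-raises anything the worker raised; the patch calls
            # it bare, so a single raising worker stops the drain loop.
            future.result()


if __name__ == "__main__":
    run_all([{"source_url": f"https://example.com/{i}"} for i in range(5)])

With max_workers=100 the pool can hold up to 100 simultaneous connections to the Wayback Machine, which is worth weighing against the time.sleep(1) throttle the patch places inside each worker.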
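
Note on the Wayback Machine calls: each worker makes at most two requests, an availability lookup at https://archive.org/wayback/available and a save request at http://web.archive.org/save/<url>, the latter retried once after a longer sleep. The sketch below restates that logic against the same endpoints the diff uses; the helper names and the single retry loop are a restructuring for clarity, not the patch's code.

import time
from datetime import datetime

import requests


def latest_snapshot(source_url):
    """Return the closest archived snapshot's timestamp, or None."""
    info = requests.get(
        "https://archive.org/wayback/available", params={"url": source_url}
    ).json()
    closest = (info.get("archived_snapshots") or {}).get("closest")
    if not closest:
        return None
    # Wayback timestamps look like 20240131135648.
    return datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")


def save_page(source_url, delays=(1, 3)):
    """POST to the save endpoint, sleeping then retrying once as the patch does."""
    for delay in delays:
        time.sleep(delay)
        try:
            response = requests.post(f"http://web.archive.org/save/{source_url}")
            response.raise_for_status()
            return True
        except requests.RequestException as error:
            print(f"save failed for {source_url}: {error}")
    return False

Unlike this sketch, the patch never inspects the save response's status code, so a rate-limited or failed save still updates last_cached; raise_for_status() here is added strictness, not the patch's behavior.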