Merge pull request #15 from Police-Data-Accessibility-Project/url_status
URL caching updates
josh-chamberlain authored Feb 9, 2024
2 parents 6258519 + dc38d01 commit b56f027
Showing 2 changed files with 116 additions and 83 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/update.yml
@@ -30,7 +30,7 @@ jobs:
         pip install -r requirements.txt
 
     - name: Extract data
-      run: python cache_url.py cache_info.json
+      run: python cache_url.py
       env:
-        INPUT_FILE: cache_info.json
-
+        VUE_APP_PDAP_API_KEY: ${{ secrets.VUE_APP_PDAP_API_KEY }}
+        VITE_VUE_APP_BASE_URL: ${{ secrets.VITE_VUE_APP_BASE_URL }}
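With this change the workflow no longer passes cache_info.json to the script (either as a command-line argument or as the INPUT_FILE variable); the script is expected to read its configuration from the VUE_APP_PDAP_API_KEY and VITE_VUE_APP_BASE_URL environment variables instead. A minimal startup guard along these lines (hypothetical, not part of this commit) would fail fast if the secrets were not exported:

import os

# Hypothetical guard, not in the repository: fail fast when the workflow
# secrets were not exported into the environment.
REQUIRED_VARS = ("VUE_APP_PDAP_API_KEY", "VITE_VUE_APP_BASE_URL")
missing = [name for name in REQUIRED_VARS if not os.getenv(name)]
if missing:
    raise RuntimeError("Missing environment variables: " + ", ".join(missing))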
193 changes: 113 additions & 80 deletions cache_url.py
@@ -1,104 +1,137 @@
 import json
 from datetime import datetime, timedelta
+from concurrent.futures import as_completed, ThreadPoolExecutor
+from tqdm import tqdm
 import requests
 import os
 import time
 
-def match_freq(update_frequency):
+API_KEY = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY")
+UPDATE_FREQUENCY_MAPPING = {
+    "Incident-based": 7,
+    "< Hourly": 1 / 24,
+    "Hourly": 1 / 24,
+    "Daily": 1,
+    "Weekly": 7,
+    "Bi-weekly": 14,
+    "Monthly": 30,
+    "Quarterly": 90,
+    "Annually": 365,
+    "> Annually": 730,
+    "On request": None,
+    "No updates / rarely updated": None,
+    "Other": None,
+}
 
-    update_frequency_mapping = {
-        "Incident-based": 7,
-        "< Hourly": 1/24,
-        "Hourly": 1/24,
-        "Daily": 1,
-        "Weekly": 7,
-        "Bi-weekly": 14,
-        "Monthly": 30,
-        "Quarterly": 90,
-        "Annually": 365,
-        "> Annually": 730,
-        "On request": None,
-        "No updates / rarely updated": None,
-        "Other": None,
-    }
-
-    update_delta = update_frequency_mapping.get(update_frequency)
-
-    return update_delta
-
 
-api_key = "Bearer " + os.getenv("VUE_APP_PDAP_API_KEY")
-response = requests.get("https://data-sources.pdap.io/api/archives", headers={"Authorization": api_key})
-data = response.json()
+def cache_url(entry):
+    entry["broken_source_url_as_of"] = None
+    source_url = entry.get("source_url")
+    if source_url is None:
+        entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d")
+        # try:
+        entry_json = json.dumps(entry)
+        requests.put(
+            f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+            json=entry_json,
+            headers={"Authorization": API_KEY},
+        )
+        raise Exception("No source_url")
+    # except Exception as error:
+    #     print(str(error))
+    #     exceptions.append({
+    #         "source_url": source_url,
+    #         "exception": str(error)})
 
-# Extract url info and cache if needed
-exceptions = []
-if data is not str:
-    for entry in data:
-        entry["broken_source_url_as_of"] = None
-        source_url = entry.get("source_url")
-        if source_url is None:
-            entry["broken_source_url_as_of"] = datetime.now().strftime("%Y-%m-%d")
-            try:
-                entry_json = json.dumps(entry)
-                response = requests.put("https://data-sources.pdap.io/api/archives", json=entry_json, headers={"Authorization": api_key})
-                raise Exception("No source_url")
-            except Exception as error:
-                print(str(error))
-                exceptions.append({"agency_name": entry.get("agency_name"),
-                                   "source_url": source_url,
-                                   "exception": str(error)})
-            continue
-        update_delta = match_freq(entry.get("update_frequency"))
-        agency_name = entry.get("agency_name")
-        if update_delta is None:
-            update_delta = datetime.max - datetime.today()
-        else:
-            update_delta = timedelta(days=int(update_delta))
+    update_delta = (
+        UPDATE_FREQUENCY_MAPPING[entry.get("update_frequency")]
+        if entry.get("update_frequency") is not None
+        else None
+    )
+    if update_delta is None:
+        update_delta = datetime.max - datetime.today()
+    else:
+        update_delta = timedelta(days=int(update_delta))
 
-        last_cached = entry.get("last_cached")
-        if last_cached is not None:
-            last_cached = datetime.strptime(last_cached, "%Y-%m-%d")
-        else:
-            last_cached = datetime.min
-
+    last_cached = entry.get("last_cached")
+    if last_cached is not None:
+        last_cached = datetime.strptime(last_cached, "%Y-%m-%d")
+    else:
         # Check if website exists in archive and compare archived website to current site
+        last_cached = datetime.min
+        website_info_data = None
         try:
-            website_info = requests.get(f"https://archive.org/wayback/available?url={source_url}")
+            website_info = requests.get(
+                f"https://archive.org/wayback/available?url={source_url}"
+            )
             website_info_data = website_info.json()
             if website_info_data["archived_snapshots"]:
-                website_info_data_last_cached = datetime.strptime(website_info_data["archived_snapshots"]["closest"]["timestamp"], "%Y%m%d%H%M%S")
-                website_info_data_source_url = website_info_data["archived_snapshots"]["closest"]["url"]
+                website_info_data_last_cached = datetime.strptime(
+                    website_info_data["archived_snapshots"]["closest"]["timestamp"],
+                    "%Y%m%d%H%M%S",
+                )
                 if website_info_data_last_cached > last_cached:
                     last_cached = website_info_data_last_cached
         except Exception as error:
-            print(str(error))
+            # print(str(error))
+            website_info_data = {"archived_snapshots": None}
 
 
-        # Cache if never cached or more than update_delta days have passed since last_cache
-        if not website_info_data["archived_snapshots"] or last_cached + update_delta < datetime.today():
+    # Cache if never cached or more than update_delta days have passed since last_cache
+    if last_cached + update_delta < datetime.today():
+        try:
+            time.sleep(1)
+            api_url = f"http://web.archive.org/save/{source_url}"
+            requests.post(api_url)
+            # Update the last_cached date if cache is successful
+            entry["last_cached"] = datetime.now().strftime("%Y-%m-%d")
+        except Exception as error:
             try:
-                api_url = "http://web.archive.org/save/{}".format(source_url)
-                archive = requests.post(api_url)
+                time.sleep(3)
+                requests.post(api_url)
                 # Update the last_cached date if cache is successful
                 entry["last_cached"] = datetime.now().strftime("%Y-%m-%d")
-            except Exception as error:
+            except:
                 print(str(error))
-                exceptions.append({"agency_name": entry.get("agency_name"),
-                                   "source_url": source_url,
-                                   "exception": str(error)})
-        else:
-            entry["last_cached"] = last_cached.strftime("%Y-%m-%d")
-
-        # Send updated data to Data Sources
-        entry_json = json.dumps(entry)
-        response = requests.put("https://data-sources.pdap.io/api/archives", json=entry_json, headers={"Authorization": api_key})
+                # exceptions.append({
+                #     "source_url": source_url,
+                #     "exception": str(error)})
+    else:
+        entry["last_cached"] = last_cached.strftime("%Y-%m-%d")
+
+    # Send updated data to Data Sources
+    entry_json = json.dumps(entry)
+    requests.put(
+        f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+        json=entry_json,
+        headers={"Authorization": API_KEY},
+    )
 
+
+def main():
+    response = requests.get(
+        f"{os.getenv('VITE_VUE_APP_BASE_URL')}/archives",
+        headers={"Authorization": API_KEY},
+    )
+    data = response.json()
+
+    # Extract url info and cache if needed
+    exceptions = []
+    if data is not str:
+        with ThreadPoolExecutor(max_workers=100) as executor:
+            print("Caching urls...")
+            future_cache = [executor.submit(cache_url, entry) for entry in data]
+
+            for future in tqdm(as_completed(future_cache), total=len(future_cache)):
+                future.result()
+
+    # Write any exceptions to a daily error log
+    # file_name = "ErrorLogs/" + datetime.now().strftime("%Y-%m-%d") + "_errorlog.txt"
+    # with open(file_name, "w") as error_log:
+    #     if len(exceptions) > 0:
+    #         json.dump(exceptions, fp=error_log, indent=4)
+    #     else:
+    #         error_log.write("no exceptions thrown")
+
 
-# Write any exceptions to a daily error log
-file_name = "ErrorLogs/" + datetime.now().strftime("%Y-%m-%d") + "_errorlog.txt"
-with open(file_name, "w") as error_log:
-    if len(exceptions) > 0:
-        json.dump(exceptions, fp=error_log, indent=4)
-    else:
-        error_log.write("no exceptions thrown")
+if __name__ == "__main__":
+    main()
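For reference, the availability check in cache_url() relies on the Wayback Machine's wayback/available endpoint, which reports the closest archived snapshot for a URL. A rough sketch of the response shape being parsed (sample values are illustrative, not taken from PDAP data):

from datetime import datetime

# Illustrative response from https://archive.org/wayback/available?url=<source_url>;
# when no snapshot exists, "archived_snapshots" comes back empty.
sample = {
    "archived_snapshots": {
        "closest": {
            "available": True,
            "status": "200",
            "timestamp": "20240209120000",
            "url": "http://web.archive.org/web/20240209120000/https://example.com/",
        }
    }
}

closest = sample["archived_snapshots"]["closest"]
last_snapshot = datetime.strptime(closest["timestamp"], "%Y%m%d%H%M%S")
print(last_snapshot)  # 2024-02-09 12:00:00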

0 comments on commit b56f027
