diff --git a/.github/workflows/data-validation.yml b/.github/workflows/data-validation.yml index dd8de06..20b7ea8 100644 --- a/.github/workflows/data-validation.yml +++ b/.github/workflows/data-validation.yml @@ -1,22 +1,16 @@ name: datavalidator on: - workflow_dispatch: pull_request: branches: - main - schedule: - - cron: "0 0 * * 0" - jobs: run-workflow-script: runs-on: ubuntu-latest steps: - - name: Checkout main branch + - name: Checkout PR branch uses: actions/checkout@v2 - with: - ref: main - name: Set up Python uses: actions/setup-python@v2 diff --git a/src/tasks/toml_data.py b/src/tasks/toml_data.py index 7c92ed7..30a5fce 100644 --- a/src/tasks/toml_data.py +++ b/src/tasks/toml_data.py @@ -3,7 +3,6 @@ import sys import tomllib import requests -from typing import List from views.airport import Airport, AirportData @@ -14,6 +13,7 @@ def __init__(self, data_dir: str, output_dir: str, export: bool = True): self.output_dir = output_dir self.data = AirportData(airports=[]) + self.checked_urls = {} self.errors = [] self.load_toml_data() @@ -49,13 +49,25 @@ def process_toml(self, file_path: str): self.errors.append(f"Failed to process {file_path}: {e}") def validate_url(self, url: str): + if url in self.checked_urls: + print( + f"URL {url} has already been checked. Valid: {self.checked_urls[url]}" + ) + return self.checked_urls[url] + try: response = requests.head(url, allow_redirects=True, timeout=2) print(f"Checked {url}, status code: {response.status_code}") - return response.status_code != 404 + + is_valid = response.status_code != 404 + + self.checked_urls[url] = is_valid + + return is_valid + except requests.exceptions.RequestException as e: - # If there was an issue with the request, return False print(f"Error checking {url}: {e}") + self.checked_urls[url] = False return False def validate_data(self):