From 079d176aa8cd7c97ebe8db32d58152b65edff09f Mon Sep 17 00:00:00 2001 From: rickstaa Date: Sat, 4 Mar 2023 22:01:21 +0100 Subject: [PATCH] ci: add main database check action This commit adds the `check_main_databases` GitHub action that is used to check whether the main databases are correctly formatted. This action checks whether: - The databases don't contain duplicate ICAOs. - The databases don't contain duplicate registration numbers. - The databases don't contain incorrect URLs. --- .github/workflows/check_main_databases.yml | 25 ++++ .gitignore | 5 +- .vscode/settings.json | 18 --- scripts/README.md | 1 + scripts/check_main_databases.py | 150 +++++++++++++++++++++ scripts/derivative_changed.sh | 5 + 6 files changed, 185 insertions(+), 19 deletions(-) create mode 100644 .github/workflows/check_main_databases.yml delete mode 100644 .vscode/settings.json create mode 100644 scripts/check_main_databases.py create mode 100644 scripts/derivative_changed.sh diff --git a/.github/workflows/check_main_databases.yml b/.github/workflows/check_main_databases.yml new file mode 100644 index 00000000..92cd11ca --- /dev/null +++ b/.github/workflows/check_main_databases.yml @@ -0,0 +1,25 @@ +name: Check main databases CSV format + +on: + pull_request: + paths: + - "plane-alert-db.csv" + - "plane-alert-pia.csv" + - "plane-alert-twitter-blocked.csv" + - "plane-alert-ukraine.csv" + - "plane_images.txt" + +jobs: + checkMainDatabases: + runs-on: ubuntu-latest + name: Check whether the main databases are still valid CSVs + steps: + - name: Checkout + uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - run: pip install -r ./scripts/requirements.txt + + - name: Run main database checks + run: python ./scripts/check_main_databases.py diff --git a/.gitignore b/.gitignore index 9bea4330..b968bff9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ - .DS_Store + +# IDE +.vscode +*.code-workspace diff --git a/.vscode/settings.json 
b/.vscode/settings.json deleted file mode 100644 index 26b5e211..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "workbench.colorCustomizations": { - "activityBar.activeBackground": "#78a5a5", - "activityBar.background": "#78a5a5", - "activityBar.foreground": "#15202b", - "activityBar.inactiveForeground": "#15202b99", - "activityBarBadge.background": "#865986", - "activityBarBadge.foreground": "#e7e7e7", - "commandCenter.border": "#e7e7e799", - "sash.hoverBorder": "#78a5a5", - "tab.activeBorder": "#78a5a5", - "titleBar.activeBackground": "#5e8c8c", - "titleBar.activeForeground": "#e7e7e7", - "titleBar.inactiveBackground": "#5e8c8c99", - "titleBar.inactiveForeground": "#e7e7e799" - }, - "peacock.color": "#5e8c8c" -} diff --git a/scripts/README.md b/scripts/README.md index 3b07b7f7..54fe7f94 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,5 +2,6 @@ This folder contains several scripts used in the GitHub actions: +- `check_main_databases`: A script that is used to check whether the main databases are correctly formatted. - `create_db_derivatives`: A script that can be used to create the derivative databases based on the `plane-alert-db.csv`, `plane_images.txt` and `blacklist.txt` files. - `create_images_reference`: A tiny little script that I used to create the new `plane_images.txt` file. This file will be removed when we are sure the file of the new image is correct. diff --git a/scripts/check_main_databases.py b/scripts/check_main_databases.py new file mode 100644 index 00000000..df938cf3 --- /dev/null +++ b/scripts/check_main_databases.py @@ -0,0 +1,150 @@ +"""Script that performs several tests on the main databases to see if they are still +valid CSVs. +""" + +import logging +import pandas as pd + +logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(name)s] %(message)s", level=logging.INFO +) + + +def isUrlValid(url, allow_nans=False): + """Check if a URL starts with http or https. 
+
+    Args:
+        url (str): The URL to check.
+        allow_nans (bool, optional): Whether a missing value counts as valid.
+
+    Returns:
+        boolean: True if url is a string starting with http or https, else False.
+    """
+    if pd.isna(url):
+        return allow_nans
+    return isinstance(url, str) and url.startswith(("http://", "https://"))
+
+
+def duplicateICAOs(df):
+    """Check if the main database has any duplicate ICAO codes.
+
+    Args:
+        df (pandas.Dataframe): The database to check.
+
+    Raises:
+        Exception: When the main database has duplicate ICAO codes.
+    """
+    duplicate_icao = df[df.duplicated(subset="$ICAO", keep=False)]
+    if len(duplicate_icao) > 0:
+        logging.error("The main database has duplicate ICAO codes.")
+        raise Exception(f"The main database has duplicate ICAO codes: {duplicate_icao}")
+
+
+def duplicateRegs(df):
+    """Check if the main database has any duplicate registration numbers.
+
+    Args:
+        df (pandas.Dataframe): The database to check.
+
+    Raises:
+        Exception: When the main database has duplicate registration numbers.
+    """
+    duplicate_regs = df[df.duplicated(subset="$Registration", keep=False)]
+    if len(duplicate_regs) > 0:
+        logging.error("The main database has duplicate registration numbers.")
+        raise Exception(
+            f"The main database has '{duplicate_regs.shape[0]}' duplicate registration "
+            f"numbers: {duplicate_regs}"
+        )
+
+
+def badLinks(df):
+    """Check if the main database has any links that don't start with http or https.
+
+    Args:
+        df (pandas.Dataframe): The database to check.
+
+    Raises:
+        Exception: When the main database has invalid links.
+    """
+    bad_links = df.loc[~df["$#Link"].apply(isUrlValid).astype(bool), "$#Link"].tolist()
+    if len(bad_links) > 0:
+        logging.error("The main database has invalid links.")
+        raise Exception(f"The main database has invalid links: {bad_links}")
+
+
+if __name__ == "__main__":
+    ##########################################
+    # Check main database.                   #
+    ##########################################
+    logging.info("Checking the main database...")
+    try:
+        main_df = pd.read_csv("plane-alert-db.csv")
+    except Exception as e:
+        logging.error("The main database is not a valid CSV.")
+        raise
+
+    # Perform database checks.
+    duplicateICAOs(main_df)
+    # duplicateRegs(main_df)  # NOTE: This is commented out because there are duplicates.
+    badLinks(main_df)
+    logging.info("The main database is valid.")
+
+    ##########################################
+    # Check 'plane-alert-twitter-blocked' db.#
+    ##########################################
+    logging.info("Checking the 'plane-alert-twitter-blocked' database...")
+    try:
+        twitter_blocked_df = pd.read_csv("plane-alert-twitter-blocked.csv")
+    except Exception as e:
+        logging.error("The 'plane-alert-twitter-blocked' database is not a valid CSV.")
+        raise
+
+    # Perform database checks.
+    duplicateICAOs(twitter_blocked_df)
+    duplicateRegs(twitter_blocked_df)
+    badLinks(twitter_blocked_df)
+    logging.info("The 'plane-alert-twitter-blocked' database is valid.")
+
+    ##########################################
+    # Check 'plane-alert-ukraine' db.        #
+    ##########################################
+    logging.info("Checking the 'plane-alert-ukraine' database...")
+    try:
+        ukraine_df = pd.read_csv("plane-alert-ukraine.csv")
+    except Exception as e:
+        logging.error("The 'plane-alert-ukraine' database is not a valid CSV.")
+        raise
+
+    # Perform database checks.
+    duplicateICAOs(ukraine_df)
+    duplicateRegs(ukraine_df)
+    badLinks(ukraine_df)
+    logging.info("The 'plane-alert-ukraine' database is valid.")
+
+    ##########################################
+    # Check 'plane_images.txt' db.           #
+    ##########################################
+    logging.info("Checking the 'plane_images.txt' database...")
+    try:
+        images_df = pd.read_csv("plane_images.txt")
+    except Exception as e:
+        logging.error("The 'plane_images.txt' database is not a valid CSV.")
+        raise
+
+    # Perform database checks.
+
+    # duplicateICAOs(
+    #     images_df
+    # )  # NOTE: This is commented out because there are duplicates.
+    invalid_rows = []  # DataFrame.append was removed in pandas 2.0; concat instead.
+    for col in images_df.columns:  # Check all link columns for bad links.
+        if col != "$ICAO":
+            is_valid = images_df[col].apply(isUrlValid, allow_nans=True).astype(bool)
+            invalid_rows.append(images_df[~is_valid])
+    bad_links = pd.concat(invalid_rows) if invalid_rows else pd.DataFrame()
+    if len(bad_links) > 0:
+        logging.error("The 'plane_images.txt' database has invalid links.")
+        raise Exception(
+            f"The 'plane_images.txt' database has invalid links: {bad_links}"
+        )
+    logging.info("The 'plane_images.txt' database is valid.")
diff --git a/scripts/derivative_changed.sh b/scripts/derivative_changed.sh
new file mode 100644
index 00000000..3a84263c
--- /dev/null
+++ b/scripts/derivative_changed.sh
@@ -0,0 +1,5 @@
+# Check if the derivative files have changed.
+main_database_files=("plane-alert-db.csv", "plane-alert-pia.csv", "plane-alert-twitter-blocked.csv", "plane-alert-ukraine.csv", "plane_images.txt")
+
+files_changed=$(gh pr view 99 --json files -q '.files[].path' | grep '.csv')
+if