Skip to content

Commit

Permalink
ci: add main database check action
Browse files Browse the repository at this point in the history
  • Loading branch information
rickstaa committed Mar 4, 2023
1 parent 1f27658 commit 3e03808
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 19 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/main_databases_check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Workflow that runs the database validation script on pull requests.
name: Check main databases CSV format

on:
pull_request:
# Only trigger when a pull request touches one of the database files below.
# NOTE(review): scripts/check_main_databases.py, as added in this commit, does
# not read "plane-alert-pia.csv" - confirm whether a check for it is missing.
paths:
- "plane-alert-db.csv"
- "plane-alert-pia.csv"
- "plane-alert-twitter-blocked.csv"
- "plane-alert-ukraine.csv"
- "plane_images.txt"

jobs:
checkMainDatabases:
runs-on: ubuntu-latest
name: Check whether the main databases are still valid CSVs
steps:
- name: Checkout
uses: actions/checkout@v3
# Set up Python 3.10 and install the dependencies of the check script.
- uses: actions/setup-python@v4
with:
python-version: "3.10"
- run: pip install -r ./scripts/requirements.txt

# The job fails when the script exits with an uncaught exception.
- name: Run main database checks
run: python ./scripts/check_main_databases.py
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@

.DS_Store

# IDE
.vscode
*.code-workspace
18 changes: 0 additions & 18 deletions .vscode/settings.json

This file was deleted.

150 changes: 150 additions & 0 deletions scripts/check_main_databases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""Script that performs several tests on the main databases to see if they are still
valid CSVs.
"""

import logging
import pandas as pd

logging.basicConfig(
format="%(asctime)s %(levelname)-8s [%(name)s] %(message)s", level=logging.INFO
)


def isUrlValid(url, allow_nans=False):
    """Check if a URL starts with http or https.

    Args:
        url (str): The URL to check. Non-string values (such as the ``NaN`` pandas
            produces for empty CSV cells) are never considered valid URLs.
        allow_nans (bool, optional): Whether a missing value, as reported by
            ``pandas.isna``, counts as valid. Defaults to ``False``.

    Returns:
        boolean: True if the URL starts with http or https (or is missing while
            ``allow_nans`` is set), False otherwise.
    """
    if allow_nans and pd.isna(url):
        return True
    # Empty cells arrive as float NaN, which has no ``startswith``; report them as
    # invalid instead of crashing the whole check with an AttributeError.
    return isinstance(url, str) and url.startswith(("http://", "https://"))


def duplicateICAOs(df):
    """Fail when an ICAO code occurs more than once in the database.

    Args:
        df (pandas.Dataframe): The database to check.

    Raises:
        Exception: When the database has duplicate ICAO codes.
    """
    # keep=False flags every occurrence, so all offending rows end up in the report.
    repeated_mask = df.duplicated(subset="$ICAO", keep=False)
    repeated_rows = df[repeated_mask]
    if len(repeated_rows) == 0:
        return
    logging.error("The main database has duplicate ICAO codes.")
    raise Exception(f"The main database has duplicate ICAO codes: {repeated_rows}")


def duplicateRegs(df):
    """Check if the main database has any duplicate registration numbers.

    Args:
        df (pandas.Dataframe): The database to check.

    Raises:
        Exception: When the main database has duplicate registration numbers.
    """
    # keep=False flags every occurrence, so all offending rows end up in the report.
    duplicate_regs = df[df.duplicated(subset="$Registration", keep=False)]
    if len(duplicate_regs) > 0:
        logging.error("The main database has duplicate registration numbers.")
        # Both fragments must be f-strings; otherwise the placeholder is emitted
        # literally instead of the offending rows.
        raise Exception(
            f"The main database has '{duplicate_regs.shape[0]}' duplicate "
            f"registration numbers: {duplicate_regs}"
        )


def badLinks(df):
    """Verify that every entry of the ``$#Link`` column passes ``isUrlValid``.

    Args:
        df (pandas.Dataframe): The database to check.

    Raises:
        Exception: When the database has invalid links.
    """
    bad_links = [link for link in df["$#Link"] if not isUrlValid(link)]
    if not bad_links:
        return
    logging.error("The main database has invalid links.")
    raise Exception(f"The main database has invalid links: {bad_links}")


def _readDatabase(path, description):
    """Load a CSV database, logging an error before re-raising if parsing fails.

    Args:
        path (str): Path of the CSV file to read.
        description (str): Human readable name used at the start of the error log.

    Returns:
        pandas.DataFrame: The parsed database.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised for the given file.
    """
    try:
        return pd.read_csv(path)
    except Exception:
        logging.error("%s is not a valid CSV.", description)
        raise


if __name__ == "__main__":
    # NOTE(review): the workflow in this commit also triggers on
    # 'plane-alert-pia.csv', which is not checked below - confirm whether that is
    # intentional.

    ##########################################
    # Check main database.                   #
    ##########################################
    logging.info("Checking the main database...")
    main_df = _readDatabase("plane-alert-db.csv", "The main database")

    # Perform database checks.
    duplicateICAOs(main_df)
    # NOTE: duplicateRegs is not run here because the main database contains
    # duplicate registrations.
    badLinks(main_df)
    logging.info("The main database is valid.")

    ##########################################
    # Check 'plane-alert-twitter-blocked' db.#
    ##########################################
    logging.info("Checking the 'plane-alert-twitter-blocked' database...")
    twitter_blocked_df = _readDatabase(
        "plane-alert-twitter-blocked.csv",
        "The 'plane-alert-twitter-blocked' database",
    )

    # Perform database checks.
    duplicateICAOs(twitter_blocked_df)
    duplicateRegs(twitter_blocked_df)
    badLinks(twitter_blocked_df)
    logging.info("The 'plane-alert-twitter-blocked' database is valid.")

    ##########################################
    # Check 'plane-alert-ukraine' db.        #
    ##########################################
    logging.info("Checking the 'plane-alert-ukraine' database...")
    ukraine_df = _readDatabase(
        "plane-alert-ukraine.csv", "The 'plane-alert-ukraine' database"
    )

    # Perform database checks.
    duplicateICAOs(ukraine_df)
    duplicateRegs(ukraine_df)
    badLinks(ukraine_df)
    logging.info("The 'plane-alert-ukraine' database is valid.")

    ##########################################
    # Check 'plane_images.txt' db.           #
    ##########################################
    logging.info("Checking the 'plane_images.txt' database...")
    images_df = _readDatabase("plane_images.txt", "The 'plane_images.txt' database")

    # Perform database checks.
    # NOTE: duplicateICAOs is not run here because the file contains duplicates.
    # Flag every row that has an invalid link in any column other than '$ICAO'.
    # A single boolean mask avoids DataFrame.append (removed in pandas 2.0) and
    # lists each offending row once, even when several of its links are bad.
    invalid_rows = pd.Series(False, index=images_df.index)
    for col in images_df.columns:
        if col != "$ICAO":
            valid = images_df[col].apply(isUrlValid, allow_nans=True)
            invalid_rows |= ~valid.astype(bool)
    bad_links = images_df[invalid_rows]
    if len(bad_links) > 0:
        logging.error("The 'plane_images.txt' database has invalid links.")
        raise Exception(
            f"The 'plane_images.txt' database has invalid links: {bad_links}"
        )
    logging.info("The 'plane_images.txt' database is valid.")
5 changes: 5 additions & 0 deletions scripts/derivative_changed.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Check if the derivative files have changed.
# Bash array elements are separated by whitespace only; commas would become part
# of the stored file names.
main_database_files=("plane-alert-db.csv" "plane-alert-pia.csv" "plane-alert-twitter-blocked.csv" "plane-alert-ukraine.csv" "plane_images.txt")

# NOTE(review): the pull request number (99) is hard-coded - confirm whether it
# should be supplied by the caller or the CI environment instead.
# -F matches '.csv' literally instead of treating '.' as a regex wildcard.
files_changed=$(gh pr view 99 --json files -q '.files[].path' | grep -F '.csv')
if

0 comments on commit 3e03808

Please sign in to comment.