
Content Blocker Bot #653

Open · wants to merge 16 commits into main
2 changes: 1 addition & 1 deletion mediadata_ai_blocklist/py/airtable.py
@@ -70,6 +70,6 @@ async def batch_upsert_organizations(data):
    logging.info('Upserting organizations in Airtable')
    try:
        table = at.table(base_id, content_table)
-        table.batch_upsert(records=data, key_fields=['URL',])
+        table.batch_upsert(records=data, key_fields=['id',])
    except Exception as e:
        logging.error(f'Error upserting organization: {e}')
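
Switching key_fields from 'URL' to 'id' means pyairtable now matches existing Airtable rows on the id field, so an organization whose URL later changes is updated in place rather than duplicated. A minimal sketch of the upsert semantics, assuming pyairtable 2.x (the token, base ID, and table name are placeholders, not values from this repo):

# Sketch only; credentials and IDs are placeholders.
from pyairtable import Api

api = Api("YOUR_AIRTABLE_TOKEN")
table = api.table("appXXXXXXXXXXXXXX", "Content")

records = [
    {"fields": {"id": "recABC123", "URL": "https://example.com",
                "Blocks AI Crawlers": True}},
]
# Rows whose 'id' field matches an incoming record are updated;
# anything unmatched is created as a new row.
table.batch_upsert(records, key_fields=["id"])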
43 changes: 40 additions & 3 deletions mediadata_ai_blocklist/py/database.py
@@ -1,7 +1,6 @@
import sqlite3
from dataclasses import dataclass
from sqlite3 import Error
-from typing import List
from dotenv import load_dotenv
import os

@@ -15,6 +14,10 @@ class MediaHouse:
    url: str
    airtable_id: str
    id: str = None
+    site_status: str = None
+    site_reachable: bool = None
+    site_redirect: bool = None
+    final_url: str = None
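
For illustration, a hypothetical instance with the new status fields populated (the name and country fields are assumed from the table schema below; all values are invented):

mh = MediaHouse(name="Example Daily", country="KE",
                url="https://example.co.ke", airtable_id="recABC123",
                site_status="200", site_reachable=True,
                site_redirect=False, final_url="https://example.co.ke/")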


@dataclass
@@ -56,8 +59,12 @@ def create_table(self):
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT NOT NULL,
                country TEXT NOT NULL,
-               url TEXT NOT NULL UNIQUE,
-               airtable_id TEXT NOT NULL UNIQUE
+               url TEXT NOT NULL,
+               airtable_id TEXT NOT NULL UNIQUE,
+               site_status TEXT,
+               site_reachable BOOLEAN,
+               site_redirect BOOLEAN,
+               final_url TEXT
            );
            CREATE TABLE IF NOT EXISTS robots (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -117,6 +124,36 @@ def select_all_media_houses(self):
        finally:
            cur.close()

+    def update_site_status(self, media_house_id, site_status, site_reachable, site_redirect, final_url):
+        try:
+            sql = """
+                UPDATE media_house
+                SET site_status = ?, site_reachable = ?, site_redirect = ?, final_url = ?
+                WHERE id = ?
+            """
+            cur = self.conn.cursor()
+            cur.execute(sql, (site_status, site_reachable,
+                              site_redirect, final_url, media_house_id))
+            self.conn.commit()
+        except Error as e:
+            print(e)
+        finally:
+            cur.close()
+
+    def get_reachable_sites(self):
+        try:
+            cur = self.conn.cursor()
+            cur.execute("SELECT * FROM media_house WHERE site_reachable = 1")
+            rows = cur.fetchall()
+            column_names = [column[0] for column in cur.description]
+            dict_rows = [dict(zip(column_names, row)) for row in rows]
+            return dict_rows
+        except Error as e:
+            print(e)
+            return None
+        finally:
+            cur.close()

    def close_connection(self):
        self.conn.close()
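
Taken together, a short usage sketch of the two new helpers (db stands for a Database instance constructed as elsewhere in this module; values are illustrative):

db.update_site_status(media_house_id=1, site_status="200",
                      site_reachable=True, site_redirect=False,
                      final_url="https://example.com/")

# get_reachable_sites() returns rows as plain dicts keyed by column name
# (or None on error), so callers can do org['url'], org['site_status'], etc.
for site in db.get_reachable_sites() or []:
    print(site["name"], site["final_url"])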

3 changes: 3 additions & 0 deletions mediadata_ai_blocklist/py/diff.py
@@ -50,6 +50,9 @@ def diff_robot_files(media_house: MediaHouse, db: Database):
    data['blocks_crawlers'] = True if found_crawlers else False
    data['notes'] = 'Robots.txt has been updated to block AI crawlers' if found_crawlers and not archive_crawlers else None
    data['latest_robots_url'] = latest_robots['url']
+    data['latest_robots_date'] = latest_robots['timestamp']
+    data['latest_robots_content'] = latest_robots['content']
+    data['archived_robots_url'] = oldest_archived_robots['url']
    data['archived_date'] = oldest_archived_robots['archived_date']
    data['archived_robots_content'] = oldest_archived_robots['content']
    return data
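
The new latest_robots_date field carries the same Wayback-style %Y%m%d%H%M%S timestamp format as archived_date; update_airtable in main.py parses both into ISO dates before sending them to Airtable, e.g.:

import datetime

# '20240315093000' -> '2024-03-15' (illustrative value)
datetime.datetime.strptime("20240315093000", "%Y%m%d%H%M%S").date().isoformat()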
65 changes: 58 additions & 7 deletions mediadata_ai_blocklist/py/main.py
@@ -1,4 +1,5 @@
import asyncio
+from yarl import URL
import random
import aiohttp
from airtable import get_organizations, batch_upsert_organizations
@@ -15,27 +16,48 @@


async def update_airtable(db: Database):
-    all_orgs = db.select_all_media_houses()
+    all_orgs = db.get_reachable_sites()
    data_update = []
    for org in all_orgs:
        diff_data = diff_robot_files(org, db)
        if (diff_data):
            update_data = {
                "fields": {
                    "URL": org['url'],
                    "Organisation Name": org['name'],
                    "id": org['airtable_id'],
                    "Blocks AI Crawlers": diff_data['blocks_crawlers'],
                    "Blocked Crawlers": diff_data['crawler'],
-                    "Current Robots": diff_data['latest_robots_url'],
-                    "Archived Robots": diff_data['archived_robots_url'],
+                    "Current Robots URL": diff_data['latest_robots_url'],
+                    "Checked": datetime.datetime.strptime(diff_data['latest_robots_date'], "%Y%m%d%H%M%S").date().isoformat(),
+                    "Current Robots Content": diff_data['latest_robots_content'],
+                    "Archived Robots URL": diff_data['archived_robots_url'],
+                    "Archive Date": datetime.datetime.strptime(diff_data['archived_date'], "%Y%m%d%H%M%S").date().isoformat(),
+                    "Archived Robots Content": diff_data['archived_robots_content'],
                }
            }
            data_update.append(update_data)

    await batch_upsert_organizations(data_update)


+async def update_airtable_site_status(db: Database):
+    all_orgs = db.select_all_media_houses()
+    data_update = []
+    for org in all_orgs:
+        update_data = {
+            "fields": {
+                "id": org['airtable_id'],
+                "Organisation": [org['airtable_id']],
+                "URL": org['url'],
+                "Reachable": bool(org['site_reachable']),
+                "Redirects": bool(org['site_redirect']),
+                "Final URL": org['final_url'],
+            }
+        }
+        data_update.append(update_data)
+
+    await batch_upsert_organizations(data_update)


async def fetch_orgs(db: Database):
    organizations = get_organizations()
    for media_house in organizations:
@@ -44,8 +66,27 @@ async def fetch_orgs(db: Database):
        db.insert_media_house(media_house_obj)


+async def check_site_availability(url: str):
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(url, allow_redirects=True) as response:
+                return {
+                    "status_code": response.status,
+                    "reachable": True,
+                    "redirect": URL(response.url).with_scheme('').with_path(response.url.path.rstrip('/')) != URL(url).with_scheme('').with_path(URL(url).path.rstrip('/')),
+                    "final_url": str(response.url)
+                }
+        except Exception:
+            return {
+                "status_code": None,
+                "reachable": False,
+                "redirect": False,
+                "final_url": None
+            }
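
The redirect test strips the scheme and any trailing slash from both URLs before comparing, so an http-to-https upgrade or an added slash alone does not count as a redirect. A sketch of the same idea in a more readable form (the helper name is ours, not the repo's):

from yarl import URL

def is_redirect(requested: str, final: str) -> bool:
    # Compare host and path with the trailing slash removed;
    # scheme differences (http vs https) are deliberately ignored.
    a, b = URL(requested), URL(final)
    return (a.host, a.path.rstrip('/')) != (b.host, b.path.rstrip('/'))

is_redirect("http://example.com", "https://example.com/")       # False
is_redirect("https://example.com", "https://example.com/news")  # True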


async def fetch_robots(db: Database):
-    media_houses = db.select_all_media_houses()
+    media_houses = db.get_reachable_sites()
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_current_robots(
            db, session, media_house)) for media_house in media_houses]
@@ -54,16 +95,26 @@ async def fetch_robots(db: Database):


async def fetch_archived_robots(db: Database):
-    media_houses = db.select_all_media_houses()
+    media_houses = db.get_reachable_sites()
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(fetch_past_robots(
            db, session, media_house)) for media_house in media_houses]
        await asyncio.gather(*tasks)
        await asyncio.sleep(random.uniform(1, 3))


+async def check_org_sites(db: Database):
+    all_orgs = db.select_all_media_houses()
+    for org in all_orgs:
+        site_status = await check_site_availability(org['url'])
+        db.update_site_status(org['id'], site_status['status_code'],
+                              site_status['reachable'], site_status['redirect'], site_status['final_url'])


async def main(db: Database):
    await fetch_orgs(db)
+    await check_org_sites(db)
+    await update_airtable_site_status(db)
    await asyncio.gather(fetch_robots(db), fetch_archived_robots(db))
    await update_airtable(db)

6 changes: 0 additions & 6 deletions mediadata_ai_blocklist/py/robots.py
@@ -1,4 +1,3 @@
-import os
import asyncio
import re
import aiohttp
@@ -123,10 +122,8 @@ async def fetch_current_robots(db: Database, session: aiohttp.ClientSession, med
    try:
        text = await fetch_robots(session, url)
        if text:
-            print("Valid robots.txt")
            robots = Robots(media_house['id'], robots_url,
                            datetime.now().strftime("%Y%m%d%H%M%S"), text, "200")
-            print(robots)
            db.insert_robot(robots)
            await asyncio.sleep(random.uniform(1, 3))
    except Exception as e:
@@ -153,7 +150,6 @@ async def fetch_past_robots(db: Database, session: aiohttp.ClientSession, media_
        return
    snapshots = await fetch_internet_archive_snapshots(session, media_house['url'])
    if snapshots:
-        print("Snapshots")
        one_year_ago = (datetime.now() - timedelta(days=past_days)
                        ).strftime("%Y%m%d%H%M%S")
        closest_snapshot = find_closest_snapshot(snapshots, one_year_ago)
@@ -166,10 +162,8 @@ async def fetch_past_robots(db: Database, session: aiohttp.ClientSession, media_
                             media_house['name']}: {closest_snapshot_url}""")
        archive_robots = await fetch_robots(session, closest_snapshot_url)
        if archive_robots:
-            print("Valid robots.txt")
            archive_robots = ArchivedRobots(media_house['id'], closest_snapshot_url,
                                            closest_snapshot['timestamp'], archive_robots, datetime.now().strftime("%Y%m%d%H%M%S"), "200")
-            print(archive_robots)
            db.insert_archived_robot(archive_robots)
            await asyncio.sleep(random.uniform(1, 3))
        else: