Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fetch event recordings from youtube #94

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ SECRET_KEY='django-insecure-kzrs=c35hmbb*nkd3wgn%3!!86v2+qr_)rw(7r%kya@joxv!!9'
SENTRY_ENABLED=False
SENTRY_DSN=dsn_example
SESSION_COOKIE_SECURE=False
YOUTUBE_API_KEY=
TIME_ZONE=America/New_York
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ build:

shell:
docker compose run --rm web bash

django_shell:
uv run manage.py shell_plus
96 changes: 96 additions & 0 deletions docs/techcity/event-recording-ingestion.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Event Recording Ingestion

Ingests playlist items (YouTube videos) from the YouTube API and stores them in the database as "Event Recordings".

## Ingestion

Command to fetch event recordings from the YouTube API:

```sh
uv run manage.py fetch_event_recordings --help
```

## YouTube API

Set up a YouTube API key: https://developers.google.com/youtube/v3/docs

### Playlist Items List

Example Query using API Explorer:

https://developers.google.com/youtube/v3/docs/playlistItems/list?apix_params=%7B%22part%22%3A%5B%22id%22%2C%22contentDetails%22%2C%22snippet%22%2C%22status%22%5D%2C%22playlistId%22%3A%22PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF%22%7D#usage


```json
{
"kind": "youtube#playlistItemListResponse",
"etag": "WOkbLa1SbmIb8Fp9K6A0MNZFKSo",
"nextPageToken": "EAAajgFQVDpDQVVpRUVNeVJUZzFOalZCUVVaQk5qQXdNVGNvQVVqYmdMaVQ5YVNJQTFBQldrVWlRMmxLVVZSRldtcFRNRloyVGtkS1ptSnFSak5VVlZwdldXMXNiRnBJUWs1YU1tZDVWbXhLVlU1WVZrcFJNMVpIUldkM1NUVnZOMWwwWjFsUkxVMDNOM04zU1NJ",
"items": [
{
"kind": "youtube#playlistItem",
"etag": "G6NnaGF1mlFg2q_77NJfIn-F5oE",
"id": "UExGY0tFbzRiX24xd01GaGJpZWRwTWdoMlZSVDV1SUN1Ri45RjNFMDhGQ0Q2RkFCQTc1",
"snippet": {
"publishedAt": "2024-08-15T03:53:12Z",
"channelId": "UCA-ORpF9LEgECmkP3nvVLXQ",
"title": "PDF Text Extraction With Python",
"description": "Is your data locked up in portable document format (PDFs)? In this talk we're going to explore methods to extract text and other data from PDFs using readily-available, open-source Python tools (such as pypdf), as well as techniques such as OCR (optical character recognition) and table extraction. We will also discuss the philosophy of text extraction as a whole.\n\nSpeaker: Raju Rayavarapu\n\nRaju Rayavarapu is a scientist with a background in cancer biology, pharmacology, and drug development with a passion for understanding and using new technology. He is currently a data scientist at DNAnexus, a cloud-based data analysis and management platform. He also loves Disney Lorcana.",
"thumbnails": {
"default": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/default.jpg",
"width": 120,
"height": 90
},
"medium": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/mqdefault.jpg",
"width": 320,
"height": 180
},
"high": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/hqdefault.jpg",
"width": 480,
"height": 360
}
},
"channelTitle": "Matt Layman",
"playlistId": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF",
"position": 0,
"resourceId": {
"kind": "youtube#video",
"videoId": "UlmyJl9_Gwc"
},
"videoOwnerChannelTitle": "Matt Layman",
"videoOwnerChannelId": "UCA-ORpF9LEgECmkP3nvVLXQ"
},
"contentDetails": {
"videoId": "UlmyJl9_Gwc",
"videoPublishedAt": "2024-08-15T05:19:54Z"
},
"status": {
"privacyStatus": "public"
}
},
// ...
],
"pageInfo": {
"totalResults": 34,
"resultsPerPage": 5
}
}
```

Example Embed Code:

```html
<iframe
width="1280"
height="720"
src="https://www.youtube.com/embed/UlmyJl9_Gwc?list=PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF"
title="PDF Text Extraction With Python"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin"
allowfullscreen
></iframe>
```
1 change: 1 addition & 0 deletions project/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"django_extensions",
"huey.contrib.djhuey",
"techcity.core",
"techcity.events",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"sqids>=0.5.0",
"urllib3>=2.2.3",
"whitenoise[brotli]>=6.7.0",
"django-extensions>=3.2.3",
]

[tool.uv]
Expand Down
8 changes: 7 additions & 1 deletion techcity/events/admin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.contrib import admin

from .models import Event, Venue
from .models import Event, EventRecording, Venue


@admin.register(Event)
Expand All @@ -12,3 +12,9 @@ class EventAdmin(admin.ModelAdmin):
@admin.register(Venue)
class VenueAdmin(admin.ModelAdmin):
    # Sort venues by street address in the admin changelist.
    ordering = ["address"]


@admin.register(EventRecording)
class EventRecordingAdmin(admin.ModelAdmin):
    # Show the full recording context in the changelist for quick scanning.
    list_display = ["group", "event", "title", "description", "url"]
    # Allow narrowing the changelist to a single group's recordings.
    list_filter = ["group"]
150 changes: 150 additions & 0 deletions techcity/events/connectors/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import json
import logging
from pathlib import Path

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from techcity.events.models import EventRecording, EventRecordingType
from techcity.groups.models import Group

# TODO - this could be a model / added to Group. For now, hardcoding.
# Maps a group slug to the YouTube playlist ID that holds that group's
# event recordings.
EVENT_RECORDING_SOURCES: dict[str, str] = {
    "python-frederick": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF",
    "frederick-open-source": "PLFcKEo4b_n1zz3pGwC8e0RVaQf0c07jyd",
    "frederick-web-tech": "PLFcKEo4b_n1yistY9g3kyiXefYNPVZ1-X",
}


class YouTubeConnector:
    """A connector to fetch event recordings from YouTube.com.

    For each configured group, playlist items are fetched from the YouTube
    Data API (or read back from a JSON cache on disk) and upserted into the
    database as ``EventRecording`` rows.
    """

    def __init__(
        self,
        youtube_api_key: str,
        cache_dir: Path,
        group_slugs: list[str] | None = None,
    ):
        """Store credentials/paths and select which groups to process.

        Args:
            youtube_api_key: API key for the YouTube Data API (required).
            cache_dir: Directory where raw API responses are cached as JSON.
            group_slugs: Optional subset of group slugs to process; defaults
                to every group in ``EVENT_RECORDING_SOURCES``.

        Raises:
            ValueError: If no API key is provided.
        """
        self.youtube_api_key = youtube_api_key
        if not self.youtube_api_key:
            raise ValueError(
                "A YouTube API key is required to fetch data."
                "Set YOUTUBE_API_KEY in the environment."
            )

        self.cache_dir = cache_dir
        group_slugs = group_slugs or EVENT_RECORDING_SOURCES.keys()
        self.event_recording_sources = {
            group_slug: playlist_id
            for group_slug, playlist_id in EVENT_RECORDING_SOURCES.items()
            if group_slug in group_slugs
        }

    def fetch(self, cached: bool) -> None:
        """Fetch and store recordings for every selected group.

        When ``cached`` is true, the network is skipped and the JSON files
        written by a previous run are reused.
        """
        print("Fetching event recordings from YouTube...")

        for group_slug, playlist_id in self.event_recording_sources.items():
            if cached:
                print(f"Using cached data for {group_slug}...")
            else:
                self.fetch_to_cache(group_slug, playlist_id)

            self.generate_event_recordings(group_slug, playlist_id)

    def fetch_to_cache(self, group_slug: str, playlist_id: str) -> None:
        """
        Fetch the data from YouTube and save it to the cache.

        Uses the following:
        https://developers.google.com/youtube/v3/docs/playlistItems/list
        """
        retries = Retry(
            total=3,
            allowed_methods={"GET"},
            status_forcelist=[502, 503, 504],
            backoff_factor=0.1,
        )
        results = []
        page_token = ""

        # Close the session when done so pooled connections are released.
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retries))

            while True:
                response = session.get(
                    "https://www.googleapis.com/youtube/v3/playlistItems",
                    params={
                        # The API documents `part` as a single
                        # comma-separated value, not a repeated parameter.
                        "part": "id,contentDetails,snippet,status",
                        "playlistId": playlist_id,
                        "maxResults": 50,  # API maximum per page
                        "pageToken": page_token,
                        "key": self.youtube_api_key,
                    },
                    headers={
                        "Accept": "application/json",
                        "Referer": "https://techfrederick.org",
                    },
                    timeout=5,
                )
                response.raise_for_status()
                data = response.json()
                results.extend(data["items"])
                # A missing nextPageToken means the last page was reached.
                page_token = data.get("nextPageToken")
                if not page_token:
                    break

        with open(self.cache_dir / f"{group_slug}-youtube-videos.json", "w") as f:
            json.dump(results, f)

    def generate_event_recordings(self, group_slug: str, playlist_id: str) -> None:
        """Generate any event recordings found in cached API data."""
        group = Group.objects.get(slug=group_slug)
        with open(self.cache_dir / f"{group_slug}-youtube-videos.json") as f:
            youtube_video_data = json.load(f)

        for video_data in youtube_video_data:
            self.upsert_event_recording(group, video_data, playlist_id)

    def upsert_event_recording(
        self,
        group: Group,
        youtube_video_data: dict,
        playlist_id: str,
    ) -> None:
        """Upsert an event recording into the database."""

        snippet = youtube_video_data["snippet"]
        content_details = youtube_video_data["contentDetails"]

        # TODO - could extract Speaker? Speaker data should probably come
        # from the _Meetup_ event ingestion, not YouTube, because the video
        # descriptions do not set the speaker consistently.

        video_id = content_details["videoId"]
        recording, created = EventRecording.objects.update_or_create(
            recording_type=EventRecordingType.YOUTUBE,
            group=group,
            external_id=video_id,
            defaults={
                "title": snippet["title"],
                "description": snippet["description"],
                "url": f"https://www.youtube.com/watch?v={video_id}&list={playlist_id}",
                "external_playlist_id": playlist_id,
                "metadata": {
                    "snippet": snippet,
                },
            },
        )

        # Lazy %-style args avoid formatting when the log level is disabled.
        if created:
            logging.info("Created new recording: %s", recording.title)
        else:
            logging.info("Updated recording: %s", recording.title)
37 changes: 37 additions & 0 deletions techcity/events/management/commands/fetch_event_recordings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os

from django.core.management.base import BaseCommand

from techcity.constants import cache
from techcity.events.connectors.youtube import YouTubeConnector


class Command(BaseCommand):
    help = "Fetch all event recordings from any of the source connectors"

    def add_arguments(self, parser):
        """Register the --cached and --youtube-api-key options."""
        parser.add_argument(
            "--cached",
            action="store_true",
            default=False,
            help="Use cached data instead of APIs if available",
        )
        parser.add_argument(
            "--youtube-api-key",
            type=str,
            help="The YouTube API key to use for fetching data"
            " (set in env YOUTUBE_API_KEY)",
            default=os.getenv("YOUTUBE_API_KEY"),
        )

    def handle(self, *args, cached: bool = False, youtube_api_key: str = "", **kwargs):
        """Create the cache directory tree and run the YouTube connector."""
        # The command fetches recordings, so say so in the output.
        self.stdout.write("Fetching event recordings...")

        # parents=True creates the base cache directory too, so the two
        # separate mkdir calls collapse into one robust call.
        cache_dir = cache / "event_recordings"
        cache_dir.mkdir(parents=True, exist_ok=True)

        connector = YouTubeConnector(
            cache_dir=cache_dir, youtube_api_key=youtube_api_key
        )
        connector.fetch(cached)
Loading