diff --git a/.env.example b/.env.example index 40064d5..7dbaa49 100644 --- a/.env.example +++ b/.env.example @@ -6,4 +6,5 @@ SECRET_KEY='django-insecure-kzrs=c35hmbb*nkd3wgn%3!!86v2+qr_)rw(7r%kya@joxv!!9' SENTRY_ENABLED=False SENTRY_DSN=dsn_example SESSION_COOKIE_SECURE=False +YOUTUBE_API_KEY= TIME_ZONE=America/New_York diff --git a/Makefile b/Makefile index f12b619..6c41f62 100644 --- a/Makefile +++ b/Makefile @@ -32,3 +32,6 @@ build: shell: docker compose run --rm web bash + +django_shell: + uv run manage.py shell_plus diff --git a/docs/techcity/event-recording-ingestion.md b/docs/techcity/event-recording-ingestion.md new file mode 100644 index 0000000..db15cad --- /dev/null +++ b/docs/techcity/event-recording-ingestion.md @@ -0,0 +1,96 @@ +# Event Recording Ingestion + +Ingests Playlist Items (YouTube Videos) from the YouTube API and stores them in the database as "Event Recordings". + +## Ingestion + +Command to fetch event recordings from the YouTube API: + +```sh +uv run manage.py fetch_event_recordings --help +``` + +## YouTube API + +Set up a YouTube API key: https://developers.google.com/youtube/v3/docs + +### Playlist Items List + +Example query using the API Explorer: + +https://developers.google.com/youtube/v3/docs/playlistItems/list?apix_params=%7B%22part%22%3A%5B%22id%22%2C%22contentDetails%22%2C%22snippet%22%2C%22status%22%5D%2C%22playlistId%22%3A%22PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF%22%7D#usage + + +```json +{ + "kind": "youtube#playlistItemListResponse", + "etag": "WOkbLa1SbmIb8Fp9K6A0MNZFKSo", + "nextPageToken": "EAAajgFQVDpDQVVpRUVNeVJUZzFOalZCUVVaQk5qQXdNVGNvQVVqYmdMaVQ5YVNJQTFBQldrVWlRMmxLVVZSRldtcFRNRloyVGtkS1ptSnFSak5VVlZwdldXMXNiRnBJUWs1YU1tZDVWbXhLVlU1WVZrcFJNMVpIUldkM1NUVnZOMWwwWjFsUkxVMDNOM04zU1NJ", + "items": [ + { + "kind": "youtube#playlistItem", + "etag": "G6NnaGF1mlFg2q_77NJfIn-F5oE", + "id": "UExGY0tFbzRiX24xd01GaGJpZWRwTWdoMlZSVDV1SUN1Ri45RjNFMDhGQ0Q2RkFCQTc1", + "snippet": { + "publishedAt": "2024-08-15T03:53:12Z", + "channelId": 
"UCA-ORpF9LEgECmkP3nvVLXQ", + "title": "PDF Text Extraction With Python", + "description": "Is your data locked up in portable document format (PDFs)? In this talk we're going to explore methods to extract text and other data from PDFs using readily-available, open-source Python tools (such as pypdf), as well as techniques such as OCR (optical character recognition) and table extraction. We will also discuss the philosophy of text extraction as a whole.\n\nSpeaker: Raju Rayavarapu\n\nRaju Rayavarapu is a scientist with a background in cancer biology, pharmacology, and drug development with a passion for understanding and using new technology. He is currently a data scientist at DNAnexus, a cloud-based data analysis and management platform. He also loves Disney Lorcana.", + "thumbnails": { + "default": { + "url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/default.jpg", + "width": 120, + "height": 90 + }, + "medium": { + "url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/mqdefault.jpg", + "width": 320, + "height": 180 + }, + "high": { + "url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/hqdefault.jpg", + "width": 480, + "height": 360 + } + }, + "channelTitle": "Matt Layman", + "playlistId": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF", + "position": 0, + "resourceId": { + "kind": "youtube#video", + "videoId": "UlmyJl9_Gwc" + }, + "videoOwnerChannelTitle": "Matt Layman", + "videoOwnerChannelId": "UCA-ORpF9LEgECmkP3nvVLXQ" + }, + "contentDetails": { + "videoId": "UlmyJl9_Gwc", + "videoPublishedAt": "2024-08-15T05:19:54Z" + }, + "status": { + "privacyStatus": "public" + } + }, + // ... 
+ ], + "pageInfo": { + "totalResults": 34, + "resultsPerPage": 5 + } +} +``` + +Example Embed Code: + +```html + +``` diff --git a/project/settings.py b/project/settings.py index 7bb2381..98ebc4e 100644 --- a/project/settings.py +++ b/project/settings.py @@ -29,6 +29,7 @@ "django.contrib.sessions", "django.contrib.messages", "django.contrib.staticfiles", + "django_extensions", "huey.contrib.djhuey", "techcity.core", "techcity.events", diff --git a/pyproject.toml b/pyproject.toml index 0fe9603..24091f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "sqids>=0.5.0", "urllib3>=2.2.3", "whitenoise[brotli]>=6.7.0", + "django-extensions>=3.2.3", ] [tool.uv] diff --git a/techcity/events/admin.py b/techcity/events/admin.py index 2d125bd..c712ed0 100644 --- a/techcity/events/admin.py +++ b/techcity/events/admin.py @@ -1,6 +1,6 @@ from django.contrib import admin -from .models import Event, Venue +from .models import Event, EventRecording, Venue @admin.register(Event) @@ -12,3 +12,9 @@ class EventAdmin(admin.ModelAdmin): @admin.register(Venue) class VenueAdmin(admin.ModelAdmin): ordering = ["address"] + + +@admin.register(EventRecording) +class EventRecordingAdmin(admin.ModelAdmin): + list_display = ["group", "event", "title", "description", "url"] + list_filter = ["group"] diff --git a/techcity/events/connectors/youtube.py b/techcity/events/connectors/youtube.py new file mode 100644 index 0000000..c0fd66a --- /dev/null +++ b/techcity/events/connectors/youtube.py @@ -0,0 +1,151 @@ +import json +import logging + +import requests +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + +from techcity.events.models import EventRecording, EventRecordingType +from techcity.groups.models import Group + +# TODO - this could be a model / added to Group. For now, hardcoding. 
+EVENT_RECORDING_SOURCES = { + "python-frederick": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF", + "frederick-open-source": "PLFcKEo4b_n1zz3pGwC8e0RVaQf0c07jyd", + "frederick-web-tech": "PLFcKEo4b_n1yistY9g3kyiXefYNPVZ1-X", +} + + +class YouTubeConnector: + """A connector to fetch event recordings from YouTube.com""" + + def __init__( + self, + youtube_api_key: str, + cache_dir: str, + group_slugs: list[str] | None = None, + ): + self.youtube_api_key = youtube_api_key + if not self.youtube_api_key: + raise ValueError( + "A YouTube API key is required to fetch data. " + "Set YOUTUBE_API_KEY in the environment." + ) + + self.cache_dir = cache_dir + group_slugs = group_slugs or EVENT_RECORDING_SOURCES.keys() + self.event_recording_sources = { + group_slug: playlist_id + for group_slug, playlist_id in EVENT_RECORDING_SOURCES.items() + if group_slug in group_slugs + } + + def fetch(self, cached: bool) -> None: + print("Fetching event recordings from YouTube...") + + for group_slug, playlist_id in self.event_recording_sources.items(): + if cached: + print(f"Using cached data for {group_slug}...") + else: + self.fetch_to_cache(group_slug, playlist_id) + + self.generate_event_recordings(group_slug, playlist_id) + + def fetch_to_cache(self, group_slug: str, playlist_id: str): + """ + Fetch the data from YouTube and save it to the cache. 
+ + Uses the following: + https://developers.google.com/youtube/v3/docs/playlistItems/list + """ + retries = Retry( + total=3, + allowed_methods={"GET"}, + status_forcelist=[502, 503, 504], + backoff_factor=0.1, + ) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + results = [] + paginating = True + page_token = "" + + while paginating: + response = session.get( + "https://www.googleapis.com/youtube/v3/playlistItems", + params={ + "part": ["id", "contentDetails", "snippet", "status"], + "playlistId": playlist_id, + "maxResults": 50, + "pageToken": page_token, + "key": self.youtube_api_key, + }, + headers={ + "Accept": "application/json", + "Referer": "https://techfrederick.org", + }, + timeout=5, + ) + response.raise_for_status() + data = response.json() + results.extend(data["items"]) + if "nextPageToken" in data: + page_token = data["nextPageToken"] + else: + paginating = False + + with open(self.cache_dir / f"{group_slug}-youtube-videos.json", "w") as f: + json.dump(results, f) + + def generate_event_recordings(self, group_slug: str, playlist_id: str) -> None: + """Generate any event recordings found in API data.""" + group = Group.objects.get(slug=group_slug) + with open(self.cache_dir / f"{group_slug}-youtube-videos.json") as f: + youtube_video_data = json.load(f) + + for video_data in youtube_video_data: + self.upsert_event_recording(group, video_data, playlist_id) + + def upsert_event_recording( + self, + group: Group, + youtube_video_data: dict, + playlist_id: str, + ) -> None: + """Upsert an event recording into the database.""" + + snippet = youtube_video_data["snippet"] + content_details = youtube_video_data["contentDetails"] + + # TODO - could extract Speaker ? Should probably be from the + # _Meetup_ event ingestion, not YouTube. 
+ # + # speaker_pattern = re.compile(r"(Speaker|Presenter): (.+)\n(.+)", re.DOTALL) + # try: + # speaker_match = speaker_pattern.search(snippet["description"]) + # speaker_name = speaker_match.group(2).strip() + # speaker_bio = speaker_match.group(3).strip() + # except AttributeError: + + recording, created = EventRecording.objects.update_or_create( + recording_type=EventRecordingType.YOUTUBE, + group=group, + external_id=content_details["videoId"], + defaults={ + "title": snippet["title"], + "description": snippet["description"], + "published_at": snippet["publishedAt"], + "url": f"https://www.youtube.com/watch?v={content_details['videoId']}&list={playlist_id}", + "external_id": content_details["videoId"], + "external_playlist_id": playlist_id, + "metadata": { + "snippet": snippet, + }, + }, + ) + + if created: + logging.info(f"Created new recording: {recording.title}") + else: + logging.info(f"Updated recording: {recording.title}") diff --git a/techcity/events/management/commands/fetch_event_recordings.py b/techcity/events/management/commands/fetch_event_recordings.py new file mode 100644 index 0000000..e404269 --- /dev/null +++ b/techcity/events/management/commands/fetch_event_recordings.py @@ -0,0 +1,37 @@ +import os + +from django.core.management.base import BaseCommand + +from techcity.constants import cache +from techcity.events.connectors.youtube import YouTubeConnector + + +class Command(BaseCommand): + help = "Fetch all event recordings from any of the source connectors" + + def add_arguments(self, parser): + parser.add_argument( + "--cached", + action="store_true", + default=False, + help="Use cached data instead of APIs if available", + ) + parser.add_argument( + "--youtube-api-key", + type=str, + help="The YouTube API key to use for fetching data" + " (set in env YOUTUBE_API_KEY)", + default=os.getenv("YOUTUBE_API_KEY"), + ) + + def handle(self, *args, cached: bool = False, youtube_api_key: str = "", **kwargs): + self.stdout.write("Fetching events...") 
+ + cache.mkdir(exist_ok=True) + cache_dir = cache / "event_recordings" + cache_dir.mkdir(exist_ok=True) + + connector = YouTubeConnector( + cache_dir=cache_dir, youtube_api_key=youtube_api_key + ) + connector.fetch(cached) diff --git a/techcity/events/management/commands/resolve_event_recording_names.py b/techcity/events/management/commands/resolve_event_recording_names.py new file mode 100644 index 0000000..3480d2c --- /dev/null +++ b/techcity/events/management/commands/resolve_event_recording_names.py @@ -0,0 +1,122 @@ +import json +import os + +import requests +from django.core.management.base import BaseCommand + +from techcity.events.models import Event, EventRecording + + +class Command(BaseCommand): + help = "Attempt to resolve the names of event recordings" + + def add_arguments(self, parser): + parser.add_argument( + "--openai-api-key", + type=str, + help="The OpenAI API key to use for fetching data" + " (set in env OPENAI_API_KEY)", + default=os.getenv("OPENAI_API_KEY"), + ) + parser.add_argument( + "--openai-model", + type=str, + help="The OpenAI model to use for fetching data", + default="gpt-4o", + ) + + def handle(self, *args, openai_api_key: str, openai_model: str, **kwargs): + recording = EventRecording.objects.first() + + events = Event.objects.filter(group=recording.group) + + response = requests.post( + "https://api.openai.com/v1/chat/completions", + headers={ + "Authorization": f"Bearer {openai_api_key}", + "Content-Type": "application/json", + }, + json=dict( + model=openai_model, + messages=[ + { + "role": "system", + "content": """ + Resolve the name of this event recording + Return ID of the event it belongs to + """, + }, + { + "role": "user", + "content": f""" + Recording: {recording.__dict__} + + List of Events It may Belong To: + {[e.__dict__ for e in events]} + """, + }, + ], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "event_schema", + "schema": { + "type": "object", + "properties": { + "id": { + 
"description": "ID of the event", + "type": "number", + }, + "name": { + "description": "Name of the event", + "type": "string", + }, + }, + }, + }, + }, + ), + timeout=15, + ) + # Example: + + # { + # "id": "chatcmpl-AOmuw33UyJoiZlFBFZvwWe5Hy8nTf", + # "object": "chat.completion", + # "created": 1730471274, + # "model": "gpt-4o-2024-08-06", + # "choices": [ + # { + # "index": 0, + # "message": { + # "role": "assistant", + # "content": + # "{\"id\":75,\"name\":\"PDF Text Extraction (2nd Wed Talk)\"}", + # "refusal": null + # }, + # "logprobs": null, + # "finish_reason": "stop" + # } + # ], + # "usage": { + # "prompt_tokens": 11248, + # "completion_tokens": 17, + # "total_tokens": 11265, + # "prompt_tokens_details": { + # "cached_tokens": 0 + # }, + # "completion_tokens_details": { + # "reasoning_tokens": 0 + # } + # }, + # "system_fingerprint": "fp_45cf54deae" + # } + + response.raise_for_status() + data = response.json() + content = json.loads(data["choices"][0]["message"]["content"]) + + event = Event.objects.get(id=content["id"]) + + print(f"Recording: {recording.title=} {recording.published_at=}") + print(f"Event: {event.name=} {event.start_at=}") diff --git a/techcity/events/migrations/0002_eventrecording.py b/techcity/events/migrations/0002_eventrecording.py new file mode 100644 index 0000000..3910c11 --- /dev/null +++ b/techcity/events/migrations/0002_eventrecording.py @@ -0,0 +1,91 @@ +# Generated by Django 5.1.1 on 2024-10-31 13:41 + +import django.db.models.deletion +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("events", "0001_initial"), + ("groups", "0003_alter_group_kind"), + ] + + operations = [ + migrations.CreateModel( + name="EventRecording", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "recording_type", + models.CharField( + choices=[("YOUTUBE", "YouTube")], + help_text="The type of 
recording", + max_length=16, + ), + ), + ( + "title", + models.CharField( + help_text="The title of the recording", max_length=256 + ), + ), + ( + "description", + models.TextField(help_text="A description of the recording"), + ), + ("url", models.URLField(help_text="The URL to the recording")), + ( + "external_playlist_id", + models.CharField( + help_text="The ID of the playlist on the platform" + " (i.e. list=<...> on YouTube)", + max_length=256, + null=True, + ), + ), + ( + "external_id", + models.CharField( + help_text="The ID of the recording on the platform" + " (i.e. v=<...> on YouTube)", + max_length=256, + null=True, + ), + ), + ( + "metadata", + models.JSONField( + default=dict, help_text="Misc metadata about the recording" + ), + ), + ("created_at", models.DateTimeField(auto_now_add=True)), + ("updated_at", models.DateTimeField(auto_now=True)), + ( + "event", + models.ForeignKey( + blank=True, + help_text="The event that was recorded", + null=True, + on_delete=django.db.models.deletion.SET_NULL, + to="events.event", + ), + ), + ( + "group", + models.ForeignKey( + help_text="The group that recorded the event", + on_delete=django.db.models.deletion.CASCADE, + to="groups.group", + ), + ), + ], + ), + ] diff --git a/techcity/events/migrations/0003_eventrecording_published_at_and_more.py b/techcity/events/migrations/0003_eventrecording_published_at_and_more.py new file mode 100644 index 0000000..17a4b88 --- /dev/null +++ b/techcity/events/migrations/0003_eventrecording_published_at_and_more.py @@ -0,0 +1,50 @@ +# Generated by Django 5.1.1 on 2024-11-01 14:25 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("events", "0002_eventrecording"), + ] + + operations = [ + migrations.AddField( + model_name="eventrecording", + name="published_at", + field=models.DateTimeField( + blank=True, + help_text="The date and time the recording was published", + null=True, + ), + ), + migrations.AlterField( + 
model_name="eventrecording", + name="external_id", + field=models.CharField( + blank=True, + help_text="The ID of the recording on the platform" + " (i.e. v=<...> on YouTube)", + max_length=256, + null=True, + ), + ), + migrations.AlterField( + model_name="eventrecording", + name="external_playlist_id", + field=models.CharField( + blank=True, + help_text="The ID of the playlist on the platform" + " (i.e. list=<...> on YouTube)", + max_length=256, + null=True, + ), + ), + migrations.AlterField( + model_name="eventrecording", + name="metadata", + field=models.JSONField( + blank=True, default=dict, help_text="Misc metadata about the recording" + ), + ), + ] diff --git a/techcity/events/models.py b/techcity/events/models.py index 0de6d68..435910c 100644 --- a/techcity/events/models.py +++ b/techcity/events/models.py @@ -108,3 +108,86 @@ class Venue(models.Model): def __str__(self): return f"{self.address}, {self.city}, {self.state}" + + +class EventRecordingType(models.TextChoices): + YOUTUBE = "YOUTUBE", "YouTube" + + +class EventRecording(models.Model): + """ + A recording of an event, such as a video or podcast. + """ + + # Related Models + group = models.ForeignKey( + "groups.Group", + on_delete=models.CASCADE, + help_text="The group that recorded the event", + ) + event = models.ForeignKey( + Event, + on_delete=models.SET_NULL, + # Nullable, b/c there requires manual work to match the event + # after the recording is ingested. 
+ null=True, + blank=True, + help_text="The event that was recorded", + ) + + # Recording Details + recording_type = models.CharField( + max_length=16, + choices=EventRecordingType.choices, + help_text="The type of recording", + ) + title = models.CharField( + max_length=256, + help_text="The title of the recording", + ) + description = models.TextField( + help_text="A description of the recording", + ) + published_at = models.DateTimeField( + null=True, blank=True, help_text="The date and time the recording was published" + ) + # Including the URL here allows for more flexibility in the future, + # such as linking to a podcast feed. + # But for YouTube this is redundant with the external_id. + url = models.URLField( + help_text="The URL to the recording", + ) + external_playlist_id = models.CharField( + max_length=256, + help_text="The ID of the playlist on the platform (i.e. list=<...> on YouTube)", + null=True, + blank=True, + ) + external_id = models.CharField( + max_length=256, + help_text="The ID of the recording on the platform (i.e. 
v=<...> on YouTube)", + null=True, + blank=True, + ) + metadata = models.JSONField( + null=False, + blank=True, + default=dict, + help_text="Misc metadata about the recording", + ) + + # Timestamps + created_at = models.DateTimeField(auto_now_add=True) + updated_at = models.DateTimeField(auto_now=True) + + def get_embed_url(self) -> str | None: + if self.recording_type == EventRecordingType.YOUTUBE: + return f"https://www.youtube.com/embed/{self.external_id}" + ( + f"?list={self.external_playlist_id}" + if self.external_playlist_id + else "" + ) + return None + + def __str__(self): + return self.title diff --git a/uv.lock b/uv.lock index dedc5aa..68f460e 100644 --- a/uv.lock +++ b/uv.lock @@ -177,6 +177,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/33/2036a472eedfbe49240dffea965242b3f444de4ea4fbeceb82ccea33a2ce/django_debug_toolbar-4.4.6-py3-none-any.whl", hash = "sha256:3beb671c9ec44ffb817fad2780667f172bd1c067dbcabad6268ce39a81335f45", size = 229621 }, ] +[[package]] +name = "django-extensions" +version = "3.2.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "django" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/f1/318684c9466968bf9a9c221663128206e460c1a67f595055be4b284cde8a/django-extensions-3.2.3.tar.gz", hash = "sha256:44d27919d04e23b3f40231c4ab7af4e61ce832ef46d610cc650d53e68328410a", size = 277216 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/7e/ba12b9660642663f5273141018d2bec0a1cae1711f4f6d1093920e157946/django_extensions-3.2.3-py3-none-any.whl", hash = "sha256:9600b7562f79a92cbf1fde6403c04fee314608fefbb595502e34383ae8203401", size = 229868 }, +] + [[package]] name = "environs" version = "11.0.0" @@ -520,6 +532,7 @@ source = { virtual = "." 
} dependencies = [ { name = "django" }, { name = "django-debug-toolbar" }, + { name = "django-extensions" }, { name = "environs" }, { name = "gunicorn" }, { name = "huey" }, @@ -546,6 +559,7 @@ dev = [ requires-dist = [ { name = "django", specifier = ">=5.1.1" }, { name = "django-debug-toolbar", specifier = ">=4.4.6" }, + { name = "django-extensions", specifier = ">=3.2.3" }, { name = "environs", specifier = ">=11.0.0" }, { name = "gunicorn", specifier = ">=23.0.0" }, { name = "huey", specifier = ">=2.5.2" },