Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fetch event recordings from youtube #94

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ SECRET_KEY='django-insecure-kzrs=c35hmbb*nkd3wgn%3!!86v2+qr_)rw(7r%kya@joxv!!9'
SENTRY_ENABLED=False
SENTRY_DSN=dsn_example
SESSION_COOKIE_SECURE=False
YOUTUBE_API_KEY=
TIME_ZONE=America/New_York
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ build:

shell:
docker compose run --rm web bash

django_shell:
uv run manage.py shell_plus
96 changes: 96 additions & 0 deletions docs/techcity/event-recording-ingestion.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
# Event Recording Ingestion

Ingests playlist items (YouTube videos) from the YouTube API and stores them in the database as "Event Recordings".

## Ingestion

Command to fetch event recordings from the YouTube API:

```sh
uv run manage.py fetch_event_recordings --help
```

## YouTube API

Set up a YouTube API key: https://developers.google.com/youtube/v3/docs

### Playlist Items List

Example Query using API Explorer:

https://developers.google.com/youtube/v3/docs/playlistItems/list?apix_params=%7B%22part%22%3A%5B%22id%22%2C%22contentDetails%22%2C%22snippet%22%2C%22status%22%5D%2C%22playlistId%22%3A%22PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF%22%7D#usage


```json
{
"kind": "youtube#playlistItemListResponse",
"etag": "WOkbLa1SbmIb8Fp9K6A0MNZFKSo",
"nextPageToken": "EAAajgFQVDpDQVVpRUVNeVJUZzFOalZCUVVaQk5qQXdNVGNvQVVqYmdMaVQ5YVNJQTFBQldrVWlRMmxLVVZSRldtcFRNRloyVGtkS1ptSnFSak5VVlZwdldXMXNiRnBJUWs1YU1tZDVWbXhLVlU1WVZrcFJNMVpIUldkM1NUVnZOMWwwWjFsUkxVMDNOM04zU1NJ",
"items": [
{
"kind": "youtube#playlistItem",
"etag": "G6NnaGF1mlFg2q_77NJfIn-F5oE",
"id": "UExGY0tFbzRiX24xd01GaGJpZWRwTWdoMlZSVDV1SUN1Ri45RjNFMDhGQ0Q2RkFCQTc1",
"snippet": {
"publishedAt": "2024-08-15T03:53:12Z",
"channelId": "UCA-ORpF9LEgECmkP3nvVLXQ",
"title": "PDF Text Extraction With Python",
"description": "Is your data locked up in portable document format (PDFs)? In this talk we're going to explore methods to extract text and other data from PDFs using readily-available, open-source Python tools (such as pypdf), as well as techniques such as OCR (optical character recognition) and table extraction. We will also discuss the philosophy of text extraction as a whole.\n\nSpeaker: Raju Rayavarapu\n\nRaju Rayavarapu is a scientist with a background in cancer biology, pharmacology, and drug development with a passion for understanding and using new technology. He is currently a data scientist at DNAnexus, a cloud-based data analysis and management platform. He also loves Disney Lorcana.",
"thumbnails": {
"default": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/default.jpg",
"width": 120,
"height": 90
},
"medium": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/mqdefault.jpg",
"width": 320,
"height": 180
},
"high": {
"url": "https://i.ytimg.com/vi/UlmyJl9_Gwc/hqdefault.jpg",
"width": 480,
"height": 360
}
},
"channelTitle": "Matt Layman",
"playlistId": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF",
"position": 0,
"resourceId": {
"kind": "youtube#video",
"videoId": "UlmyJl9_Gwc"
},
"videoOwnerChannelTitle": "Matt Layman",
"videoOwnerChannelId": "UCA-ORpF9LEgECmkP3nvVLXQ"
},
"contentDetails": {
"videoId": "UlmyJl9_Gwc",
"videoPublishedAt": "2024-08-15T05:19:54Z"
},
"status": {
"privacyStatus": "public"
}
},
// ...
],
"pageInfo": {
"totalResults": 34,
"resultsPerPage": 5
}
}
```

Example Embed Code:

```html
<iframe
width="1280"
height="720"
src="https://www.youtube.com/embed/UlmyJl9_Gwc?list=PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF"
title="PDF Text Extraction With Python"
frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
referrerpolicy="strict-origin-when-cross-origin"
allowfullscreen
></iframe>
```
1 change: 1 addition & 0 deletions project/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"django_extensions",
"huey.contrib.djhuey",
"techcity.core",
"techcity.events",
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies = [
"sqids>=0.5.0",
"urllib3>=2.2.3",
"whitenoise[brotli]>=6.7.0",
"django-extensions>=3.2.3",
]

[tool.uv]
Expand Down
8 changes: 7 additions & 1 deletion techcity/events/admin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from django.contrib import admin

from .models import Event, Venue
from .models import Event, EventRecording, Venue


@admin.register(Event)
Expand All @@ -12,3 +12,9 @@ class EventAdmin(admin.ModelAdmin):
@admin.register(Venue)
class VenueAdmin(admin.ModelAdmin):
    # Sort venues by street address in the admin changelist.
    ordering = ["address"]


@admin.register(EventRecording)
class EventRecordingAdmin(admin.ModelAdmin):
    # Show the full recording context in the changelist for quick scanning.
    list_display = ["group", "event", "title", "description", "url"]
    # Allow narrowing the changelist to a single group's recordings.
    list_filter = ["group"]
150 changes: 150 additions & 0 deletions techcity/events/connectors/youtube.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
import json
import logging
from pathlib import Path

import requests
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

from techcity.events.models import EventRecording, EventRecordingType
from techcity.groups.models import Group

# TODO - this could be a model / added to Group. For now, hardcoding.
# Maps a group slug to the YouTube playlist ID that holds that group's
# event recordings.
EVENT_RECORDING_SOURCES: dict[str, str] = {
    "python-frederick": "PLFcKEo4b_n1wMFhbiedpMgh2VRT5uICuF",
    "frederick-open-source": "PLFcKEo4b_n1zz3pGwC8e0RVaQf0c07jyd",
    "frederick-web-tech": "PLFcKEo4b_n1yistY9g3kyiXefYNPVZ1-X",
}


class YouTubeConnector:
    """A connector to fetch event recordings from YouTube.com.

    For each configured group, playlist items are fetched from the YouTube
    Data API (or read back from a JSON cache on disk) and upserted into the
    database as ``EventRecording`` rows.
    """

    def __init__(
        self,
        youtube_api_key: str,
        cache_dir: Path,
        group_slugs: list[str] | None = None,
    ):
        """Store credentials/paths and select which groups to process.

        Args:
            youtube_api_key: API key for the YouTube Data API (required).
            cache_dir: Directory where raw API responses are cached as JSON.
            group_slugs: Optional subset of group slugs to process; defaults
                to every group in ``EVENT_RECORDING_SOURCES``.

        Raises:
            ValueError: If no API key is provided.
        """
        self.youtube_api_key = youtube_api_key
        if not self.youtube_api_key:
            raise ValueError(
                "A YouTube API key is required to fetch data."
                "Set YOUTUBE_API_KEY in the environment."
            )

        self.cache_dir = cache_dir
        group_slugs = group_slugs or EVENT_RECORDING_SOURCES.keys()
        self.event_recording_sources = {
            group_slug: playlist_id
            for group_slug, playlist_id in EVENT_RECORDING_SOURCES.items()
            if group_slug in group_slugs
        }

    def fetch(self, cached: bool) -> None:
        """Fetch and store recordings for every selected group.

        When ``cached`` is true, the network is skipped and the JSON files
        written by a previous run are reused.
        """
        print("Fetching event recordings from YouTube...")

        for group_slug, playlist_id in self.event_recording_sources.items():
            if cached:
                print(f"Using cached data for {group_slug}...")
            else:
                self.fetch_to_cache(group_slug, playlist_id)

            self.generate_event_recordings(group_slug, playlist_id)

    def fetch_to_cache(self, group_slug: str, playlist_id: str) -> None:
        """
        Fetch the data from YouTube and save it to the cache.

        Uses the following:
        https://developers.google.com/youtube/v3/docs/playlistItems/list
        """
        retries = Retry(
            total=3,
            allowed_methods={"GET"},
            status_forcelist=[502, 503, 504],
            backoff_factor=0.1,
        )
        results = []
        page_token = ""

        # Close the session when done so pooled connections are released.
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retries))

            while True:
                response = session.get(
                    "https://www.googleapis.com/youtube/v3/playlistItems",
                    params={
                        # The API documents `part` as a single
                        # comma-separated value, not a repeated parameter.
                        "part": "id,contentDetails,snippet,status",
                        "playlistId": playlist_id,
                        "maxResults": 50,  # API maximum per page
                        "pageToken": page_token,
                        "key": self.youtube_api_key,
                    },
                    headers={
                        "Accept": "application/json",
                        "Referer": "https://techfrederick.org",
                    },
                    timeout=5,
                )
                response.raise_for_status()
                data = response.json()
                results.extend(data["items"])
                # A missing nextPageToken means the last page was reached.
                page_token = data.get("nextPageToken")
                if not page_token:
                    break

        with open(self.cache_dir / f"{group_slug}-youtube-videos.json", "w") as f:
            json.dump(results, f)

    def generate_event_recordings(self, group_slug: str, playlist_id: str) -> None:
        """Generate any event recordings found in cached API data."""
        group = Group.objects.get(slug=group_slug)
        with open(self.cache_dir / f"{group_slug}-youtube-videos.json") as f:
            youtube_video_data = json.load(f)

        for video_data in youtube_video_data:
            self.upsert_event_recording(group, video_data, playlist_id)

    def upsert_event_recording(
        self,
        group: Group,
        youtube_video_data: dict,
        playlist_id: str,
    ) -> None:
        """Upsert an event recording into the database."""

        snippet = youtube_video_data["snippet"]
        content_details = youtube_video_data["contentDetails"]

        # TODO - could extract Speaker? Speaker data should probably come
        # from the _Meetup_ event ingestion, not YouTube, because the video
        # descriptions do not set the speaker consistently.

        video_id = content_details["videoId"]
        recording, created = EventRecording.objects.update_or_create(
            recording_type=EventRecordingType.YOUTUBE,
            group=group,
            external_id=video_id,
            defaults={
                "title": snippet["title"],
                "description": snippet["description"],
                "url": f"https://www.youtube.com/watch?v={video_id}&list={playlist_id}",
                "external_playlist_id": playlist_id,
                "metadata": {
                    "snippet": snippet,
                },
            },
        )

        # Lazy %-style args avoid formatting when the log level is disabled.
        if created:
            logging.info("Created new recording: %s", recording.title)
        else:
            logging.info("Updated recording: %s", recording.title)
37 changes: 37 additions & 0 deletions techcity/events/management/commands/fetch_event_recordings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import os

from django.core.management.base import BaseCommand

from techcity.constants import cache
from techcity.events.connectors.youtube import YouTubeConnector


class Command(BaseCommand):
    help = "Fetch all event recordings from any of the source connectors"

    def add_arguments(self, parser):
        """Register the --cached and --youtube-api-key options."""
        parser.add_argument(
            "--cached",
            action="store_true",
            default=False,
            help="Use cached data instead of APIs if available",
        )
        parser.add_argument(
            "--youtube-api-key",
            type=str,
            help="The YouTube API key to use for fetching data"
            " (set in env YOUTUBE_API_KEY)",
            default=os.getenv("YOUTUBE_API_KEY"),
        )

    def handle(self, *args, cached: bool = False, youtube_api_key: str = "", **kwargs):
        """Create the cache directory tree and run the YouTube connector."""
        # The command fetches recordings, so say so in the output.
        self.stdout.write("Fetching event recordings...")

        # parents=True creates the base cache directory too, so the two
        # separate mkdir calls collapse into one robust call.
        cache_dir = cache / "event_recordings"
        cache_dir.mkdir(parents=True, exist_ok=True)

        connector = YouTubeConnector(
            cache_dir=cache_dir, youtube_api_key=youtube_api_key
        )
        connector.fetch(cached)
Loading