Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: differentiate shorts, lives and long videos #371

Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Differentiate shorts, lives, and normal videos (#367)
- Correct the short video resolution in the UI (#366)

### Changed

- Raise exception if there are no videos in the playlists (#347)
Expand Down
1 change: 1 addition & 0 deletions scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ dependencies = [
"pydantic==2.9.1",
"pyhumps==3.8.0",
"schedule==1.2.2",
"isodate==0.7.2",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
4 changes: 4 additions & 0 deletions scraper/src/youtube2zim/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class Video(CamelModel):
subtitle_path: str | None = None
subtitle_list: list[Subtitle]
duration: str
is_short: bool


class VideoPreview(CamelModel):
Expand Down Expand Up @@ -107,6 +108,9 @@ class Channel(CamelModel):
joined_date: str
collection_type: str
main_playlist: str | None = None
long_videos_playlist: str | None=None
shorts_playlist: str | None=None
lives_playlist: str | None=None
playlist_count: int


Expand Down
65 changes: 52 additions & 13 deletions scraper/src/youtube2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
skip_deleted_videos,
skip_non_public_videos,
skip_outofrange_videos,
is_short,
)

MAXIMUM_YOUTUBEID_LENGTH = 24
Expand Down Expand Up @@ -179,6 +180,9 @@ def __init__(
# process-related
self.playlists = []
self.uploads_playlist_id = None
self.long_videos_playlist_id = None
self.shorts_playlist_id = None
self.lives_playlist_id = None
self.videos_ids = []
self.video_ids_count = 0
self.videos_processed = 0
Expand Down Expand Up @@ -590,6 +594,9 @@ def extract_playlists(self):
self.playlists,
self.main_channel_id,
self.uploads_playlist_id,
self.long_videos_playlist_id,
self.shorts_playlist_id,
self.lives_playlist_id,
) = extract_playlists_details_from(self.collection_type, self.youtube_id)

def extract_videos_list(self):
Expand Down Expand Up @@ -1077,6 +1084,13 @@ def generate_video_object(video) -> Video:
author = videos_channels[video_id]
subtitles_list = get_subtitles(video_id)
channel_data = get_channel_json(author["channelId"])

channel_id=author["channelId"]
duration=videos_channels[video_id]["duration"],
publication_date=video["contentDetails"]["videoPublishedAt"],
# Check if the video is short
is_short_video = is_short(video_id,channel_id,duration,publication_date) # can be True or None
is_short_flag = True if is_short_video is True else False # Set True if is_short is True, otherwise False
return Video(
id=video_id,
title=video["snippet"]["title"],
Expand All @@ -1095,6 +1109,7 @@ def generate_video_object(video) -> Video:
subtitle_path=f"videos/{video_id}" if len(subtitles_list) > 0 else None,
subtitle_list=subtitles_list,
duration=videos_channels[video_id]["duration"],
is_short=is_short_flag,
)

def generate_video_preview_object(video) -> VideoPreview:
Expand Down Expand Up @@ -1187,6 +1202,9 @@ def get_playlist_slug(playlist) -> str:
home_playlist_list = []

main_playlist_slug = None
long_videos_playlist_slug = None
shorts_playlist_slug = None
lives_playlist_slug = None
if len(self.playlists) > 0:
main_playlist_slug = get_playlist_slug(
self.playlists[0]
Expand Down Expand Up @@ -1216,6 +1234,16 @@ def get_playlist_slug(playlist) -> str:
# modify playlist object for preview on homepage
playlist_obj.videos = playlist_obj.videos[:12]

if playlist.playlist_id == self.long_videos_playlist_id:
long_videos_playlist_slug = (playlist_slug)

if playlist.playlist_id == self.shorts_playlist_id:
shorts_playlist_slug = (playlist_slug)

if playlist.playlist_id == self.lives_playlist_id:
lives_playlist_slug= (playlist_slug)


if playlist.playlist_id == self.uploads_playlist_id:
main_playlist_slug = (
playlist_slug # set uploads playlist as main playlist
Expand Down Expand Up @@ -1251,22 +1279,33 @@ def get_playlist_slug(playlist) -> str:

# write channel.json file
channel_data = get_channel_json(self.main_channel_id)
channel_data_dict = {
benoit74 marked this conversation as resolved.
Show resolved Hide resolved
"id":str(self.main_channel_id),
"title":str(self.title),
"description":str(self.description),
"channel_name":channel_data["snippet"]["title"],
"channel_description":channel_data["snippet"]["description"],
"profile_path":"profile.jpg",
"banner_path":"banner.jpg",
"collection_type":self.collection_type,
"main_playlist":main_playlist_slug,
"playlist_count":len(self.playlists),
"joined_date":channel_data["snippet"]["publishedAt"],
}

if long_videos_playlist_slug is not None :
channel_data_dict["long_videos_playlist"] = long_videos_playlist_slug

if shorts_playlist_slug is not None :
channel_data_dict["shorts_playlist"] = shorts_playlist_slug

if lives_playlist_slug is not None :
channel_data_dict["lives_playlist"] = lives_playlist_slug

self.zim_file.add_item_for(
path="channel.json",
title=self.title,
content=Channel(
id=str(self.main_channel_id),
title=str(self.title),
description=str(self.description),
channel_name=channel_data["snippet"]["title"],
channel_description=channel_data["snippet"]["description"],
profile_path="profile.jpg",
banner_path="banner.jpg",
collection_type=self.collection_type,
main_playlist=main_playlist_slug,
playlist_count=len(self.playlists),
joined_date=channel_data["snippet"]["publishedAt"],
).model_dump_json(by_alias=True, indent=2),
content = Channel(**channel_data_dict).model_dump_json(by_alias=True, indent=2, exclude_none=True),
mimetype="application/json",
is_front=False,
)
Expand Down
Loading