From 934b6eef768e57f4e5de7b8600fdb24889199923 Mon Sep 17 00:00:00 2001 From: jan Mikowa <66056347+Maritsu@users.noreply.github.com> Date: Fri, 5 Jul 2024 01:27:14 +0200 Subject: [PATCH] Various search improvements (#375) * remove "Lyrics" keyword from search * search with Youtube Music for better results (#373,#374) - add ytmusicapi as dependency - modify dump_json() and find_and_download_songs() to use YT Music * add function to get closest matching string via Levenshtein edit distance * implement getting closest match in search results * update CI and package setup to python>=3.8 * Update .github/workflows/tests.yml --------- Co-authored-by: Sathyajith Bhat --- .github/workflows/tests.yml | 2 +- requirements.txt | 4 +++- setup.py | 5 +++-- spotify_dl/utils.py | 17 +++++++++++++++++ spotify_dl/youtube.py | 34 +++++++++++++++++++++++++++------- 5 files changed, 51 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dad84317..d9849a2f 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: [3.8, 3.9, '3.10', 3.11, 3.12] steps: - uses: actions/checkout@v2 diff --git a/requirements.txt b/requirements.txt index 9edf4a78..0b731740 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ yt-dlp>=2023.3.4 spotipy~=2.21 mutagen~=1.45 rich~=12.0 -urllib3~=1.26 \ No newline at end of file +urllib3~=1.26 +ytmusicapi~=1.6.0 +Levenshtein~=0.25.1 diff --git a/setup.py b/setup.py index 2fce4a57..40e3dd75 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setup( name="spotify_dl", version=VERSION, - python_requires=">=3.7", + python_requires=">=3.8", install_requires=requirements, author="Sathya Bhat", author_email="sathya@sathyasays.com", @@ -36,10 +36,11 @@ "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Internet", "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", diff --git a/spotify_dl/utils.py b/spotify_dl/utils.py index 3a2e9d0a..89a2fe80 100644 --- a/spotify_dl/utils.py +++ b/spotify_dl/utils.py @@ -1,3 +1,20 @@ +import Levenshtein + + +def get_closest_match(results, expected) -> str: + """ + Returns closest matching result based on Levenshtein edit distance. + """ + best_r = "" + min_distance = float('inf') + for r in results: + curr_distance = Levenshtein.distance(r, expected) + if (curr_distance < min_distance): + min_distance = curr_distance + best_r = r + return best_r + + def sanitize(name, replace_with=""): """ Removes some of the reserved characters from the name so it can be saved diff --git a/spotify_dl/youtube.py b/spotify_dl/youtube.py index f3b68467..de37389c 100644 --- a/spotify_dl/youtube.py +++ b/spotify_dl/youtube.py @@ -7,11 +7,12 @@ import mutagen import csv import yt_dlp +import ytmusicapi from mutagen.easyid3 import EasyID3 from mutagen.id3 import APIC, ID3 from mutagen.mp3 import MP3 from spotify_dl.scaffold import log -from spotify_dl.utils import sanitize +from spotify_dl.utils import sanitize, get_closest_match from spotify_dl.constants import DOWNLOAD_LIST @@ -33,7 +34,7 @@ def dump_json(songs): :param songs: the songs for which the JSON should be output """ for song in songs: - query = f"{song.get('artist')} - {song.get('name')} Lyrics".replace( + query = f"{song.get('artist')} - {song.get('name')}".replace( ":", "" ).replace('"', "") @@ -41,8 +42,18 @@ def dump_json(songs): with yt_dlp.YoutubeDL(ydl_opts) as ydl: try: - ytJson = ydl.extract_info("ytsearch:" + query, False) - print(json.dumps(ytJson.get("entries"))) + ytJson = {} + with ytmusicapi.YTMusic() as ym: + # Reduce results to array of titles and video IDs + result_titles, result_ids = zip(*map( + lambda d: (f"{d['artists'][0]['name']} - {d['title']}".replace(":", "").replace('"', ""), d["videoId"]), + ym.search(query, filter="songs") + )) + # Get ID of closest matching result by finding index in titles list + videoId = result_ids[result_titles.index(get_closest_match(result_titles, query))] + + ytJson = ydl.extract_info(f"https://music.youtube.com/watch?v={videoId}", False) + print(json.dumps([ytJson])) # insert into array so that the format stays the same except Exception as e: # skipcq: PYL-W0703 log.debug(e) print( @@ -144,7 +155,7 @@ def set_tags(temp, filename, kwargs): def find_and_download_songs(kwargs): """ function handles actual download of the songs - the youtube_search lib is used to search for songs and get best url + the ytmusicapi lib is used to search for songs and get best url via YT Music :param kwargs: dictionary of key value arguments to be used in download """ sponsorblock_postprocessor = [] @@ -160,7 +171,7 @@ def find_and_download_songs(kwargs): int(temp[-1].replace("\n", "")), ) - query = f"{artist} - {name} Lyrics".replace(":", "").replace('"', "") + query = f"{artist} - {name}".replace(":", "").replace('"', "") print(f"Initiating download for {query}.") file_name = kwargs["file_name_f"]( @@ -200,6 +211,15 @@ def find_and_download_songs(kwargs): print(f"File {mp3file_path} already exists, we do not overwrite it ") continue + with ytmusicapi.YTMusic() as ym: + # Reduce search results to array of titles and video IDs + result_titles, result_ids = zip(*map( + lambda d: (f"{d['artists'][0]['name']} - {d['title']}".replace(":", "").replace('"', ""), d["videoId"]), + ym.search(query, filter="songs") + )) + # Get ID of closest matching result by finding index in titles list + video_id = result_ids[result_titles.index(get_closest_match(result_titles, query))] + outtmpl = f"{file_path}.%(ext)s" ydl_opts = { "proxy": kwargs.get("proxy"), @@ -227,7 +247,7 @@ def find_and_download_songs(kwargs): ydl_opts["postprocessors"].append(mp3_postprocess_opts.copy()) with yt_dlp.YoutubeDL(ydl_opts) as ydl: try: - ydl.download([query]) + ydl.download([f"https://music.youtube.com/watch?v={video_id}"]) except Exception as e: # skipcq: PYL-W0703 log.debug(e) print(f"Failed to download {name}, make sure yt_dlp is up to date")