-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
94 lines (87 loc) · 3.81 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import logging
import sys
import random
from playwright.sync_api import sync_playwright, Page
from utils.load import load_data
from utils.last_fm_add_genres import enter_genres
# --- Script configuration (evaluated at import time) -------------------------
# NOTE(review): because this runs on import, loading the module without the two
# command-line arguments exits immediately; consider moving it into main().
if len(sys.argv) < 3:
    # Fail with a usage hint instead of an opaque IndexError.
    sys.exit(f"Usage: {sys.argv[0]} <month> <year>")

# Archive month and year, used verbatim from the command line.
month = sys.argv[1]
year = sys.argv[2]

# Archive page for the requested month/year; run() navigates here first.
url = f"https://doyoutrackid.com/archive?month={month}&year={year}"

# Accumulator that main() passes to run() to collect the scraped track dicts.
data_list = []

# Unless an imported module has already configured logging, the root logger
# stays at WARNING and every logging.info() call in this file is dropped.
# basicConfig() does nothing when the root logger already has handlers, so
# this cannot override an existing configuration.
logging.basicConfig(level=logging.INFO)
logging.info("Scrapping data start successfully")
def scrap_tracks(page: Page, href: str, data_list: list):
    """Append one dict per track found on the current page to *data_list*.

    Waits for the track list items to be present, then reads the title,
    artist, album and release date of each one. A field whose element is
    missing is stored as a "No ..." placeholder string.

    Args:
        page: Page already showing a track listing.
        href: URL of that page; stored under the 'URL' key of every record.
        data_list: List to extend in place.

    Returns:
        The same *data_list* object, after the new records were appended.
    """
    item_selector = "li[class^='Tracks_listItem']"

    # (log label, record key, selector relative to the track item, placeholder)
    field_specs = (
        ("Title", "Title", "[class^='Track_title']", "No title"),
        ("Artist", "Artist", "[class^='Track_artist']", "No artist"),
        (
            "Album",
            "Album",
            "[class^='Track_album'] span[class^='Track_value']",
            "No album",
        ),
        (
            "Release Date",
            "Release_Date",
            "p[class^='Track_releaseDate'] span[class^='Track_value']",
            "No release date",
        ),
    )

    page.wait_for_selector(item_selector)
    for track_item in page.query_selector_all(item_selector):
        record = {}
        log_pairs = []
        for label, key, selector, placeholder in field_specs:
            node = track_item.query_selector(selector)
            value = node.inner_text().strip() if node else placeholder
            record[key] = value
            log_pairs.append((label, value))
        record['URL'] = href
        log_pairs.append(("URL", href))

        # Emit the same per-track log lines, in the same order, as the fields.
        logging.info('Starting data extract...')
        for label, value in log_pairs:
            logging.info(f"{label}: {value}")
        logging.info("-" * 40)

        data_list.append(record)
    return data_list
def run(page: Page, data_list: list):
    """Scrape the archive page at ``url`` and return the genre-enriched tracks.

    Opens the module-level archive ``url``, collects the link of every list
    entry, visits the first one, scrapes its tracks into *data_list* through
    ``scrap_tracks`` and hands the result to ``enter_genres``.

    Args:
        page: Playwright page used for all navigation.
        data_list: List that receives one dict per scraped track (mutated in
            place by ``scrap_tracks``).

    Returns:
        Whatever ``enter_genres`` returns for the collected tracks.
    """
    # NOTE(review): the "__SDPAB" suffix looks like a build-generated hash that
    # may change when the site is redeployed; scrap_tracks() uses prefix
    # matching ([class^=...]) instead — confirm against the live DOM before
    # switching this selector.
    entry_selector = 'li.BananaDates_listItem__SDPAB'

    # Go to the URL stored in the module-level variable
    page.goto(url)
    page.wait_for_selector(entry_selector)
    list_items = page.query_selector_all(entry_selector)

    href_links = []
    for item in list_items:
        # Find the anchor (<a>) tag inside the list item
        link = item.query_selector('a')
        href = link.get_attribute('href') if link else None
        if href:
            href_links.append(href)
            logging.info(f'Link: {href}')
        else:
            # No <a> tag, or an <a> without href: a None href would break the
            # URL concatenation below, so it is skipped here.
            logging.info('No link found in this list item.')

    # Bind the result up front so enter_genres() still receives a list when
    # there is nothing to visit (previously `tracks` was unbound in that case).
    tracks = data_list
    if not href_links:
        logging.warning('No links found on archive page: %s', url)

    # NOTE(review): only the first link is visited — looks like a debugging
    # limit; confirm before widening the slice.
    for href in href_links[:1]:
        full_url = "https://doyoutrackid.com" + href
        logging.info(f'Navigating to: {full_url}')
        page.goto(full_url)  # Navigate to the stored href link

        # Random 3000-6000 ms pause after navigation (presumably to avoid
        # hammering the site — confirm).
        delay_ms = random.randint(3000, 6000)
        page.wait_for_timeout(delay_ms)
        logging.info("wait for timeout" + str(delay_ms))

        page.wait_for_selector("li[class^='Tracks_listItem']")
        tracks = scrap_tracks(page, full_url, data_list)

    logging.info('Starting update genre process...')
    return enter_genres(tracks)
def main():
    """Launch a browser, scrape the tracks, load them, and report the outcome.

    Any exception raised along the way is logged at CRITICAL level with its
    traceback rather than propagated.

    Returns:
        0 when scraping and loading completed, 1 when any step raised.
    """
    try:
        with sync_playwright() as playwright:
            logging.info("Launching browser with Playwright.")
            browser = playwright.chromium.launch(headless=False)
            try:
                context = browser.new_context()
                page = context.new_page()
                logging.info("Browser launched successfully.")

                tracks = run(page, data_list)
                logging.info(f"Scraped a total of {len(tracks)} tracks.")

                load_data(tracks)
                logging.info("Data successfully loaded using load_data().")
            finally:
                # Close the browser on the error path too, not only on success.
                browser.close()
                logging.info("Browser closed successfully.")
    except Exception as e:
        logging.critical(f"An unexpected error occurred in the main function: {e}", exc_info=True)
        return 1
    return 0


if __name__ == "__main__":
    # Propagate the outcome as the process exit status so schedulers and shell
    # scripts can detect a failed run.
    sys.exit(main())