diff --git a/app/__pycache__/beautiful_soup_data.cpython-312.pyc b/app/__pycache__/beautiful_soup_data.cpython-312.pyc
index 306986b..57cd5af 100644
Binary files a/app/__pycache__/beautiful_soup_data.cpython-312.pyc and b/app/__pycache__/beautiful_soup_data.cpython-312.pyc differ
diff --git a/app/__pycache__/rss_to_json.cpython-312.pyc b/app/__pycache__/rss_to_json.cpython-312.pyc
index c8a5e7c..aa3b2c6 100644
Binary files a/app/__pycache__/rss_to_json.cpython-312.pyc and b/app/__pycache__/rss_to_json.cpython-312.pyc differ
diff --git a/app/beautiful_soup_data.py b/app/beautiful_soup_data.py
index 9039cb1..cb56fd0 100644
--- a/app/beautiful_soup_data.py
+++ b/app/beautiful_soup_data.py
@@ -12,6 +12,7 @@ db = client['tech_news_db']
 tools_collection = db['tools']
 
+# Get tools data
 def fetch_tools_data_to_json():
     tools_collection.delete_many({})
     # Scrape GitHub Trending page
@@ -58,3 +59,84 @@ def fetch_tools_data_to_json():
         repositories.append(repository)
 
     return repositories
+
+# Get the icon URL from a webpage
+def fetch_icon_url(page_url):
+    try:
+        response = requests.get(page_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, 'html.parser')
+            icon_tag = soup.find("link", rel="icon") or soup.find("link", rel="shortcut icon")
+            if icon_tag and icon_tag.get("href"):
+                icon_url = icon_tag["href"]
+                # Handle relative URLs
+                if icon_url.startswith('/'):
+                    return page_url.rstrip('/') + icon_url
+                return icon_url
+        return None
+    except requests.RequestException as e:
+        print(f"Failed to fetch icon for {page_url}: {e}")
+        return None
+
+# Product icon
+def fetch_product_icon_url(page_url):
+    try:
+        response = requests.get(page_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Look for an <img> tag first
+            img_tag = soup.find("img", class_="relative z-0 rounded")
+            if img_tag:
+                # Prefer srcset if available; fall back to src
+                icon_url = img_tag.get("srcset") or img_tag.get("src")
+
+                # If srcset exists, take the first URL (before the first space)
+                if icon_url and " " in icon_url:
+                    icon_url = icon_url.split()[0]
+
+                return icon_url
+
+            # If no <img> tag is found, the caller falls back to the site icon for the current link
+
+        return None
+    except requests.RequestException as e:
+        print(f"Failed to fetch icon for {page_url}: {e}")
+        return None
+
+# Product tagline
+def fetch_tagline(page_url):
+    try:
+        response = requests.get(page_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Find the <h2> tag with the specified class
+            tagline_tag = soup.find("h2", class_="text-24 font-light text-light-gray styles_tagline__Mhn2j")
+            if tagline_tag:
+                return tagline_tag.text.strip()  # Extract and clean up the text content
+
+            # If no matching tag is found
+            print("No matching <h2> tag found.")
+            return None
+
+    except requests.RequestException as e:
+        print(f"Failed to fetch tagline for {page_url}: {e}")
+        return None
+
+# Fetch descriptions
+def fetch_descriptions(page_url):
+    try:
+        response = requests.get(page_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, 'html.parser')
+
+            # Find all <div> tags with the specified class
+            description_tags = soup.find_all("div", class_="styles_htmlText__eYPgj text-16 font-normal text-dark-gray")
+            descriptions = [tag.text.strip() for tag in description_tags]  # Extract and clean text from each tag
+
+            return descriptions if descriptions else "No descriptions found."
+
+    except requests.RequestException as e:
+        print(f"Failed to fetch descriptions for {page_url}: {e}")
+        return None
\ No newline at end of file
diff --git a/app/rss_to_json.py b/app/rss_to_json.py
index b80dbe3..a50f515 100644
--- a/app/rss_to_json.py
+++ b/app/rss_to_json.py
@@ -1,6 +1,8 @@
 import feedparser
 import os
+import requests
 from pymongo import MongoClient
+import beautiful_soup_data as bsd
 from dotenv import load_dotenv
 # Load environment variables from .env file
 load_dotenv()
@@ -12,6 +14,7 @@ products_collection = db['products']
 tools_collection = db['tools']
 
+
 # Function to fetch and convert RSS feed to JSON for news
 def fetch_rss_to_json(feed_url):
     # Clear the existing data in the collection
@@ -22,10 +25,11 @@
     # Extract relevant data from the RSS feed
     for entry in feed.entries:
+        icon_url = bsd.fetch_icon_url(entry.link)
         article = {
             'title': entry.title,
             'link': entry.link,
-            # 'summary': entry.summary,
+            'icon_url': icon_url,
             'published': entry.published
         }
         articles.append(article)
@@ -48,13 +52,20 @@ def fetch_products_to_json(feed_url):
     # Extract relevant data from the RSS feed
     for entry in feed.entries:
+        icon_url = bsd.fetch_product_icon_url(entry.link)
+        tag_line = bsd.fetch_tagline(entry.link)
+        description = bsd.fetch_descriptions(entry.link)
+        if not icon_url:
+            icon_url = bsd.fetch_icon_url(entry.link)
         product = {
             'guid': entry.get('guid'),
             'url': entry.get('link'),
             'title': entry.get('title'),
-            'content_html': entry.get('content_html', ''),
             'date_published': entry.get('published'),
-            'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', '')
+            'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', ''),
+            'icon_url': icon_url,
+            'tag_line': tag_line,
+            'description': description
         }
         products.append(product)
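
Note on fetch_icon_url: resolving relative hrefs with page_url.rstrip('/') + icon_url only works when page_url is a bare origin; it breaks for pages with paths (e.g. /posts/x) and for protocol-relative hrefs like //cdn.example.com/icon.png. A minimal sketch of a more robust variant using urllib.parse.urljoin from the standard library (same name and behavior otherwise; the timeout is an added assumption, not part of the diff):

    from urllib.parse import urljoin

    import requests
    from bs4 import BeautifulSoup

    def fetch_icon_url(page_url):
        try:
            # Timeout added as a defensive assumption; the diff's version has none
            response = requests.get(page_url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                icon_tag = soup.find("link", rel="icon") or soup.find("link", rel="shortcut icon")
                if icon_tag and icon_tag.get("href"):
                    # urljoin resolves absolute, relative, and protocol-relative hrefs alike
                    return urljoin(page_url, icon_tag["href"])
            return None
        except requests.RequestException as e:
            print(f"Failed to fetch icon for {page_url}: {e}")
            return None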
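
To spot-check the new product fields end to end, a small driver along these lines should work (the feed URL below is a placeholder, not the project's configured feed):

    import feedparser
    import beautiful_soup_data as bsd

    FEED_URL = "https://example.com/products.rss"  # placeholder feed URL

    feed = feedparser.parse(FEED_URL)
    for entry in feed.entries[:3]:  # inspect a few entries only
        # Mirror the fallback logic in fetch_products_to_json
        icon_url = bsd.fetch_product_icon_url(entry.link) or bsd.fetch_icon_url(entry.link)
        print(entry.title)
        print("  icon:", icon_url)
        print("  tagline:", bsd.fetch_tagline(entry.link))
        print("  description:", bsd.fetch_descriptions(entry.link))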