Skip to content

Commit

Permalink
update icon and description
Browse files Browse the repository at this point in the history
  • Loading branch information
wangsnoopy committed Nov 19, 2024
1 parent 10dbde2 commit b657640
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 3 deletions.
Binary file modified app/__pycache__/beautiful_soup_data.cpython-312.pyc
Binary file not shown.
Binary file modified app/__pycache__/rss_to_json.cpython-312.pyc
Binary file not shown.
82 changes: 82 additions & 0 deletions app/beautiful_soup_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
db = client['tech_news_db']
tools_collection = db['tools']

# Get tools data
def fetch_tools_data_to_json():
tools_collection.delete_many({})
# Scrape GitHub Trending page
Expand Down Expand Up @@ -58,3 +59,84 @@ def fetch_tools_data_to_json():
repositories.append(repository)

return repositories

# Function to get the icon url from the webpage
def fetch_icon_url(page_url):
    """Return the favicon URL declared by *page_url*, or None.

    Fetches the page, looks for a ``<link rel="icon">`` (or
    ``rel="shortcut icon"``) tag in its markup, and resolves the href
    against the page URL.

    Args:
        page_url: Absolute URL of the page to inspect.

    Returns:
        The absolute icon URL as a string, or None when the page is not
        reachable (non-200 status), declares no icon link, or the
        request raises.
    """
    from urllib.parse import urljoin  # stdlib; correct relative-URL resolution

    try:
        response = requests.get(page_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            icon_tag = soup.find("link", rel="icon") or soup.find("link", rel="shortcut icon")
            if icon_tag and icon_tag.get("href"):
                # urljoin handles absolute hrefs, root-relative ("/x"),
                # path-relative ("x"), and protocol-relative ("//cdn...")
                # forms. The previous page_url.rstrip('/') + href broke
                # whenever page_url carried a path component.
                return urljoin(page_url, icon_tag["href"])
        return None
    except requests.RequestException as e:
        print(f"Failed to fetch icon for {page_url}: {e}")
        return None

# products icon
def fetch_product_icon_url(page_url):
    """Scrape the product image URL from a product page.

    Looks for an ``<img class="relative z-0 rounded">`` tag, preferring
    its ``srcset`` (first candidate URL) over ``src``.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        The image URL string, or None when the page is unreachable, no
        matching ``<img>`` exists, or the request raises.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        img = page.find("img", class_="relative z-0 rounded")
        if not img:
            # No product <img> tag found; the caller may fall back to
            # the site favicon instead.
            return None

        # Prefer srcset when present; otherwise fall back to src.
        candidate = img.get("srcset") or img.get("src")
        # srcset entries look like "url1 1x, url2 2x" — keep only the
        # first URL (the token before the first whitespace).
        if candidate and " " in candidate:
            candidate = candidate.split()[0]
        return candidate
    except requests.RequestException as exc:
        print(f"Failed to fetch icon for {page_url}: {exc}")
        return None

# product tag line
def fetch_tagline(page_url):
    """Scrape the product tagline text from a product page.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        The stripped text of the matching ``<h2>`` tag, or None when the
        page is unreachable, no matching tag exists, or the request
        raises.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            # Implicitly returns None, matching the no-tag path below.
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        # The tagline lives in an <h2> carrying this exact class list.
        heading = page.find("h2", class_="text-24 font-light text-light-gray styles_tagline__Mhn2j")
        if heading:
            # Strip surrounding whitespace from the extracted text.
            return heading.text.strip()

        print("No matching <h2> tag found.")
        return None
    except requests.RequestException as exc:
        print(f"Failed to fetch tagline for {page_url}: {exc}")
        return None

# fetch description
def fetch_descriptions(page_url):
    """Scrape the product description paragraphs from a product page.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        A list of stripped description strings, the sentinel string
        ``"No descriptions found."`` when the page yields none, or None
        when the request fails or returns a non-200 status.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            # Mirrors the original fall-through: non-200 yields None.
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        # Every description paragraph is a <div> with this class list.
        tags = page.find_all("div", class_="styles_htmlText__eYPgj text-16 font-normal text-dark-gray")
        texts = [t.text.strip() for t in tags]

        # NOTE(review): a string sentinel mixed with a list return is an
        # awkward contract, but callers store the value as-is, so it is
        # preserved here byte-for-byte.
        if texts:
            return texts
        return "No descriptions found."
    except requests.RequestException as exc:
        print(f"Failed to fetch descriptions for {page_url}: {exc}")
        return None
17 changes: 14 additions & 3 deletions app/rss_to_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import feedparser
import os
import requests
from pymongo import MongoClient
import beautiful_soup_data as bsd
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
Expand All @@ -12,6 +14,7 @@
products_collection = db['products']
tools_collection = db['tools']


# Function to fetch and convert RSS feed to JSON for news
def fetch_rss_to_json(feed_url):
# Clear the existing data in the collection
Expand All @@ -22,10 +25,11 @@ def fetch_rss_to_json(feed_url):

# Extract relevant data from the RSS feed
for entry in feed.entries:
icon_url = bsd.fetch_icon_url(entry.link)
article = {
'title': entry.title,
'link': entry.link,
# 'summary': entry.summary,
'icon_url': icon_url,
'published': entry.published
}
articles.append(article)
Expand All @@ -48,13 +52,20 @@ def fetch_products_to_json(feed_url):

# Extract relevant data from the RSS feed
for entry in feed.entries:
icon_url = bsd.fetch_product_icon_url(entry.link)
tag_line = bsd.fetch_tagline(entry.link)
description = bsd.fetch_descriptions(entry.link)
if not icon_url:
icon_url = icon_url = bsd.fetch_icon_url(entry.link)
product = {
'guid': entry.get('guid'),
'url': entry.get('link'),
'title': entry.get('title'),
'content_html': entry.get('content_html', ''),
'date_published': entry.get('published'),
'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', '')
'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', ''),
'icon_url': icon_url,
'tag_line': tag_line,
'description': description
}
products.append(product)

Expand Down

0 comments on commit b657640

Please sign in to comment.