Skip to content

Commit

Permalink
update icon and description
Browse files Browse the repository at this point in the history
  • Loading branch information
wangsnoopy committed Nov 19, 2024
1 parent 10dbde2 commit b657640
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 3 deletions.
Binary file modified app/__pycache__/beautiful_soup_data.cpython-312.pyc
Binary file not shown.
Binary file modified app/__pycache__/rss_to_json.cpython-312.pyc
Binary file not shown.
82 changes: 82 additions & 0 deletions app/beautiful_soup_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
db = client['tech_news_db']
tools_collection = db['tools']

# Get tools data
def fetch_tools_data_to_json():
tools_collection.delete_many({})
# Scrape GitHub Trending page
Expand Down Expand Up @@ -58,3 +59,84 @@ def fetch_tools_data_to_json():
repositories.append(repository)

return repositories

# Function to get the icon url from the webpage
def fetch_icon_url(page_url):
    """Return the favicon URL declared by *page_url*, or None.

    Fetches the page, looks for a ``<link rel="icon">`` (or
    ``rel="shortcut icon"``) tag in its markup, and resolves the href
    against the page URL.

    Args:
        page_url: Absolute URL of the page to inspect.

    Returns:
        The absolute icon URL as a string, or None when the page is not
        reachable (non-200 status), declares no icon link, or the
        request raises.
    """
    from urllib.parse import urljoin  # stdlib; correct relative-URL resolution

    try:
        response = requests.get(page_url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            icon_tag = soup.find("link", rel="icon") or soup.find("link", rel="shortcut icon")
            if icon_tag and icon_tag.get("href"):
                # urljoin handles absolute hrefs, root-relative ("/x"),
                # path-relative ("x"), and protocol-relative ("//cdn...")
                # forms. The previous page_url.rstrip('/') + href broke
                # whenever page_url carried a path component.
                return urljoin(page_url, icon_tag["href"])
        return None
    except requests.RequestException as e:
        print(f"Failed to fetch icon for {page_url}: {e}")
        return None

# products icon
def fetch_product_icon_url(page_url):
    """Scrape the product image URL from a product page.

    Looks for an ``<img class="relative z-0 rounded">`` tag, preferring
    its ``srcset`` (first candidate URL) over ``src``.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        The image URL string, or None when the page is unreachable, no
        matching ``<img>`` exists, or the request raises.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        img = page.find("img", class_="relative z-0 rounded")
        if not img:
            # No product <img> tag found; the caller may fall back to
            # the site favicon instead.
            return None

        # Prefer srcset when present; otherwise fall back to src.
        candidate = img.get("srcset") or img.get("src")
        # srcset entries look like "url1 1x, url2 2x" — keep only the
        # first URL (the token before the first whitespace).
        if candidate and " " in candidate:
            candidate = candidate.split()[0]
        return candidate
    except requests.RequestException as exc:
        print(f"Failed to fetch icon for {page_url}: {exc}")
        return None

# product tag line
def fetch_tagline(page_url):
    """Scrape the product tagline text from a product page.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        The stripped text of the matching ``<h2>`` tag, or None when the
        page is unreachable, no matching tag exists, or the request
        raises.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            # Implicitly returns None, matching the no-tag path below.
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        # The tagline lives in an <h2> carrying this exact class list.
        heading = page.find("h2", class_="text-24 font-light text-light-gray styles_tagline__Mhn2j")
        if heading:
            # Strip surrounding whitespace from the extracted text.
            return heading.text.strip()

        print("No matching <h2> tag found.")
        return None
    except requests.RequestException as exc:
        print(f"Failed to fetch tagline for {page_url}: {exc}")
        return None

# fetch description
def fetch_descriptions(page_url):
    """Scrape the product description paragraphs from a product page.

    Args:
        page_url: Absolute URL of the product page.

    Returns:
        A list of stripped description strings, the sentinel string
        ``"No descriptions found."`` when the page yields none, or None
        when the request fails or returns a non-200 status.
    """
    try:
        resp = requests.get(page_url)
        if resp.status_code != 200:
            # Mirrors the original fall-through: non-200 yields None.
            return None

        page = BeautifulSoup(resp.content, 'html.parser')
        # Every description paragraph is a <div> with this class list.
        tags = page.find_all("div", class_="styles_htmlText__eYPgj text-16 font-normal text-dark-gray")
        texts = [t.text.strip() for t in tags]

        # NOTE(review): a string sentinel mixed with a list return is an
        # awkward contract, but callers store the value as-is, so it is
        # preserved here byte-for-byte.
        if texts:
            return texts
        return "No descriptions found."
    except requests.RequestException as exc:
        print(f"Failed to fetch descriptions for {page_url}: {exc}")
        return None
17 changes: 14 additions & 3 deletions app/rss_to_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import feedparser
import os
import requests
from pymongo import MongoClient
import beautiful_soup_data as bsd
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
Expand All @@ -12,6 +14,7 @@
products_collection = db['products']
tools_collection = db['tools']


# Function to fetch and convert RSS feed to JSON for news
def fetch_rss_to_json(feed_url):
# Clear the existing data in the collection
Expand All @@ -22,10 +25,11 @@ def fetch_rss_to_json(feed_url):

# Extract relevant data from the RSS feed
for entry in feed.entries:
icon_url = bsd.fetch_icon_url(entry.link)
article = {
'title': entry.title,
'link': entry.link,
# 'summary': entry.summary,
'icon_url': icon_url,
'published': entry.published
}
articles.append(article)
Expand All @@ -48,13 +52,20 @@ def fetch_products_to_json(feed_url):

# Extract relevant data from the RSS feed
for entry in feed.entries:
icon_url = bsd.fetch_product_icon_url(entry.link)
tag_line = bsd.fetch_tagline(entry.link)
description = bsd.fetch_descriptions(entry.link)
if not icon_url:
icon_url = icon_url = bsd.fetch_icon_url(entry.link)
product = {
'guid': entry.get('guid'),
'url': entry.get('link'),
'title': entry.get('title'),
'content_html': entry.get('content_html', ''),
'date_published': entry.get('published'),
'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', '')
'author': entry.get('author') if isinstance(entry.get('author'), str) else entry.get('author', {}).get('name', ''),
'icon_url': icon_url,
'tag_line': tag_line,
'description': description
}
products.append(product)

Expand Down

0 comments on commit b657640

Please sign in to comment.