Skip to content

Commit

Permalink
Download scraped images from instagram profile (#373)
Browse files Browse the repository at this point in the history
  • Loading branch information
DEENUU1 committed Nov 24, 2023
1 parent 2e8b7f9 commit 425351b
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 26 deletions.
24 changes: 12 additions & 12 deletions metaspy/src/facebook/account/account_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,6 @@ def generate_image_file_name() -> str:
random_name = "".join(random.choice(string.ascii_letters) for _ in range(10))
return f"{random_name}.jpg"

@staticmethod
def check_image_type(image_content) -> bool:
"""
Check if file is an image
"""
try:
_ = Image.open(BytesIO(image_content))
return True
except Exception as e:
logs.log_error(f"Skipping image, Exception: {e}")
return False

def extract_image_urls(self) -> List[str]:
"""
Return a list of all the image urls
Expand Down Expand Up @@ -86,6 +74,18 @@ def extract_callback(driver):

return extracted_image_urls

@staticmethod
def check_image_type(image_content) -> bool:
    """
    Check if file is an image

    Args:
        image_content: Raw bytes of the downloaded file.

    Returns:
        True when Pillow can identify and verify the content as an image,
        False otherwise (the reason is written to the error log).
    """
    try:
        # Image.open() is lazy and only identifies the header; verify() asks
        # Pillow to check the file for breakage. The context manager makes
        # sure the image object is closed instead of being left dangling.
        with Image.open(BytesIO(image_content)) as image:
            image.verify()
        return True
    except Exception as e:
        # Pillow raises several unrelated exception types for bad data, so
        # the broad catch is kept: any failure means "not a usable image".
        logs.log_error(f"Skipping image, Exception: {e}")
        return False

def save_images(self, image_urls: List[str]) -> List[str]:
"""
Download and save images from url
Expand Down
113 changes: 106 additions & 7 deletions metaspy/src/instagram/instagram_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@
from rich import print as rprint
from typing import List
from ..repository.instagram_image_repository import create_image, image_exists
import os
import random
import string
import requests
from rich.progress import Progress
from ..utils import output, save_to_json
from ..repository import instagram_image_repository, instagram_account_repository
from io import BytesIO
from PIL import Image


logs = Logs()
Expand Down Expand Up @@ -33,6 +42,78 @@ def _refresh_driver(self) -> None:
def is_pipeline_successful(self) -> bool:
return self.success

@staticmethod
def generate_image_file_name() -> str:
    """
    Generate a random image file name

    Returns:
        A name made of 10 random ASCII letters followed by ``.jpg``.
    """
    # random.choices draws all characters in one call instead of looping
    # over random.choice by hand.
    random_name = "".join(random.choices(string.ascii_letters, k=10))
    return f"{random_name}.jpg"

@staticmethod
def check_image_type(image_content) -> bool:
    """
    Check if file is an image

    Args:
        image_content: Raw bytes of the downloaded file.

    Returns:
        True when Pillow can identify and verify the content as an image,
        False otherwise (the reason is written to the error log).
    """
    try:
        # Image.open() is lazy and only identifies the header; verify() asks
        # Pillow to check the file for breakage. The context manager makes
        # sure the image object is closed instead of being left dangling.
        with Image.open(BytesIO(image_content)) as image:
            image.verify()
        return True
    except Exception as e:
        # Pillow raises several unrelated exception types for bad data, so
        # the broad catch is kept: any failure means "not a usable image".
        logs.log_error(f"Skipping image, Exception: {e}")
        return False

def save_images(self, image_urls: List[str], timeout: float = 30.0) -> List[str]:
    """
    Download and save images from url

    Each URL is fetched and its content checked with ``check_image_type``.
    Content that passes is written to ``{Config.IMAGE_PATH}/{self._user_id}/``
    under a name from ``generate_image_file_name``. A failure on one URL is
    logged and that URL is skipped, so the rest of the batch still downloads.

    Args:
        image_urls: URLs of the images to download.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled server cannot block the whole run.

    Returns:
        Paths of the files that were actually written, in download order.
    """
    downloaded_image_paths = []
    total = len(image_urls)

    try:
        # Loop-invariant: the target directory is the same for every image.
        user_image_directory = os.path.dirname(
            f"{Config.IMAGE_PATH}/{self._user_id}/"
        )

        with Progress() as progress:
            task = progress.add_task("[cyan]Downloading...", total=total)
            for index, url in enumerate(image_urls, 1):
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    image_content = response.content

                    if self.check_image_type(image_content):
                        # Created lazily so nothing is left on disk when no
                        # image is valid; makedirs also creates the parents.
                        os.makedirs(user_image_directory, exist_ok=True)

                        image_path = os.path.join(
                            user_image_directory, self.generate_image_file_name()
                        )
                        with open(image_path, "wb") as file:
                            file.write(image_content)

                        # Record the path only after the write succeeded.
                        downloaded_image_paths.append(image_path)

                # RequestException must come first: it is an OSError subclass.
                except requests.exceptions.RequestException as req_err:
                    logs.log_error(f"Request error: {req_err}")
                except OSError as os_err:
                    logs.log_error(f"An error occurred: {os_err}")
                finally:
                    # Advance for skipped and failed URLs too, so the bar
                    # always reaches the total.
                    progress.update(
                        task,
                        advance=1,
                        description=f"[cyan]Downloading... ({index}/{total})",
                    )

    except Exception as e:
        # Last-resort guard so an unexpected failure still returns the paths
        # that were saved before it happened.
        logs.log_error(f"An error occurred: {e}")

    return downloaded_image_paths

def extract_images(self):
extracted_image_urls = []
try:
Expand All @@ -55,18 +136,36 @@ def extract_callback(driver):

return extracted_image_urls

def pipeline_images(self) -> List[str]:
def pipeline_images(self) -> None:
try:
rprint(f"[bold]Step 1 of 2 - Loading profile page[/bold]")
image_urls = self.extract_images()
self.success = True

rprint(f"[bold]Step 2 of 2 - Saving images to the database [/bold]")
for image_url in image_urls:
if not image_exists(image_url):
create_image(image_url)
if not image_urls:
output.print_no_data_info()
self._driver.quit()
self.success = False
else:
rprint(f"[bold]Step 2 of 2 - Downloading and saving images [/bold]")
image_paths = self.save_images(image_urls)

output.print_list(image_paths)

rprint(
"[bold red]Don't close the app![/bold red] Saving scraped data to database, it can take a while!"
)

save_to_json.SaveJSON(self._user_id, image_urls).save()

if not instagram_account_repository.account_exists(self._user_id):
instagram_account_repository.create_account(self._user_id)

account_id = instagram_account_repository.get_account(self._user_id).id
for image_path in image_paths:
instagram_image_repository.create_image(image_path, account_id)

return image_urls
self._driver.quit()
self.success = True

except Exception as e:
logs.log_error(f"An error occurred: {e}")
Expand Down
11 changes: 8 additions & 3 deletions metaspy/src/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,13 +237,18 @@ class InstagramAccount(Base):
number_of_posts = Column(Integer, nullable=True)
number_of_followers = Column(Integer, nullable=True)
number_of_following = Column(Integer, nullable=True)
images = relationship("InstagramImages", backref="account")

# Relationship
images = relationship("InstagramImages", back_populates="account")


class InstagramImages(Base):
__tablename__ = "iimages"

id = Column(Integer, primary_key=True, autoincrement=True)
url = Column(String, nullable=False)
path = Column(String, nullable=False)
downloaded = Column(Boolean, default=False)
account_id = Column(Integer, ForeignKey("iaccounts.id"), nullable=False)
account_id = Column(Integer, ForeignKey("iaccounts.id"))

# Relationship
account = relationship("InstagramAccount", back_populates="images")
4 changes: 2 additions & 2 deletions metaspy/src/repository/instagram_account_repository.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from typing import List, Optional
from typing import Optional
from ..database import get_session
from ..models import InstagramAccount


def account_exists(username: str) -> bool:
session = get_session()
account = session.query(InstagramAccount).filter_by(username=instagram_id).first()
account = session.query(InstagramAccount).filter_by(username=username).first()
return account is not None


Expand Down
4 changes: 2 additions & 2 deletions metaspy/src/repository/instagram_image_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ def image_exists(url: str) -> bool:
return image is not None


def create_image(url: str) -> InstagramImages:
def create_image(path: str, account_id: int) -> InstagramImages:
session = get_session()
image = InstagramImages(url=url)
image = InstagramImages(path=path, account_id=account_id)
session.add(image)
session.commit()
return image
Expand Down

0 comments on commit 425351b

Please sign in to comment.