-
-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c5b15e9
commit 3f994ef
Showing
8 changed files
with
1,245 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import warnings | ||
import pandas as pd | ||
from .core.scrapers import ScraperInput | ||
from .utils import process_result, ordered_properties, validate_input, validate_dates | ||
from .core.scrapers.realtor import RealtorScraper | ||
from .core.scrapers.models import ListingType | ||
|
||
|
||
def scrape_property(
    location: str,
    listing_type: str = "for_sale",
    radius: float | None = None,
    mls_only: bool = False,
    past_days: int | None = None,
    proxy: str | None = None,
    date_from: str | None = None,
    date_to: str | None = None,
    foreclosure: bool | None = None,
) -> pd.DataFrame:
    """
    Scrape properties from Realtor.com based on a given location and listing type.

    :param location: Location to search (e.g. "Dallas, TX", "85281", "2530 Al Lipscomb Way")
    :param listing_type: Listing Type (for_sale, for_rent, sold, pending). Checked by
        ``validate_input`` and then resolved to a ``ListingType`` member by its upper-cased name.
    :param radius: Get properties within _ (e.g. 1.0) miles. Only applicable for individual addresses.
    :param mls_only: If set, fetches only listings with MLS IDs.
    :param past_days: Get properties sold or listed (dependent on your listing_type) in the last _ days.
    :param date_from, date_to: Get properties sold or listed (dependent on your listing_type) between these dates. format: 2021-01-28
    :param proxy: Proxy to use for scraping
    :param foreclosure: Forwarded unchanged to the scraper as ``ScraperInput.foreclosure``;
        ``None`` (the default) leaves it unset.
    :return: One row per scraped property with columns selected by ``ordered_properties``,
        or an empty DataFrame when the search yields nothing.
    """
    validate_input(listing_type)
    validate_dates(date_from, date_to)

    scraper_input = ScraperInput(
        location=location,
        listing_type=ListingType[listing_type.upper()],
        proxy=proxy,
        radius=radius,
        mls_only=mls_only,
        last_x_days=past_days,
        date_from=date_from,
        date_to=date_to,
        foreclosure=foreclosure,
    )

    site = RealtorScraper(scraper_input)
    results = site.search()

    properties_dfs = [process_result(result) for result in results]
    if not properties_dfs:
        # Nothing to concatenate: pd.concat raises on an empty list.
        return pd.DataFrame()

    # Silence pandas FutureWarnings emitted while concatenating the per-property frames.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        return pd.concat(properties_dfs, ignore_index=True, axis=0)[ordered_properties]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
import argparse | ||
import datetime | ||
from homeharvest import scrape_property | ||
|
||
|
||
def main():
    """Command-line entry point.

    Parses the CLI options, runs ``scrape_property`` with them and writes the
    resulting table to ``<filename>.xlsx`` or ``<filename>.csv``. When no
    filename is supplied, a timestamped ``HomeHarvest_YYYYmmdd_HHMMSS`` name is
    used.
    """
    cli = argparse.ArgumentParser(description="Home Harvest Property Scraper")

    # Positional search target.
    cli.add_argument("location", type=str, help="Location to scrape (e.g., San Francisco, CA)")

    # Optional switches, registered in the order they appear in --help.
    cli.add_argument(
        "-l",
        "--listing_type",
        type=str,
        default="for_sale",
        choices=["for_sale", "for_rent", "sold", "pending"],
        help="Listing type to scrape",
    )
    cli.add_argument(
        "-o",
        "--output",
        type=str,
        default="excel",
        choices=["excel", "csv"],
        help="Output format",
    )
    cli.add_argument(
        "-f",
        "--filename",
        type=str,
        default=None,
        help="Name of the output file (without extension)",
    )
    cli.add_argument("-p", "--proxy", type=str, default=None, help="Proxy to use for scraping")
    cli.add_argument(
        "-d",
        "--days",
        type=int,
        default=None,
        help="Sold/listed in last _ days filter.",
    )
    cli.add_argument(
        "-r",
        "--radius",
        type=float,
        default=None,
        help="Get comparable properties within _ (eg. 0.0) miles. Only applicable for individual addresses.",
    )
    cli.add_argument(
        "-m",
        "--mls_only",
        action="store_true",
        help="If set, fetches only MLS listings.",
    )

    opts = cli.parse_args()

    frame = scrape_property(
        opts.location,
        opts.listing_type,
        radius=opts.radius,
        proxy=opts.proxy,
        mls_only=opts.mls_only,
        past_days=opts.days,
    )

    # Fall back to a timestamped base name when the user gave none.
    base_name = opts.filename
    if not base_name:
        stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        base_name = f"HomeHarvest_{stamp}"

    if opts.output == "excel":
        destination = f"{base_name}.xlsx"
        frame.to_excel(destination, index=False)
        print(f"Excel file saved as {destination}")
    elif opts.output == "csv":
        destination = f"{base_name}.csv"
        frame.to_csv(destination, index=False)
        print(f"CSV file saved as {destination}")


if __name__ == "__main__":
    main()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from dataclasses import dataclass | ||
import requests | ||
from requests.adapters import HTTPAdapter | ||
from urllib3.util.retry import Retry | ||
import uuid | ||
from .models import Property, ListingType, SiteName | ||
|
||
|
||
@dataclass
class ScraperInput:
    """Bundle of search parameters handed to a ``Scraper``.

    Only ``location`` and ``listing_type`` are required; every other field
    defaults to ``None``.
    """

    # Required search target and listing category.
    location: str
    listing_type: ListingType

    # Optional filters and transport settings.
    radius: float | None = None
    mls_only: bool | None = None
    proxy: str | None = None  # when set, used for both the http and https session proxies
    last_x_days: int | None = None
    date_from: str | None = None
    date_to: str | None = None
    foreclosure: bool | None = None
|
||
|
||
class Scraper:
    """Base scraper holding the search parameters and a shared HTTP session.

    The ``requests.Session`` is stored on the class, so it is created and
    authenticated once and then reused by every instance. ``search``,
    ``_parse_home`` and ``handle_location`` are placeholders here.
    """

    # Shared across all instances; populated by the first successful __init__.
    session = None

    def __init__(
        self,
        scraper_input: ScraperInput,
    ):
        """Copy the search parameters and make sure the shared session exists.

        :param scraper_input: search parameters; ``proxy`` (if set) is applied
            to the shared session and to the access-token request.
        :raises Exception: if no access token could be obtained while creating
            the shared session.
        """
        self.location = scraper_input.location
        self.listing_type = scraper_input.listing_type
        self.radius = scraper_input.radius
        self.last_x_days = scraper_input.last_x_days
        self.mls_only = scraper_input.mls_only
        self.date_from = scraper_input.date_from
        self.date_to = scraper_input.date_to
        self.foreclosure = scraper_input.foreclosure
        # Kept on the instance so get_access_token() can route through the same proxy.
        self.proxy = scraper_input.proxy

        if self.session is None:
            # Configure a local session first and publish it only once the token
            # header is in place; otherwise a failed token fetch would leave a
            # half-configured shared session for every later instance.
            session = requests.Session()
            print("Session created")
            retries = Retry(
                total=3, backoff_factor=3, status_forcelist=[429, 403], allowed_methods=frozenset(["GET", "POST"])
            )

            adapter = HTTPAdapter(max_retries=retries)
            session.mount("http://", adapter)
            session.mount("https://", adapter)
            session.headers.update(
                {
                    "auth": f"Bearer {self.get_access_token()}",
                    "apollographql-client-name": "com.move.Realtor-apollo-ios",
                }
            )
            Scraper.session = session

        if scraper_input.proxy:
            proxy_url = scraper_input.proxy
            proxies = {"http": proxy_url, "https": proxy_url}
            self.session.proxies.update(proxies)

    def search(self) -> list[Property]: ...

    @staticmethod
    def _parse_home(home) -> Property: ...

    def handle_location(self): ...

    def get_access_token(self):
        """Request a device access token from graph.realtor.com.

        A fresh upper-cased UUID is sent as the device id on every call. The
        request goes through ``self.proxy`` when one is configured.

        :return: the ``access_token`` value from the JSON response.
        :raises Exception: if the response is not JSON or carries no
            ``access_token``.
        """
        url = "https://graph.realtor.com/auth/token"

        payload = f'{{"client_app_id":"rdc_mobile_native,24.20.4.149916,iphone","device_id":"{str(uuid.uuid4()).upper()}","grant_type":"device_mobile"}}'
        headers = {
            "Host": "graph.realtor.com",
            "x-client-version": "24.20.4.149916",
            "accept": "*/*",
            "content-type": "Application/json",
            "user-agent": "Realtor.com/24.20.4.149916 CFNetwork/1410.0.3 Darwin/22.6.0",
            "accept-language": "en-US,en;q=0.9",
        }

        # getattr keeps this safe for callers that bypass Scraper.__init__.
        proxy_url = getattr(self, "proxy", None)
        proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else None
        response = requests.post(url, headers=headers, data=payload, proxies=proxies)

        try:
            # Decoding happens inside the try so an HTML/empty error page is
            # reported with the same message as a missing token.
            access_token = response.json()["access_token"]
        except (KeyError, TypeError, ValueError) as err:
            raise Exception("Could not get access token, use a proxy/vpn or wait") from err
        return access_token
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
from dataclasses import dataclass | ||
from enum import Enum | ||
from typing import Optional | ||
|
||
|
||
class SiteName(Enum):
    """Names of the listing sites, keyed by their string identifier."""

    ZILLOW = "zillow"
    REDFIN = "redfin"
    REALTOR = "realtor.com"

    @classmethod
    def get_by_value(cls, value):
        """Return the member whose ``value`` equals *value*.

        :raises ValueError: if no member carries that value.
        """
        # Members are never None, so None is a safe "not found" marker.
        found = next((member for member in cls if member.value == value), None)
        if found is None:
            raise ValueError(f"{value} not found in {cls}")
        return found
|
||
|
||
class ListingType(Enum):
    """Listing categories a search can target; each value equals its member name."""

    FOR_SALE = "FOR_SALE"
    FOR_RENT = "FOR_RENT"
    PENDING = "PENDING"
    SOLD = "SOLD"
|
||
|
||
@dataclass | ||
class Agent: | ||
name: str | None = None | ||
phone: str | None = None | ||
|
||
|
||
class PropertyType(Enum):
    """Property style categories; each value equals its member name."""

    APARTMENT = "APARTMENT"
    BUILDING = "BUILDING"
    COMMERCIAL = "COMMERCIAL"
    CONDO_TOWNHOME = "CONDO_TOWNHOME"
    CONDO_TOWNHOME_ROWHOME_COOP = "CONDO_TOWNHOME_ROWHOME_COOP"
    CONDO = "CONDO"
    CONDOS = "CONDOS"
    COOP = "COOP"
    DUPLEX_TRIPLEX = "DUPLEX_TRIPLEX"
    FARM = "FARM"
    INVESTMENT = "INVESTMENT"
    LAND = "LAND"
    MOBILE = "MOBILE"
    MULTI_FAMILY = "MULTI_FAMILY"
    RENTAL = "RENTAL"
    SINGLE_FAMILY = "SINGLE_FAMILY"
    TOWNHOMES = "TOWNHOMES"
    OTHER = "OTHER"
|
||
|
||
@dataclass | ||
class Address: | ||
street: str | None = None | ||
unit: str | None = None | ||
city: str | None = None | ||
state: str | None = None | ||
zip: str | None = None | ||
|
||
|
||
@dataclass
class Description:
    """Optional descriptive details of a property; every field defaults to None."""

    # Photos.
    primary_photo: str | None = None
    alt_photos: list[str] | None = None

    # Categorisation and numeric facts.
    style: PropertyType | None = None
    beds: int | None = None
    baths_full: int | None = None
    baths_half: int | None = None
    sqft: int | None = None
    lot_sqft: int | None = None
    sold_price: int | None = None
    year_built: int | None = None
    garage: float | None = None
    stories: int | None = None

    # Free-form text.
    text: str | None = None
|
||
|
||
@dataclass | ||
class Agent: | ||
name: str | None = None | ||
phone: str | None = None | ||
|
||
|
||
@dataclass
class Property:
    """A single scraped listing.

    Only ``property_url`` is required; every other field defaults to ``None``.
    """

    # Identification.
    property_url: str
    mls: str | None = None
    mls_id: str | None = None
    status: str | None = None
    address: Address | None = None

    # Pricing, dates and description.
    list_price: int | None = None
    list_date: str | None = None
    pending_date: str | None = None
    last_sold_date: str | None = None
    prc_sqft: int | None = None
    hoa_fee: int | None = None
    days_on_mls: int | None = None
    description: Description | None = None

    # Location and supplementary data.
    latitude: float | None = None
    longitude: float | None = None
    neighborhoods: str | None = None
    county: str | None = None
    fips_code: str | None = None
    # These two default to None (not an empty list), so the annotation says so.
    agents: list[Agent] | None = None
    nearby_schools: list[str] | None = None
    assessed_value: int | None = None
    estimated_value: int | None = None
Oops, something went wrong.