import re
from typing import Dict, List, Set, Union

import pandas as pd
import requests
from bs4 import BeautifulSoup


def generate_postcodes() -> List[str]:
    """
    Generate a list of valid Australian postcodes based on state ranges.

    Returns:
        List[str]: A list of valid postcodes as strings.
    """
    # Ranges are inclusive and keyed by state/territory for readability.
    postcode_ranges = {
        'NSW': [(2000, 2599), (2619, 2899), (2921, 2999)],
        'ACT': [(2600, 2618), (2900, 2920)],
        'VIC': [(3000, 3999)],
        'QLD': [(4000, 4999)],
        'SA': [(5000, 5799)],
        'WA': [(6000, 6797)],
        'TAS': [(7000, 7799)],
        'NT': [(800, 899)]
    }
    valid_postcodes: List[str] = []
    for state, ranges in postcode_ranges.items():
        for start, end in ranges:
            # zfill pads NT's three-digit codes to the conventional
            # four-digit form (e.g. 800 -> '0800').
            valid_postcodes.extend([str(code).zfill(4) for code in range(start, end + 1)])
    return valid_postcodes
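
# A minimal usage sketch (illustrative only, not part of the original module):
#
#     >>> postcodes = generate_postcodes()
#     >>> postcodes[0]
#     '2000'
#     >>> '0800' in postcodes  # NT codes are zero-padded to four digits
#     True
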
def get_soup(url: str) -> Union[BeautifulSoup, None]:
    """
    Fetch a webpage and return a BeautifulSoup object.

    Args:
        url (str): The URL of the webpage to fetch.

    Returns:
        Union[BeautifulSoup, None]: A BeautifulSoup object if the request
            succeeds, None otherwise.
    """
    try:
        # Browser-like headers reduce the chance of the request being
        # rejected as an obvious bot.
        headers = {
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache"
        }
        params = {
            "noimg": 1  # Site-specific query parameter requesting a page without images
        }
        # A timeout prevents the scraper from hanging indefinitely on a
        # stalled connection.
        response = requests.get(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    return BeautifulSoup(response.text, 'html.parser')
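
# Illustrative call (the URL is a placeholder, not taken from the original code):
#
#     >>> soup = get_soup("https://example.com")
#     >>> soup.title.string if soup else None
#     'Example Domain'
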
def convert_to_weekly_price(price: str) -> Union[int, None]:
    """
    Convert a price string to a weekly price in integer form.

    Args:
        price (str): The price string to convert.

    Returns:
        Union[int, None]: The weekly price as an integer, or None if the
            price cannot be converted.
    """
    price = price.lower().replace(",", "").strip()
    match = re.search(r"\d+(\.\d+)?", price)
    if not match:
        print(f"Error: Unable to extract numeric value from '{price}'")
        return None
    price_value = float(match.group())
    if "pw" in price or "p/w" in price or "per week" in price or "/week" in price:
        return round(price_value)
    elif "pm" in price or "p/m" in price or "per month" in price:
        # 4.345 is the average number of weeks in a month (~52.14 / 12).
        return round(price_value / 4.345)
    elif "py" in price or "p/y" in price or "per year" in price or "per annum" in price:
        return round(price_value / 52)
    elif "$" in price or price.isdigit():
        # A bare dollar amount with no stated period is assumed to be weekly.
        return round(price_value)
    else:
        print(f"Warning: Unable to determine price frequency from '{price}'")
        return None
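
# A few illustrative conversions (values hypothetical):
#
#     >>> convert_to_weekly_price("$450 pw")
#     450
#     >>> convert_to_weekly_price("$2,000 per month")   # 2000 / 4.345
#     460
#     >>> convert_to_weekly_price("$26,000 per annum")  # 26000 / 52
#     500
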
def read_csv_into_set(file_path: str) -> Set[str]:
    """
    Read a CSV file into a set of listing IDs.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        Set[str]: A set of listing IDs.
    """
    try:
        df = pd.read_csv(file_path, header=0)
        # Cast to str so IDs compare consistently regardless of how
        # pandas inferred the column's dtype.
        listing_id_set = set(df['listing_id'].astype(str))
        return listing_id_set
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # A missing or empty file simply means no known listings yet.
        return set()
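
# Illustrative de-duplication check (the file name is hypothetical):
#
#     >>> seen = read_csv_into_set("listings.csv")
#     >>> "12345678" in seen
#     False
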
def write_listings_to_csv(original_file_path: str, new_listing_info: Dict[str, Dict[str, str]]) -> None:
    """
    Write new listing information to a CSV file.

    Args:
        original_file_path (str): The path to the original CSV file.
        new_listing_info (Dict[str, Dict[str, str]]): A dictionary containing
            new listing information, keyed by postcode and then by listing ID.
    """
    try:
        df = pd.read_csv(original_file_path, header=0)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Start a fresh frame with the expected columns if no file exists yet.
        headers = ['source', 'postcode', 'listing_id', 'url', 'beds', 'bath', 'weekly_price', 'scrape_timestamp']
        df = pd.DataFrame(columns=headers)
    # Normalise everything to str so existing and new rows concatenate
    # without mixed-dtype surprises.
    df = df.astype(str)
    data_to_append = []
    for postcode, info in new_listing_info.items():
        for listing_id, listing_info in info.items():
            # Drop the raw HTML before writing; it is not a CSV column.
            # Note this mutates the caller's dictionary in place.
            listing_info.pop('html', None)
            data_to_append.append(listing_info)
    new_df = pd.DataFrame(data_to_append)
    updated_df = pd.concat([df, new_df], ignore_index=True)
    updated_df.sort_values(by='postcode', inplace=True)
    updated_df.to_csv(original_file_path, index=False)
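

# A minimal end-to-end sketch (all values hypothetical, assuming the listing
# dicts carry the CSV columns named above):
if __name__ == "__main__":
    sample = {
        "2000": {
            "12345678": {
                "source": "example",
                "postcode": "2000",
                "listing_id": "12345678",
                "url": "https://example.com/listing/12345678",
                "beds": "2",
                "bath": "1",
                "weekly_price": str(convert_to_weekly_price("$650 pw")),
                "scrape_timestamp": "2024-01-01T00:00:00",
                "html": "<html>...</html>",  # stripped before writing
            }
        }
    }
    write_listings_to_csv("listings_demo.csv", sample)
    print(read_csv_into_set("listings_demo.csv"))  # {'12345678'}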