# us_odp.py
import logging
import os
import string

import pandas as pd
import requests

def clean_filename(filename):
    valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
    cleaned_filename = ''.join(c for c in filename if c in valid_chars)
    cleaned_filename = cleaned_filename.replace(' ', '_')  # Replace spaces with underscores
    return cleaned_filename
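
# Example: clean_filename("U.S. Crime Data (2020)!") returns
# "U.S._Crime_Data_(2020)": the '!' is dropped and spaces become underscores.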
# Set up logging
logging.basicConfig(filename='log.txt', level=logging.INFO, format='%(message)s')
# Make a GET request to the data.gov CKAN API
response = requests.get("https://catalog.data.gov/api/3/action/package_search")
response.raise_for_status()  # Fail early on HTTP errors

# Convert the response to JSON
data = response.json()
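
# Note: package_search returns only the first 10 datasets by default.
# CKAN supports 'rows' and 'start' query parameters for larger pages and
# pagination, e.g. (a sketch, not used below):
#   response = requests.get(
#       "https://catalog.data.gov/api/3/action/package_search",
#       params={"rows": 100, "start": 0},
#   )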
# Create a directory for the datasets
path = "/Volumes/YING/Datasets/Feature_Discovery/us_odp"
os.makedirs(path, exist_ok=True)
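
# The hard-coded volume path above is machine-specific; a portable
# alternative (an assumption, not part of the original script) would be:
#   path = os.path.join(os.path.expanduser("~"), "Datasets", "us_odp")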
# Download the CSV files
count = 0
for dataset in data['result']['results']:
    for resource in dataset['resources']:
        if resource.get('format', '').lower() == 'csv':
            url = resource['url']
            filename = clean_filename(dataset['title']) + '.csv'
            logging.info(f'Downloading {url} to {filename}')
            try:
                # Use a separate name so the API response ('data') is not shadowed
                df = pd.read_csv(url)
                df.to_csv(f'{path}/{filename}', index=False)
                count += 1
            except Exception as e:
                logging.info(f'Could not download {url} because {str(e)}')
            break  # only download the first CSV file for each dataset
    if count >= 10:  # limit the number of datasets for this demo
        break
logging.info('Finished downloading datasets')
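
# Usage sketch (assumes network access and that the target volume is mounted):
#   python us_odp.py
#   tail log.txt   # per-file download status is appended here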