import hashlib
import json
import os
from concurrent.futures import ThreadPoolExecutor

import requests
from tqdm import tqdm


# Function to compute the MD5 hash of a file, streamed in 4 KiB chunks so
# large files are never loaded into memory at once
def calculate_md5(file_path):
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


# Function to download a single byte range of a file (for multi-connection downloads)
def download_chunk(url, start, end, dest_path, index):
    # HTTP Range headers are inclusive on both ends: bytes=0-1023 is 1024 bytes
    headers = {'Range': f'bytes={start}-{end}'}
    response = requests.get(url, headers=headers, stream=True)
    with open(dest_path, 'r+b') as f:
        f.seek(start)
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)
    return index


# Function to download a file using multiple connections
def download_file_multi_connection(url, dest_path, total_size, num_connections=4):
    chunk_size = total_size // num_connections
    chunks = [(i * chunk_size, (i + 1) * chunk_size - 1) for i in range(num_connections)]
    chunks[-1] = (chunks[-1][0], total_size - 1)  # Ensure the last chunk reaches the end

    # Pre-allocate the file at its final size so each thread can write its range in place
    with open(dest_path, 'wb') as f:
        f.truncate(total_size)

    # Download the chunks concurrently, one thread per connection
    with ThreadPoolExecutor(max_workers=num_connections) as executor:
        futures = [
            executor.submit(download_chunk, url, start, end, dest_path, idx)
            for idx, (start, end) in enumerate(chunks)
        ]
        for future in tqdm(futures, desc="Downloading file with multiple connections"):
            future.result()
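
# As a concrete illustration of the chunk math above (the numbers are made up
# for the example): a 1,000,001-byte file split across 4 connections gives
# chunk_size = 250000 and the inclusive ranges bytes=0-249999,
# bytes=250000-499999, bytes=500000-749999, and bytes=750000-999999, which the
# clamp then widens to bytes=750000-1000000 so the remainder from the integer
# division is not lost.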


# Function to download and verify a single file
def download_and_verify_file(resource, main_url, base_dir, failed_resources, use_multi_connection, num_connections):
    dest_path = os.path.join(base_dir, resource['dest'].lstrip('/'))
    url = f"{main_url}/{resource['dest'].lstrip('/')}"

    # Ensure the destination directory exists
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    # Skip the download if the file already exists with a matching hash
    if os.path.exists(dest_path):
        existing_md5 = calculate_md5(dest_path)
        if existing_md5 == resource['md5']:
            print(f"File {resource['dest']} already exists and is valid. Skipping download.")
            return True

    total_size = 0
    if use_multi_connection:
        # Multi-connection downloads need the total size up front, so ask via HEAD
        response = requests.head(url)
        total_size = int(response.headers.get('content-length', 0))

    if use_multi_connection and total_size > 0:
        # Multi-connection download
        download_file_multi_connection(url, dest_path, total_size, num_connections=num_connections)
    else:
        # Single-connection download (also the fallback when the server does
        # not report a content-length, which the range math depends on)
        response = requests.get(url, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # 1 Kibibyte
        t = tqdm(total=total_size, unit='iB', unit_scale=True)
        with open(dest_path, 'wb') as f:
            for data in response.iter_content(block_size):
                t.update(len(data))
                f.write(data)
        t.close()

    # Verify file integrity against the expected MD5 hash
    downloaded_md5 = calculate_md5(dest_path)
    if downloaded_md5 != resource['md5']:
        print(f"Hash mismatch for {resource['dest']}. Expected {resource['md5']}, got {downloaded_md5}")
        failed_resources.append(resource)
        return False
    return True


# Main function to orchestrate the download process
def download_resources(resources, main_url, version, json_name, use_multi_connection, num_connections, max_concurrent_files):
    # Base directory for successful downloads
    base_dir = os.path.join(os.getcwd(), 'download', version)
    # Directory for the failed-downloads log
    failed_dir = os.path.join(os.getcwd(), 'failed', version)
    os.makedirs(failed_dir, exist_ok=True)

    failed_resources = []

    # Use a ThreadPoolExecutor to handle multiple concurrent file downloads
    with ThreadPoolExecutor(max_workers=max_concurrent_files) as executor:
        futures = [
            executor.submit(
                download_and_verify_file, resource, main_url, base_dir, failed_resources, use_multi_connection, num_connections
            )
            for resource in resources
        ]
        # Wait for all downloads to complete
        for future in tqdm(futures, desc="Downloading files"):
            future.result()

    # Save any resources that failed verification to a JSON file
    if failed_resources:
        failed_json_path = os.path.join(failed_dir, json_name)
        with open(failed_json_path, 'w') as f:
            json.dump({'resource': failed_resources}, f, indent=4)
        print(f"Failed resources logged in {failed_json_path}")


# Function to load the resource manifest from either a URL or a local file
def load_resources(json_url=None, json_file=None):
    if json_url:
        response = requests.get(json_url)
        return response.json(), os.path.basename(json_url)
    elif json_file:
        with open(json_file, 'r') as f:
            return json.load(f), os.path.basename(json_file)
    else:
        raise ValueError("Either json_url or json_file must be provided.")
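
# The manifest is expected to look roughly like the following sketch (inferred
# from how 'resource', 'dest', and 'md5' are used in this script; the exact
# fields depend on whoever publishes the manifest):
#
# {
#     "resource": [
#         {"dest": "path/to/file.bin", "md5": "d41d8cd98f00b204e9800998ecf8427e"},
#         {"dest": "another/file.dat", "md5": "0cc175b9c0f1b6a831c399e269772661"}
#     ]
# }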


if __name__ == "__main__":
    # Input parameters
    source_type = input("Enter the source of the JSON (url/file): ").strip().lower()
    if source_type == "url":
        json_url = input("Enter the URL of the JSON file: ").strip()
        resources, json_name = load_resources(json_url=json_url)
    elif source_type == "file":
        json_file = input("Enter the path to the JSON file: ").strip()
        resources, json_name = load_resources(json_file=json_file)
    else:
        print("Invalid input. Please enter 'url' or 'file'.")
        exit(1)

    main_url = input("Enter the main path URL (e.g., https://example.com/zip): ").strip()
    version = input("Enter the version to be used as a folder name (e.g., 0.9.0): ").strip()

    # Option to use multi-connection download for a single file; the number of
    # connections is only asked for when the option is enabled
    use_multi_connection = input("Use multi-connection download for a single file? (y/n): ").strip().lower() == 'y'
    num_connections = 1
    if use_multi_connection:
        num_connections = int(input("Enter the number of connections per file: ").strip())

    # Number of files to download concurrently
    max_concurrent_files = int(input("Enter the maximum number of files to download concurrently: ").strip())

    # Start the download process
    download_resources(resources['resource'], main_url, version, json_name, use_multi_connection=use_multi_connection, num_connections=num_connections, max_concurrent_files=max_concurrent_files)
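
# A typical interactive session might look like this (all values below are
# illustrative placeholders, not real endpoints):
#
#   Enter the source of the JSON (url/file): file
#   Enter the path to the JSON file: resource.json
#   Enter the main path URL (e.g., https://example.com/zip): https://example.com/zip
#   Enter the version to be used as a folder name (e.g., 0.9.0): 0.9.0
#   Use multi-connection download for a single file? (y/n): y
#   Enter the number of connections per file: 8
#   Enter the maximum number of files to download concurrently: 4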