
Commit

many changes and build out website
henrypinkard committed Dec 15, 2023
1 parent 8942b13 commit 5ebe1ed
Showing 15 changed files with 506 additions and 275 deletions.
205 changes: 97 additions & 108 deletions Getting_started.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions bsccm/__init__.py
@@ -5,6 +5,7 @@
__email__ = '[email protected]'

from bsccm.bsccm import BSCCM
+from bsccm.bsccm import download_dataset
from bsccm.phase.util import *
from bsccm.phase.functional_dpc import *
from ._version import __version__, version_info
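With this re-export in place, the downloader becomes importable directly from the package root alongside the dataset class — a minimal sketch, not part of the commit:

from bsccm import BSCCM, download_dataset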
2 changes: 1 addition & 1 deletion bsccm/_version.py
@@ -1,2 +1,2 @@
-version_info = (0, 3, 0)
+version_info = (1, 0, 0)
__version__ = ".".join(map(str, version_info))
201 changes: 123 additions & 78 deletions bsccm/bsccm.py
@@ -9,99 +9,143 @@
import io
from tqdm import tqdm
import shutil
+import requests


-class _ProgressFileObject(io.FileIO):
-    def __init__(self, path, *args, **kwargs):
-        self._total_size = os.path.getsize(path)
-        io.FileIO.__init__(self, path, *args, **kwargs)

-    def read(self, size):
-        print("Extracting: {:.2f}%\r".format(self.tell() / self._total_size * 100), end="")
-        return io.FileIO.read(self, size)


-def _combine_and_extract_chunks(chunk_dir, chunk_size=2**30):
-    # get the number of chunks in the directory
-    chunk_files = [f for f in os.listdir(chunk_dir) if f.endswith('.bin')]

-    # Combine chunks
-    total_chunks = len(chunk_files)
-    total_size = total_chunks * chunk_size

-    # get parent directory
-    parent_dir = os.path.dirname(chunk_dir.rstrip('/')) + os.sep
-    if not chunk_dir.endswith(os.sep):
-        chunk_dir += os.sep

-    print('Combining {} chunks into {}'.format(total_chunks, parent_dir + 'combined.tar.gz'))
-    with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
-        with open(parent_dir + 'combined.tar.gz', 'wb') as out_file:
-            for i in range(total_chunks):
-                with open(chunk_dir + 'chunk{:05d}.bin'.format(i), 'rb') as in_file:
-                    shutil.copyfileobj(in_file, out_file)
-                pbar.update(chunk_size)

-    # Unzip
-    extract_dir = chunk_dir.split('_chunks')[0]
-    os.mkdir(extract_dir)
-    tar = tarfile.open(fileobj=_ProgressFileObject(parent_dir + 'combined.tar.gz'))
-    tar.extractall(path=extract_dir)
-    tar.close()

-    # Delete chunks
-    print('Cleaning up')
-    shutil.rmtree(chunk_dir)


-def download_data(mnist=True, coherent=False, tiny=False):
-    """
-    Download one of the 6 possible versions of BSCCM dataset
-
-    mnist: download BSCCMNIST (downsized and downsampled version of BSCCM)
-    coherent: download BSCCM-coherent or BSCCM-coherent-tiny
-    tiny: the tiny version or the full version
-    """
-    location = '/home/hpinkard_waller/2tb_ssd/'
-    doi_url = 'doi%3A10.5061%2Fdryad.9pg8d'
-    version_index = -1
-    file_index = 1

-    # Get the version ID of the dataset
-    api_url = "https://datadryad.org/api/v2/"
-    versions = requests.get(api_url + 'datasets/{}/versions'.format(doi_url))
-    version_id = versions.json()['_embedded']['stash:versions'][version_index]['_links']['self']['href'].split('/')[version_index]

-    # Get the URL to download one particular file
-    file = requests.get(api_url + 'versions/' + version_id + '/files').json()['_embedded']['stash:files'][file_index]
-    file_name = file['path']
-    download_url = 'https://datadryad.org' + file['_links']['stash:download']['href']

-    # Download in chunks (so that really big files can be downloaded)
-    chunk_size = 1024 * 1024 * 8
-    iters = file['size'] / chunk_size
-    with requests.get(download_url, stream=True) as r:
-        r.raise_for_status()
-        with open(location + file_name, 'wb') as f:
-            for i, chunk in enumerate(r.iter_content(chunk_size=chunk_size)):
-                print('Downloading {}, {:.1f}%\r'.format(file_name, 100 * i / iters), end='')
-                f.write(chunk)
-    print('Finished downloading')

-    loc = location + file_name[:-7]  # remove .tar.gz
-    print('Extracting to {}...'.format(loc))
-    file = tarfile.open(location + file_name)
-    file.extractall(loc)
-    file.close()
-    print('Cleaning up')
-    os.remove(location + file_name)
-    print('Complete')
+def download_dataset(location='.', coherent=False, tiny=True, mnist=False, token=None):
+    """
+    Downloads the BSCCM dataset to the specified location.
+    If location is not specified, the current directory is used.
+
+    Args:
+        location (str): The location to download the dataset to.
+        coherent (bool): Whether to download the coherent (single LED illumination patterns) version of the dataset.
+        tiny (bool): Whether to download the tiny version of the dataset, a subsample of the full dataset.
+        mnist (bool): Whether to download the version of the dataset with MNIST sized images.
+        token: (Debugging only) for accessing versions of the dataset not yet released on Dryad.
+
+    Returns:
+        The path to the downloaded dataset.
+    """
+    # add trailing slash if not there
+    if location[-1] != os.sep:
+        location += os.sep
+
+    dataset_name = 'BSCCM' if not mnist else 'BSCCMNIST'
+    if coherent:
+        dataset_name += '-coherent'
+    if tiny:
+        dataset_name += '-tiny'
+    dataset_name += '.tar.gz'
+
+    doi = 'doi%3A10.5061%2Fdryad.sxksn038s'
+    base_url = "https://datadryad.org"
+
+    # Set up the headers
+    headers = { "Authorization": f"Bearer {token}"} if token is not None else None
+
+    versions = requests.get(base_url + f'/api/v2/datasets/{doi}/versions', headers=headers)
+    version_id = versions.json()['_embedded']['stash:versions'][-1]['_links']['self']['href'].split('/')[-1]
+
+    # Function to get all files, handling pagination
+    def get_all_files(version_id):
+        all_files = []
+        url = base_url + '/api/v2/versions/' + version_id + '/files'
+        while url:
+            print(f'Fetching file metadata {len(all_files)}...', end='\r')
+            response = requests.get(url, headers=headers)
+
+            # Check if the response status code indicates success
+            if response.status_code != 200:
+                print(f"Failed to fetch data: {response.status_code}")
+                break
+
+            # Try to decode JSON only if the response contains content
+            if response.content:
+                data = response.json()
+                all_files.extend(data['_embedded']['stash:files'])
+                links = data.get('_links', {})
+                next_link = links.get('next', {}).get('href')
+
+                if next_link:
+                    if next_link.startswith('/'):
+                        url = base_url + next_link
+                    else:
+                        url = next_link
+                else:
+                    url = None
+            else:
+                print("No content in response")
+                break
+
+        return all_files
+
+    files = get_all_files(version_id)
+
+    # find files relevant to this dataset
+    files = [f for f in files if dataset_name in f['path']]
+
+    download_chunk_size = 1024 * 1024 * 8  # 8 MB
+    total_size = sum(f['size'] for f in files)
+
+    # Create a tqdm progress bar for the total download progress
+    print(f'Downloading...')
+    with tqdm(total=total_size, desc='Total Download Progress', unit='B', unit_scale=True, unit_divisor=1024) as progress_bar:
+        for k, file_info in enumerate(files):
+            download_url = 'https://datadryad.org' + file_info['_links']['stash:file-download']['href']
+            with requests.get(download_url, stream=True, headers=headers) as r:
+                r.raise_for_status()
+                with open(location + file_info['path'], 'wb') as file:
+                    for chunk in r.iter_content(chunk_size=download_chunk_size):
+                        if chunk:  # filter out keep-alive new chunks
+                            file.write(chunk)
+                            # Update the progress bar by the size of the chunk
+                            progress_bar.update(len(chunk))
+
+    # get all file names
+    chunks = [f['path'] for f in files]
+    # organize alphabetically
+    chunks.sort()
+
+    # Recombine the chunks into a single file
+    combined_file_name = chunks[0].split('_chunk')[0]
+    with open(location + combined_file_name, 'wb') as combined_file:
+        for chunk in tqdm(chunks, desc='Combining File chunks'):
+            with open(location + chunk, 'rb') as file_part:
+                combined_file.write(file_part.read())
+
+    # Extract the tar.gz file
+    with tarfile.open(location + combined_file_name) as file:
+        # Create a tqdm progress bar without a total
+        members = []
+        with tqdm(desc='Reading compressed files', unit=' files') as progress_bar:
+            # Iterate over each member
+            for member in file:
+                members.append(member)
+                # Update the progress bar for each member
+                progress_bar.update(1)
+
+    # Now extract the files
+    loc = location + combined_file_name[:-7]  # Remove .tar.gz for the extraction location
+    print('Decompressing to {}...'.format(loc))
+    with tarfile.open(location + combined_file_name) as file:
+        for member in tqdm(members, desc='Extracting Files', unit='file'):
+            file.extract(member, loc)
+
+    print('Cleaning up')
+    os.remove(location + combined_file_name)
+    for chunk in chunks:
+        os.remove(location + chunk)
+    print('Complete')
+
+    return loc

class BSCCM:

@@ -156,6 +200,7 @@ def read_image(self, index, channel, copy=False, convert_histology_rgb32=True):
        Returns:
            numpy.ndarray: The image as a numpy array.
        """
+        index = int(index)
        if index not in self.index_dataframe.index:
            raise Exception('{} is not a valid index into this dataset. Try using .get_indices to find a valid index'.format(index))
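
Taken together, the new download_dataset function and the BSCCM reader changes above suggest a workflow along these lines. This is a hedged sketch rather than part of the commit: the download directory is a hypothetical example, the BSCCM constructor is assumed to accept the path returned by download_dataset, get_indices is assumed to return valid indices (as referenced in the read_image error message), and the channel name is a placeholder.

from bsccm import BSCCM, download_dataset

# Download the tiny variant of the dataset to an example directory
data_path = download_dataset(location='/tmp/bsccm', tiny=True)

# Open the dataset and read a single image (constructor, get_indices usage,
# and the channel name are assumptions for illustration)
dataset = BSCCM(data_path)
indices = dataset.get_indices()
img = dataset.read_image(indices[0], channel='LED119')
print(img.shape)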

55 changes: 0 additions & 55 deletions bsccm/creation/dryad_upload.ipynb
@@ -1,60 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"def download_data(mnist=True, coherent=False, tiny=False):\n",
" \"\"\"\n",
" Download one of the 6 possible versions of BSCCM dataset\n",
" \n",
" mnist: download BSCCMNIST (downsized and downsampled version of BSCCM)\n",
" coherent: download BSCCM-coherent or BSCCM-coherent-tiny\n",
" tiny: the tiny version or the full version\n",
" \"\"\"\n",
"\n",
"\n",
" location = '/home/hpinkard_waller/2tb_ssd/'\n",
" doi_url = 'doi%3A10.5061%2Fdryad.9pg8d'\n",
" version_index = -1\n",
" file_index = 1\n",
"\n",
" # Get the version ID of the dataset\n",
" api_url = \"https://datadryad.org/api/v2/\"\n",
" versions = requests.get(api_url + 'datasets/{}/versions'.format(doi_url))\n",
" version_id = versions.json()['_embedded']['stash:versions'][version_index]['_links']['self']['href'].split('/')[version_index]\n",
"\n",
" # Get the URL to download one particular file\n",
" file = requests.get(api_url + 'versions/' + version_id + '/files').json()['_embedded']['stash:files'][file_index]\n",
" file_name = file['path']\n",
" download_url = 'https://datadryad.org' + file['_links']['stash:download']['href']\n",
"\n",
" # Download in chunks (so that really big files can be downloaded)\n",
" chunk_size = 1024 * 1024 * 8\n",
" iters = file['size'] / chunk_size\n",
" with requests.get(download_url, stream=True) as r:\n",
" r.raise_for_status()\n",
" with open(location + file_name, 'wb') as f:\n",
" for i, chunk in enumerate(r.iter_content(chunk_size=chunk_size)): \n",
" print('Downloading {}, {:.1f}%\\r'.format(file_name, 100 * i / iters ), end='')\n",
" f.write(chunk)\n",
" print('Finished downloading')\n",
"\n",
"\n",
" loc = location + file_name[:-7] #remove .tar.gz\n",
" print('Extracting to {}...'.format(loc))\n",
" file = tarfile.open(location + file_name)\n",
" file.extractall(loc)\n",
" file.close()\n",
" print('Cleaning up')\n",
" os.remove(location + file_name)\n",
" print('Complete')"
]
},
{
"cell_type": "markdown",
"metadata": {},
Binary file added dataset_variants_table.png
Binary file added fig1.png
Binary file added fig2.png
180 changes: 152 additions & 28 deletions figure_making/animation_maker.ipynb

Large diffs are not rendered by default.

Binary file not shown.

0 comments on commit 5ebe1ed
