Clean up DiffuserCam dataset upload script. #112

Merged · 17 commits · Feb 8, 2024

Changes from all commits
4 changes: 2 additions & 2 deletions configs/upload_diffusercam_huggingface.yaml

@@ -6,8 +6,8 @@ hydra:
 repo_id: "bezzam/DiffuserCam-Lensless-Mirflickr-Dataset"
 dir_diffuser: "/scratch/bezzam/DiffuserCam_mirflickr/dataset/diffuser_images"
 dir_lensed: "/scratch/bezzam/DiffuserCam_mirflickr/dataset/ground_truth_lensed"
-psf_fp: "data/psf/diffusercam_psf.tiff"
+psf_fp: "/home/bezzam/LenslessPiCam/data/psf/diffusercam_psf.tiff"
 hf_token: null
 file_ext: ".npy"
 n_files: null
-n_jobs: 4 # for parallelizing conversion to PNG
+n_jobs: 8 # for parallelizing conversion to PNG
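Note: since this is a Hydra config (see the `hydra:` key in the hunk header), these fields can also be overridden on the command line instead of editing the YAML. A minimal sketch of how a Hydra entry point would pick these values up — the decorator layout and relative `config_path` here are assumptions for illustration, not the actual script:

```python
# Sketch only: assumed Hydra wiring for reading this config.
# The paths in the YAML above are machine-specific placeholders.
import hydra
from omegaconf import DictConfig


@hydra.main(version_base=None, config_path="../configs", config_name="upload_diffusercam_huggingface")
def main(config: DictConfig):
    print(config.repo_id)  # target Hugging Face dataset repo
    print(config.psf_fp)   # PSF file to convert and upload
    print(config.n_jobs)   # workers for the PNG conversion


if __name__ == "__main__":
    main()
```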
2 changes: 1 addition & 1 deletion docs/requirements.txt

@@ -1,4 +1,4 @@
-sphinx==4.0.1
+sphinx==5.0.2
 sphinx_rtd_theme==0.4.3
 docutils==0.16 # >0.17 doesn't render bullets
 numpy>=1.22 # so that default dtypes are correctly rendered
19 changes: 5 additions & 14 deletions docs/source/data.rst

@@ -53,21 +53,12 @@ They both correspond to the PSF which can be found `here <https://drive.switch.c
 distance of 2 mm).


-DiffuserCam Lensless Mirflickr Dataset (DLMD)
----------------------------------------------
+DiffuserCam Lensless Mirflickr Dataset
+--------------------------------------

-You can download a subset for the `DiffuserCam Lensless Mirflickr
-Dataset <https://waller-lab.github.io/LenslessLearning/dataset.html>`__
-that we've prepared
-`here <https://drive.switch.ch/index.php/s/vmAZzryGI8U8rcE>`__ with
-``scripts/prepare_mirflickr_subset.py``. The original dataset is quite
-large (25000 files, 100 GB). So we've prepared a more manageable
-dataset (200 files, 725 MB). It was prepared with the following script:
-
-.. code:: bash
-
-   python scripts/prepare_mirflickr_subset.py \
-       --data ~/Documents/DiffuserCam/DiffuserCam_Mirflickr_Dataset
+The original dataset is available `here <https://waller-lab.github.io/LenslessLearning/dataset.html>`__.
+However, it is quite large (100 GB). We've prepared a more manageable (6GB)
+and viewable version on `Hugging Face <https://huggingface.co/datasets/bezzam/DiffuserCam-Lensless-Mirflickr-Dataset>`__.


 3D data
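For reference, the Hugging Face version referenced in the new docs can be loaded directly with the `datasets` library. A short sketch — the column and split names come from the upload script later in this PR:

```python
# Sketch: load the "test" split of the dataset linked above.
from datasets import load_dataset

ds = load_dataset("bezzam/DiffuserCam-Lensless-Mirflickr-Dataset", split="test")
pair = ds[0]
lensless, lensed = pair["lensless"], pair["lensed"]  # decoded as PIL images
```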
8 changes: 5 additions & 3 deletions lensless/utils/io.py

@@ -536,17 +536,19 @@ def load_data(
     return psf, data


-def save_image(img, fp, max_val=255):
+def save_image(img, fp, max_val=255, normalize=True):
     """Save as uint8 image."""

     img_tmp = img.copy()

-    if img_tmp.dtype == np.uint16:
+    if img_tmp.dtype == np.uint16 or img_tmp.dtype == np.uint8:
         img_tmp = img_tmp.astype(np.float32)

-    if img_tmp.dtype == np.float64 or img_tmp.dtype == np.float32:
+    if normalize:
         img_tmp -= img_tmp.min()
         img_tmp /= img_tmp.max()
+
+    if img_tmp.dtype == np.float64 or img_tmp.dtype == np.float32:
         img_tmp *= max_val
         img_tmp = img_tmp.astype(np.uint8)
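On the new `normalize` flag: with `normalize=True` (the default, which matches the old behavior) the image is min-max stretched to the full [0, 255] range before casting to uint8; with `normalize=False`, float input is assumed to already lie in [0, 1] and is only scaled by `max_val`. A small usage sketch with placeholder file names:

```python
import numpy as np
from lensless.utils.io import save_image

img = np.random.rand(32, 32, 3).astype(np.float32) * 0.5  # floats in [0, 0.5]

save_image(img, "stretched.png")                        # min-max stretch to [0, 255]
save_image(img, "original_scale.png", normalize=False)  # just scale by max_val=255
```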
70 changes: 46 additions & 24 deletions scripts/data/upload_diffusercam_huggingface.py

@@ -22,7 +22,6 @@
 import glob
 from lensless.utils.io import save_image
 import cv2
-import PIL
 from datasets import Dataset, DatasetDict, Image
 from huggingface_hub import upload_file
 from lensless.utils.dataset import natural_sort

@@ -44,17 +43,12 @@ def upload_dataset(config):
     file_ext = config.file_ext
     n_files = config.n_files
     n_jobs = config.n_jobs
+    normalize = False

     assert hf_token is not None, "Please provide a HuggingFace token."

     start_time = time.time()

-    # load PSF, convert to RGB, save as PNG
-    psf_img = np.array(PIL.Image.open(psf_fp))
-    psf_img = cv2.cvtColor(psf_img, cv2.COLOR_BGR2RGB)  # convert to RGB
-    psf_fp_png = psf_fp.replace(".tiff", ".png")
-    save_image(psf_img, psf_fp_png)
-
     # get all lensless-lensed pairs
     files_diffuser = glob.glob(os.path.join(dir_diffuser, "*" + file_ext))
     files_lensed = glob.glob(os.path.join(dir_lensed, "*" + file_ext))

@@ -69,50 +63,78 @@ def upload_dataset(config):
         print(f"Only keeping {n_files} files...")
         common_files = common_files[:n_files]

+    # load PSF, convert to RGB, save as PNG
+    # psf_img = np.array(PIL.Image.open(psf_fp))
+    psf_img = cv2.imread(psf_fp, cv2.IMREAD_UNCHANGED)
+    psf_img = cv2.cvtColor(psf_img, cv2.COLOR_BGR2RGB)  # convert to RGB
+    psf_fp_png = psf_fp.replace(".tiff", ".png")
+    save_image(psf_img, psf_fp_png, normalize=True)  # need normalize=True
+
     # save as PNG
     dir_diffuser_png = dir_diffuser.replace("diffuser_images", "diffuser_png")
     os.makedirs(dir_diffuser_png, exist_ok=True)
     dir_lensed_png = dir_lensed.replace("ground_truth_lensed", "lensed_png")
     os.makedirs(dir_lensed_png, exist_ok=True)
-    diffuser_png_files = []
-    lensed_png_files = []

     # -- parallelize with joblib
     def save_png(f, dir_diffuser, dir_diffuser_png, dir_lensed, dir_lensed_png):

         diffuser_img = np.load(os.path.join(dir_diffuser, f))
         diffuser_img = cv2.cvtColor(diffuser_img, cv2.COLOR_BGR2RGB)  # convert to RGB
         diffuser_fn = os.path.join(dir_diffuser_png, f.replace(file_ext, ".png"))
-        diffuser_png_files.append(diffuser_fn)
-        save_image(diffuser_img, diffuser_fn)
+        save_image(diffuser_img, diffuser_fn, normalize=normalize)

         lensed_img = np.load(os.path.join(dir_lensed, f))
         lensed_img = cv2.cvtColor(lensed_img, cv2.COLOR_BGR2RGB)  # convert to RGB
         lensed_fn = os.path.join(dir_lensed_png, f.replace(file_ext, ".png"))
-        lensed_png_files.append(lensed_fn)
-        save_image(lensed_img, lensed_fn)
+        save_image(lensed_img, lensed_fn, normalize=normalize)

     Parallel(n_jobs=n_jobs)(
         delayed(save_png)(f, dir_diffuser, dir_diffuser_png, dir_lensed, dir_lensed_png)
         for f in tqdm(common_files)
     )

+    # get file paths
+    diffuser_files = [
+        os.path.join(dir_diffuser_png, f.replace(file_ext, ".png")) for f in common_files
+    ]
+    lensed_files = [os.path.join(dir_lensed_png, f.replace(file_ext, ".png")) for f in common_files]
+    diffuser_files = natural_sort(diffuser_files)
+    lensed_files = natural_sort(lensed_files)
+
     # step 1: create Dataset objects
-    dataset = Dataset.from_dict(
-        {
-            "lensless": diffuser_png_files,
-            "lensed": lensed_png_files,
-        }
-    )
-    dataset = dataset.cast_column("lensless", Image())
-    dataset = dataset.cast_column("lensed", Image())
+    def create_dataset(diffuser_files, lensed_files):
+        dataset = Dataset.from_dict(
+            {
+                "lensless": diffuser_files,
+                "lensed": lensed_files,
+            }
+        )
+        dataset = dataset.cast_column("lensless", Image())
+        dataset = dataset.cast_column("lensed", Image())
+        return dataset
+
+    # according to the original split, test files are up to idx=1000 (for some reason im1 is missing)
+    test_dataset = create_dataset(diffuser_files[:999], lensed_files[:999])
+    train_dataset = create_dataset(diffuser_files[999:], lensed_files[999:])

     # step 2: create DatasetDict
     dataset_dict = DatasetDict(
         {
-            "all": dataset,
+            "train": train_dataset,
+            "test": test_dataset,
         }
     )

     # step 3: push to hub
+    upload_file(
+        path_or_fileobj=psf_fp,
+        path_in_repo="psf.tiff",
+        repo_id=repo_id,
+        repo_type="dataset",
+        token=hf_token,
+    )
+
+    # -- dataset
     dataset_dict.push_to_hub(
         repo_id,

@@ -126,14 +148,14 @@ def save_png(f, dir_diffuser, dir_diffuser_png, dir_lensed, dir_lensed_png):
         token=hf_token,
     )
     upload_file(
-        path_or_fileobj=diffuser_png_files[0],
+        path_or_fileobj=diffuser_files[0],
         path_in_repo="lensless_example.png",
         repo_id=repo_id,
         repo_type="dataset",
         token=hf_token,
     )
     upload_file(
-        path_or_fileobj=lensed_png_files[0],
+        path_or_fileobj=lensed_files[0],
         path_in_repo="lensed_example.png",
         repo_id=repo_id,
         repo_type="dataset",
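A note on why the `diffuser_png_files.append(...)` calls inside `save_png` were dropped and the file paths rebuilt afterwards: joblib's default backend runs workers in separate processes, so appends to a list defined in the parent never propagate back, and those lists would have stayed empty. A minimal illustration of the pitfall (standalone sketch, not from the repo):

```python
from joblib import Parallel, delayed

results = []

def work(i):
    results.append(i)  # mutates a copy inside the worker process

Parallel(n_jobs=2)(delayed(work)(i) for i in range(4))
print(results)  # [] -- with the default process-based (loky) backend
```

Rebuilding the paths from `common_files` after the parallel loop, as the PR does, sidesteps this entirely.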