diff --git a/preprocessing/README.md b/preprocessing/README.md
new file mode 100644
index 0000000..2869519
--- /dev/null
+++ b/preprocessing/README.md
@@ -0,0 +1,5 @@
+### Preprocessing
+
+pre_CT_MR.py is example code that preprocesses NIfTI files into PNG files.
+
+Once the images and masks have been preprocessed into PNG files in the train, train_mask, test, and test_mask folders, run create-custom-coco-dataset_standard.py to create the JSON annotation file used for data loading.
diff --git a/preprocessing/create-custom-coco-dataset_standard.py b/preprocessing/create-custom-coco-dataset_standard.py
new file mode 100644
index 0000000..d5d7eba
--- /dev/null
+++ b/preprocessing/create-custom-coco-dataset_standard.py
@@ -0,0 +1,147 @@
+import glob
+import json
+import os
+
+from tqdm import tqdm
+
+from create_annotations import *
+
+
+# Path to the dataset. The train, train_mask, test, and test_mask folders
+# should live under this folder.
+targetpath = '/storage/data/MedSegmentation/BiomedSeg/amos22'
+
+image_size = 1024
+
+
+### Load the Biomed label base
+# Path to the predefined label base
+with open(os.path.join('/storage/data/MedSegmentation/BiomedSeg/', 'label_base.json'), 'r') as f:
+    label_base = json.load(f)
+
+
+# Map every name (including child names) to its parent class id
+parent_class = {}
+for i in label_base:
+    subnames = [label_base[i]['name']] + label_base[i].get('child', [])
+    for label in subnames:
+        parent_class[label] = int(i)
+
+# Label ids of the dataset
+category_ids = {label_base[i]['name']: int(i) for i in label_base if 'name' in label_base[i]}
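+
+# For reference, a minimal sketch of the label_base.json structure assumed by
+# the lookups above (the ids and names here are hypothetical, not the actual
+# label base):
+#
+#     {
+#         "2": {"name": "kidney", "child": ["right kidney", "left kidney"]},
+#         "6": {"name": "liver"}
+#     }
+#
+# With the first entry, parent_class maps "kidney", "right kidney", and
+# "left kidney" all to id 2, while category_ids contains only {"kidney": 2}.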
+
+
+# Get "images" and "annotations" info
+def images_annotations_info(maskpath, keyword):
+    imagepath = maskpath.replace('_mask', '')
+    # These ids are increased automatically as we go
+    annotation_id = 0
+    sent_id = 0
+    ref_id = 0
+
+    annotations = []
+    images = []
+    image_to_id = {}
+    n_total = len(glob.glob(maskpath + "*.png"))
+    n_errors = 0
+
+    def extra_annotation(ann, file_name, target):
+        nonlocal sent_id, ref_id
+        ann['file_name'] = file_name
+        ann['split'] = keyword
+
+        ### modality
+        mod = file_name.split('.')[0].split('_')[-2]
+        ### site
+        site = file_name.split('.')[0].split('_')[-1]
+
+        task = {'target': target, 'modality': mod, 'site': site}
+        if 'T1' in mod or 'T2' in mod or 'FLAIR' in mod or 'ADC' in mod:
+            task['modality'] = 'MRI'
+            if 'MRI' not in mod:
+                task['sequence'] = mod
+            else:
+                task['sequence'] = mod[4:]
+
+        prompts = [f'{target} in {site} {mod}']
+
+        ann['sentences'] = []
+        for p in prompts:
+            ann['sentences'].append({'raw': p, 'sent': p, 'sent_id': sent_id})
+            sent_id += 1
+        ann['sent_ids'] = [s['sent_id'] for s in ann['sentences']]
+
+        ann['ann_id'] = ann['id']
+        ann['ref_id'] = ref_id
+        ref_id += 1
+
+        return ann
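+
+    # Mask filenames produced by pre_CT_MR.py follow the pattern
+    # {case}_{slice}_{modality}_{anatomy}_{target}.png. For a hypothetical
+    # file "amos_0001_42_CT_abdomen_left+kidney.png", the target is
+    # "left kidney", the referenced image is "amos_0001_42_CT_abdomen.png",
+    # and extra_annotation parses mod='CT' and site='abdomen' from it.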
+    for mask_image in tqdm(glob.glob(maskpath + "*.png")):
+        # Both the mask image and the original image are *.png files.
+        # We make a reference to the original file in the COCO JSON file.
+        filename_parsed = os.path.basename(mask_image).split("_")
+        target_name = filename_parsed[-1].split(".")[0].replace("+", " ")
+
+        original_file_name = "_".join(filename_parsed[:-1]) + ".png"
+
+        if original_file_name not in os.listdir(imagepath):
+            print("Original file not found: {}".format(original_file_name))
+            n_errors += 1
+            continue
+
+        if original_file_name not in image_to_id:
+            image_to_id[original_file_name] = len(image_to_id)
+
+            # "images" info
+            image_id = image_to_id[original_file_name]
+            image = create_image_annotation(original_file_name, image_size, image_size, image_id)
+            images.append(image)
+
+        annotation = {
+            "mask_file": os.path.basename(mask_image),
+            "iscrowd": 0,
+            "image_id": image_to_id[original_file_name],
+            "category_id": parent_class[target_name],
+            "id": annotation_id,
+        }
+
+        annotation = extra_annotation(annotation, original_file_name, target_name)
+
+        annotations.append(annotation)
+        annotation_id += 1
+
+    print(f"Number of errors in conversion: {n_errors}/{n_total}")
+    return images, annotations, annotation_id
+
+
+if __name__ == "__main__":
+    # Get the standard COCO JSON format
+    coco_format = get_coco_json_format()
+
+    for keyword in ['train', 'test']:
+        mask_path = os.path.join(targetpath, "{}_mask/".format(keyword))
+
+        # Create category section
+        coco_format["categories"] = create_category_annotation(category_ids)
+
+        # Create images and annotations sections
+        coco_format["images"], coco_format["annotations"], annotation_cnt = images_annotations_info(mask_path, keyword)
+
+        # Post-process: drop images that ended up with no annotations.
+        # Rebuilding the list avoids mutating it while iterating over it.
+        images_with_ann = set()
+        for ann in coco_format['annotations']:
+            images_with_ann.add(ann['file_name'])
+        coco_format['images'] = [im for im in coco_format['images'] if im["file_name"] in images_with_ann]
+
+        with open(os.path.join(targetpath, "{}.json".format(keyword)), "w") as outfile:
+            json.dump(coco_format, outfile)
+
+        print("Created %d annotations for %d images in folder: %s" % (annotation_cnt, len(coco_format['images']), mask_path))
diff --git a/preprocessing/pre_CT_MR.py b/preprocessing/pre_CT_MR.py
new file mode 100755
index 0000000..03532ee
--- /dev/null
+++ b/preprocessing/pre_CT_MR.py
@@ -0,0 +1,281 @@
+#%% import packages
+import os
+
+import matplotlib.pyplot as plt
+import nibabel as nib
+import numpy as np
+import pandas as pd
+from skimage import transform
+from tqdm import tqdm
+
+
+# Convert 3D NIfTI volumes to 2D PNG files, including images and corresponding masks
+data_name = 'amos22'
+anatomy = 'abdomen'
+
+# CT intensity windows (in Hounsfield units) per anatomy
+CT_WINDOWS = {'liver': [-25, 230],
+              'abdomen': [-150, 250],
+              'colon': [-68, 187],
+              'pancreas': [-100, 200],
+              'lung': [-1000, 1000]}
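+
+# Windowing example: with the abdomen window [-150, 250], HU values are first
+# clipped to that range and then min-max scaled to [0, 255], so -150 HU maps
+# to 0, 250 HU maps to 255, and 50 HU maps to roughly 128.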
+
+labels = {"1": "spleen", "2": "right kidney", "3": "left kidney", "4": "gallbladder",
+          "5": "esophagus", "6": "liver", "7": "stomach", "8": "aorta", "9": "postcava",
+          "10": "pancreas", "11": "right adrenal gland", "12": "left adrenal gland",
+          "13": "duodenum", "14": "bladder", "15": "prostate/uterus"}
+
+# Pairs of left/right label ids that are additionally merged into a single class
+merge_classes = {(2, 3): 'kidney', (11, 12): 'adrenal gland'}
+
+### Dataset-specific settings
+split_names = {'Tr': 'train', 'Va': 'test'}
+
+# Records of image and mask statistics
+splits = []
+fnames = []
+original_shapes = []
+mask_areas = []
+mask_bboxes = []
+intersect_portion = []
+
+
+image_size = 1024
+# Label ids that are excluded
+remove_label_ids = [15]  # remove prostate/uterus since the name is not specific
+
+basepath = '/storage/data/MedSegmentation/'
+datapath = os.path.join(basepath, data_name)
+basetarget = '/storage/data/MedSegmentation/BiomedSeg/'
+targetpath = os.path.join(basetarget, data_name)
+
+# Make sure the output folders exist before plt.imsave writes into them
+for modality_dir in ['CT', 'MRI']:
+    for split_dir in ['train', 'train_mask', 'test', 'test_mask']:
+        os.makedirs(os.path.join(targetpath, modality_dir, split_dir), exist_ok=True)
+
+
+def get_mask_stats(mask):
+    masked_points = np.nonzero(mask)
+    area = len(masked_points[0])
+    min_x = np.min(masked_points[0])
+    max_x = np.max(masked_points[0])
+    min_y = np.min(masked_points[1])
+    max_y = np.max(masked_points[1])
+    bbox = [min_x, min_y, max_x - min_x, max_y - min_y]
+    return area, bbox
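+
+# Example (hypothetical input): for a mask whose foreground pixels span rows
+# 4-5 and columns 7-9, get_mask_stats returns area=6 and bbox=[4, 7, 1, 2],
+# i.e. [min_x, min_y, max_x - min_x, max_y - min_y] in array coordinates.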
+
+
+# Preprocessing
+for split in split_names.keys():
+    files = os.listdir(os.path.join(datapath, f'images{split}'))
+    label_files = os.listdir(os.path.join(datapath, f'labels{split}'))
+    for file in tqdm(files):
+        if '.nii.gz' not in file:
+            continue
+
+        if file not in label_files:
+            continue
+
+        target_split = split_names[split]
+        # Move part of the validation cases into the training split
+        extra_train = int(file[5:9]) <= 228 or (int(file[5:9]) > 500 and int(file[5:9]) <= 556)
+        if target_split == 'test' and extra_train:
+            target_split = 'train'
+
+        ### Image info: in AMOS, case ids up to 500 are CT, the rest are MRI
+        if int(file[5:9]) <= 500:
+            modality = 'CT'
+        else:
+            modality = 'MRI'
+
+        f = file.split('.')[0]
+
+        # Load image and preprocess
+        image_data = nib.load(os.path.join(datapath, f'images{split}', file)).get_fdata()
+
+        # NIfTI preprocessing starts
+        if modality == "CT":
+            lower_bound = CT_WINDOWS['abdomen'][0]
+            upper_bound = CT_WINDOWS['abdomen'][1]
+            image_data_pre = np.clip(image_data, lower_bound, upper_bound)
+            image_data_pre = (
+                (image_data_pre - np.min(image_data_pre))
+                / (np.max(image_data_pre) - np.min(image_data_pre))
+                * 255.0
+            )
+            # Different windowing for liver CT
+            lower_bound = CT_WINDOWS['liver'][0]
+            upper_bound = CT_WINDOWS['liver'][1]
+            image_data_liver = np.clip(image_data, lower_bound, upper_bound)
+            image_data_liver = (
+                (image_data_liver - np.min(image_data_liver))
+                / (np.max(image_data_liver) - np.min(image_data_liver))
+                * 255.0
+            )
+        else:
+            # For MRI, window between the 0.5th and 99.5th intensity percentiles
+            lower_bound, upper_bound = np.percentile(
+                image_data[image_data > 0], 0.5
+            ), np.percentile(image_data[image_data > 0], 99.5)
+            image_data_pre = np.clip(image_data, lower_bound, upper_bound)
+            image_data_pre = (
+                (image_data_pre - np.min(image_data_pre))
+                / (np.max(image_data_pre) - np.min(image_data_pre))
+                * 255.0
+            )
+            image_data_pre[image_data == 0] = 0
+
+        # Process mask annotations
+        mask = nib.load(os.path.join(datapath, f'labels{split}', file)).get_fdata()
+        shape = list(mask.shape[:2])
+        n_slices = mask.shape[2]
+
+        # Remove excluded label ids
+        for remove_label_id in remove_label_ids:
+            mask[mask == remove_label_id] = 0
+
+        # Pad to square; an odd difference is split across the two sides
+        if shape[0] > shape[1]:
+            diff = shape[0] - shape[1]
+            pad_width = ((0, 0), (diff // 2, diff - diff // 2), (0, 0))
+        elif shape[0] < shape[1]:
+            diff = shape[1] - shape[0]
+            pad_width = ((diff // 2, diff - diff // 2), (0, 0), (0, 0))
+        else:
+            pad_width = None
+
+        if pad_width is not None:
+            image_data_pre = np.pad(image_data_pre, pad_width, 'constant', constant_values=0)
+            if modality == 'CT':
+                image_data_liver = np.pad(image_data_liver, pad_width, 'constant', constant_values=0)
+            mask = np.pad(mask, pad_width, 'constant', constant_values=0)
+        # Preprocessing ends
+
+        # Find the largest slice area for each class
+        max_area = {}
+        for c in labels:
+            mask_c = 1 * (mask == int(c))
+            max_area[c] = mask_c.sum(axis=(0, 1)).max()
+        for c in merge_classes:
+            mask_c = 1 * (mask == int(c[0]))
+            for cc in c[1:]:
+                mask_c += 1 * (mask == int(cc))
+            mask_c = 1 * (mask_c > 0)
+            max_area[c] = mask_c.sum(axis=(0, 1)).max()
+
+        # Process slice by slice
+        for i in range(n_slices):
+            # Remove per-slice masks of a class that are too small
+            # (less than 10% of that class's largest cross-section)
+            for c in labels:
+                if np.sum(mask[:, :, i] == int(c)) < max_area[c] / 10:
+                    mask[:, :, i][mask[:, :, i] == int(c)] = 0
+            # Skip slices without any mask
+            if int(mask[:, :, i].max()) == 0:
+                continue
+
+            # Resize image to 1024x1024
+            resize_image = transform.resize(image_data_pre[:, :, i], (image_size, image_size), order=3, mode='constant', preserve_range=True, anti_aliasing=True)
+
+            # Save image
+            filename = f"{f}_{i}_{modality}_{anatomy}.png"
+            plt.imsave(os.path.join(targetpath, modality, f"{target_split}/{filename}"),
+                       resize_image.astype(np.uint8), cmap='gray')
+
+            # Save an extra image slice with the liver window for liver CT (label 6 is liver)
+            if modality == 'CT' and 6 in mask[:, :, i]:
+                # Resize image to 1024x1024
+                resize_image = transform.resize(image_data_liver[:, :, i], (image_size, image_size), order=3, mode='constant', preserve_range=True, anti_aliasing=True)
+                # Save image
+                filename = f"{f}_{i}_{modality}_liver.png"
+                plt.imsave(os.path.join(targetpath, modality, f"{target_split}/{filename}"),
+                           resize_image.astype(np.uint8), cmap='gray')
+
+            # Get masks for each class
+            for c in labels:
+                target = labels[c]
+                target_name = target.replace(' ', '+')
+
+                mask_c = 1 * (mask[:, :, i] == int(c))
+                # Make sure the class exists in this slice
+                if mask_c.max() == 0:
+                    continue
+
+                portion = int(10 * mask_c.sum() / max_area[c])
+
+                # Resize mask to 1024x1024 (nearest neighbour keeps it binary)
+                resize_gt = transform.resize(mask_c, (image_size, image_size), order=0, mode='constant', preserve_range=True, anti_aliasing=False)
+
+                # Stats of the resized mask
+                area, bbox = get_mask_stats(resize_gt)
+
+                # Colored mask
+                colored_mask = 255 * np.repeat(resize_gt[:, :, None], 3, axis=-1)
+                # Save output
+                mask_filename = f"{f}_{i}_{modality}_{anatomy}_{target_name}.png"
+                plt.imsave(os.path.join(targetpath, modality, f"{target_split}_mask/{mask_filename}"),
+                           colored_mask.astype(np.uint8))
+                # Record image and mask statistics
+                splits.append(target_split)
+                fnames.append(mask_filename)
+                original_shapes.append(tuple(shape))
+                mask_areas.append(int(area))
+                mask_bboxes.append(tuple(bbox))
+                intersect_portion.append(portion)
+
+                # Save the mask again for the liver-windowed CT image
+                if modality == 'CT' and target_name == 'liver':
+                    # Save output
+                    mask_filename = f"{f}_{i}_{modality}_liver_{target_name}.png"
+                    plt.imsave(os.path.join(targetpath, modality, f"{target_split}_mask/{mask_filename}"),
+                               colored_mask.astype(np.uint8))
+                    # Record image and mask statistics
+                    splits.append(target_split)
+                    fnames.append(mask_filename)
+                    original_shapes.append(tuple(shape))
+                    mask_areas.append(int(area))
+                    mask_bboxes.append(tuple(bbox))
+                    intersect_portion.append(portion)
+
+            # Get masks for merged classes
+            for c in merge_classes:
+                target = merge_classes[c]
+                target_name = target.replace(' ', '+')
+
+                # Make sure both classes exist in this slice
+                if (mask[:, :, i] == int(c[0])).sum() == 0 or (mask[:, :, i] == int(c[1])).sum() == 0:
+                    continue
+
+                mask_c = 1 * ((mask[:, :, i] == int(c[0])) | (mask[:, :, i] == int(c[1])))
+
+                portion = int(10 * mask_c.sum() / max_area[c])
+
+                # Resize mask to 1024x1024
+                resize_gt = transform.resize(mask_c, (image_size, image_size), order=0, mode='constant', preserve_range=True, anti_aliasing=False)
+
+                # Stats of the resized mask
+                area, bbox = get_mask_stats(resize_gt)
+
+                mask_filename = f"{f}_{i}_{modality}_{anatomy}_{target_name}.png"
+
+                # Save colored mask
+                colored_mask = 255 * np.repeat(resize_gt[:, :, None], 3, axis=-1)
+                plt.imsave(os.path.join(targetpath, modality, f"{target_split}_mask/{mask_filename}"),
+                           colored_mask.astype(np.uint8))
+
+                # Record image and mask statistics
+                splits.append(target_split)
+                fnames.append(mask_filename)
+                original_shapes.append(tuple(shape))
+                mask_areas.append(int(area))
+                mask_bboxes.append(tuple(bbox))
+                intersect_portion.append(portion)
+
+
+# Save statistics
+df = pd.DataFrame({'split': splits, 'fname': fnames, 'original_shape': original_shapes,
+                   'mask_area': mask_areas, 'mask_bbox': mask_bboxes, 'slice_ratio': intersect_portion})
+df.to_csv(os.path.join(targetpath, 'mask_stats.csv'), index=False)
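+
+# After this script runs, targetpath (e.g. .../BiomedSeg/amos22) contains a
+# CT/ and an MRI/ folder, each with train, train_mask, test, and test_mask
+# subfolders of 1024x1024 PNGs, plus mask_stats.csv with one row per saved
+# mask (split, fname, original_shape, mask_area, mask_bbox, slice_ratio).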