diff --git a/docs/source/_static/openapi.json b/docs/source/_static/openapi.json index a819c9e7..36802b5f 100644 --- a/docs/source/_static/openapi.json +++ b/docs/source/_static/openapi.json @@ -1 +1 @@ -{"openapi":"3.1.0","info":{"title":"JoliGEN server","description":"*commit:* [349ba92e](https://github.com/jolibrain/joliGEN/commit/349ba92e26f3e7a83398486ecfe80f7fb2cfcb3a)\n\nThis is the JoliGEN server API documentation.\n","version":"0.1.0"},"paths":{"/train/{name}":{"get":{"summary":"Get the status of a training process","operationId":"get_train_train__name__get","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Start a training process with given name.","description":"The training process will be created using the same options as command line","operationId":"train_train__name__post","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TrainOptions"}}}},"responses":{"201":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Delete a training process.","description":"If the process is running, it will be stopped.","operationId":"delete_train_train__name__delete","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/train":{"get":{"summary":"Get the status of all training processes","operationId":"get_train_processes_train_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/predict":{"post":{"summary":"Start a inference process","description":"The inference process will be created using the same options as command line","operationId":"predict_predict_post","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/info":{"get":{"summary":"Get the server status","operationId":"get_info_info_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/fs/":{"delete":{"summary":"Delete a file or a directory in the filesystem","description":"This endpoint can be dangerous, use it with extreme caution","operationId":"delete_path_fs__delete","parameters":[{"required":true,"schema":{"type":"string","title":"Path"},"name":"path","in":"query"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"TrainOptions":{"title":"TrainBody","type":"object","properties":{"server":{"title":"Server","default":{"sync":false},"allOf":[{"$ref":"#/definitions/ServerTrainOptions"}]},"train_options":{"title":"TrainOptions","type":"object","properties":{"checkpoints_dir":{"default":"./checkpoints","type":"string","description":"models are saved here"},"dataroot":{"default":"None","type":"string","description":"path to images (should have subfolders trainA, trainB, valA, valB, etc)"},"ddp_port":{"default":"12355","type":"string","description":""},"gpu_ids":{"default":"0","type":"string","description":"gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU"},"model_type":{"default":"cut","type":"string","description":"chooses which model to use.","enum":["cut","cycle_gan","palette","cm","cm_gan"]},"name":{"default":"experiment_name","type":"string","description":"name of the experiment. It decides where to store samples and models"},"phase":{"default":"train","type":"string","description":"train, val, test, etc"},"suffix":{"default":"","type":"string","description":"customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}"},"test_batch_size":{"default":1,"type":"integer","description":"input batch size"},"warning_mode":{"default":false,"type":"boolean","description":"whether to display warning"},"with_amp":{"default":false,"type":"boolean","description":"whether to activate torch amp on forward passes"},"with_tf32":{"default":false,"type":"boolean","description":"whether to activate tf32 for faster computations (Ampere GPU and beyond only)"},"with_torch_compile":{"default":false,"type":"boolean","description":"whether to activate torch.compile for some forward and backward functions (experimental)"},"D":{"title":"Discriminator","type":"object","properties":{"dropout":{"default":false,"type":"boolean","description":"whether to use dropout in the discriminator"},"n_layers":{"default":3,"type":"integer","description":"only used if netD==n_layers"},"ndf":{"default":64,"type":"integer","description":"\\# of discrim filters in the first conv layer"},"netDs":{"default":["projected_d","basic"],"type":"array","items":{"enum":null,"type":"string"},"description":"specify discriminator architecture, another option, --D_n_layers allows you to specify the layers in the n_layers discriminator. NB: duplicated arguments are ignored. 
Values: basic, n_layers, pixel, projected_d, temporal, vision_aided, depth, mask, sam"},"no_antialias":{"default":false,"type":"boolean","description":"if specified, use stride=2 convs instead of antialiased-downsampling (sad)"},"no_antialias_up":{"default":false,"type":"boolean","description":"if specified, use [upconv(learned filter)] instead of [upconv(hard-coded [1,3,3,1] filter), conv]"},"norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for D","enum":["instance","batch","none"]},"proj_config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file"},"proj_interp":{"default":-1,"type":"integer","description":"whether to force projected discriminator interpolation to a value \\> 224, -1 means no interpolation"},"proj_network_type":{"default":"efficientnet","type":"string","description":"projected discriminator architecture","enum":["efficientnet","segformer","vitbase","vitsmall","vitsmall2","vitclip16","vitclip14","depth","dinov2_vits14","dinov2_vitb14","dinov2_vitl14","dinov2_vitg14","dinov2_vits14_reg","dinov2_vitb14_reg","dinov2_vitl14_reg","dinov2_vitg14_reg","siglip_vitb16","siglip_vitl16","siglip_vit_so400m"]},"proj_weight_segformer":{"default":"models/configs/segformer/pretrain/segformer_mit-b0.pth","type":"string","description":"path to segformer weight"},"spectral":{"default":false,"type":"boolean","description":"whether to use spectral norm in the discriminator"},"temporal_every":{"default":4,"type":"integer","description":"apply temporal discriminator every x steps"},"vision_aided_backbones":{"default":"clip+dino+swin","type":"string","description":"specify vision aided discriminators architectures, they are frozen then output are combined and fitted with a linear network on top, choose from dino, clip, swin, det_coco, seg_ade and combine them with +"},"weight_sam":{"default":"","type":"string","description":"path to sam weight for D, e.g. models/configs/sam/pretrain/sam_vit_b_01ec64.pth, or models/configs/sam/pretrain/mobile_sam.pt for MobileSAM"}}},"G":{"title":"Generator","type":"object","properties":{"attn_nb_mask_attn":{"default":10,"type":"integer","description":"number of attention masks in _attn model architectures"},"attn_nb_mask_input":{"default":1,"type":"integer","description":"number of mask dedicated to input in _attn model architectures"},"backward_compatibility_twice_resnet_blocks":{"default":false,"type":"boolean","description":"if true, feats will go througt resnet blocks two times for resnet_attn generators. This option will be deleted, it's for backward compatibility (old models were trained that way)."},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for G"},"diff_n_timestep_test":{"default":1000,"type":"integer","description":"Number of timesteps used for UNET mha inference (test time)."},"diff_n_timestep_train":{"default":2000,"type":"integer","description":"Number of timesteps used for UNET mha training."},"dropout":{"default":false,"type":"boolean","description":"dropout for the generator"},"hdit_depths":{"default":[2,2,4],"type":"array","items":{"enum":null,"type":"string"},"description":"distribution of depth blocks across the HDiT stages, should have same size as --G_hdit_widths"},"hdit_patch_size":{"default":4,"type":"integer","description":"Patch size for HDIT, e.g. 
4 for 4x4 patches"},"hdit_widths":{"default":[192,384,768],"type":"array","items":{"enum":null,"type":"string"},"description":"width multiplier for each level of the HDiT"},"lora_unet":{"default":8,"type":"integer","description":"lora unet rank for G"},"lora_vae":{"default":8,"type":"integer","description":"lora vae rank for G"},"nblocks":{"default":9,"type":"integer","description":"\\# of layer blocks in G, applicable to resnets"},"netE":{"default":"resnet_256","type":"string","description":"specify multimodal latent vector encoder","enum":["resnet_128","resnet_256","resnet_512","conv_128","conv_256","conv_512"]},"netG":{"default":"mobile_resnet_attn","type":"string","description":"specify generator architecture","enum":["resnet","resnet_attn","mobile_resnet","mobile_resnet_attn","unet_256","unet_128","segformer_attn_conv","segformer_conv","ittr","unet_mha","uvit","unet_mha_ref_attn","dit","hdit","img2img_turbo","unet_vid"]},"ngf":{"default":64,"type":"integer","description":"\\# of gen filters in the last conv layer"},"norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for G","enum":["instance","batch","none"]},"padding_type":{"default":"reflect","type":"string","description":"whether to use padding in the generator","enum":["reflect","replicate","zeros"]},"spectral":{"default":false,"type":"boolean","description":"whether to use spectral norm in the generator"},"unet_mha_attn_res":{"default":[16],"type":"array","items":{"enum":null,"type":"string"},"description":"downrate samples at which attention takes place"},"unet_mha_channel_mults":{"default":[1,2,4,8],"type":"array","items":{"enum":null,"type":"string"},"description":"channel multiplier for each level of the UNET mha"},"unet_mha_group_norm_size":{"default":32,"type":"integer","description":""},"unet_mha_norm_layer":{"default":"groupnorm","type":"string","description":"","enum":["groupnorm","batchnorm","layernorm","instancenorm","switchablenorm"]},"unet_mha_num_head_channels":{"default":32,"type":"integer","description":"number of channels in each head of the mha architecture"},"unet_mha_num_heads":{"default":1,"type":"integer","description":"number of heads in the mha architecture"},"unet_mha_res_blocks":{"default":[2,2,2,2],"type":"array","items":{"enum":null,"type":"string"},"description":"distribution of resnet blocks across the UNet stages, should have same size as --G_unet_mha_channel_mults"},"unet_mha_vit_efficient":{"default":false,"type":"boolean","description":"if true, use efficient attention in UNet and UViT"},"unet_vid_max_frame":{"default":24,"type":"integer","description":"max frame number for unet_vid in the PositionalEncoding"},"uvit_num_transformer_blocks":{"default":6,"type":"integer","description":"Number of transformer blocks in UViT"}}},"alg":{"title":"Algorithm-specific","type":"object","properties":{"cm_dists_mean":{"default":[0.485,0.456,0.406],"type":"array","items":{"enum":null,"type":"string"},"description":"mean for DISTS perceptual loss"},"cm_dists_std":{"default":[0.229,0.224,0.225],"type":"array","items":{"enum":null,"type":"string"},"description":"std for DISTS perceptual loss"},"cm_lambda_perceptual":{"default":1.0,"type":"number","description":"weight for LPIPS and DISTS perceptual losses"},"cm_num_steps":{"default":1000000,"type":"integer","description":"number of steps before reaching the fully discretized consistency model sampling 
schedule"},"cm_perceptual_loss":{"default":[""],"type":"array","items":{"enum":null,"type":"string"},"description":"optional supervised perceptual loss","enum":["","LPIPS","DISTS"]},"diffusion_cond_computed_sketch_list":{"default":["canny","hed"],"type":"array","items":{"enum":null,"type":"string"},"description":"what primitives to use for random sketch"},"diffusion_cond_embed":{"default":"","type":"string","description":"whether to use conditioning embeddings to the generator layers, and what type","enum":["","mask","class","mask_and_class","ref"]},"diffusion_cond_embed_dim":{"default":32,"type":"integer","description":"nb of examples processed for inference"},"diffusion_cond_image_creation":{"default":"y_t","type":"string","description":"how image conditioning is created: either from y_t (no conditioning), previous frame, from computed sketch (e.g. canny), from low res image or from reference image (i.e. image that is not aligned with the ground truth)","enum":["y_t","previous_frame","computed_sketch","low_res","ref"]},"diffusion_cond_prob_use_previous_frame":{"default":0.5,"type":"number","description":"prob to use previous frame as y cond"},"diffusion_cond_sam_crop_delta":{"default":true,"type":"boolean","description":"extend crop's width and height by 2\\*crop_delta before computing masks"},"diffusion_cond_sam_final_canny":{"default":false,"type":"boolean","description":"whether to perform a Canny edge detection on sam sketch to soften the edges"},"diffusion_cond_sam_max_mask_area":{"default":0.99,"type":"number","description":"maximum area in proportion of image size for a mask to be kept"},"diffusion_cond_sam_min_mask_area":{"default":0.001,"type":"number","description":"minimum area in proportion of image size for a mask to be kept"},"diffusion_cond_sam_no_output_binary_sam":{"default":false,"type":"boolean","description":"whether to not output binary sketch before Canny"},"diffusion_cond_sam_no_sample_points_in_ellipse":{"default":false,"type":"boolean","description":"whether to not sample the points inside an ellipse to avoid the corners of the image"},"diffusion_cond_sam_no_sobel_filter":{"default":false,"type":"boolean","description":"whether to not use a Sobel filter on each SAM masks"},"diffusion_cond_sam_points_per_side":{"default":16,"type":"integer","description":"number of points per side of image to prompt SAM with (\\# of prompted points will be points_per_side\\*\\*2)"},"diffusion_cond_sam_redundancy_threshold":{"default":0.62,"type":"number","description":"redundancy threshold above which redundant masks are not kept"},"diffusion_cond_sam_sobel_threshold":{"default":0.7,"type":"number","description":"sobel threshold in %% of gradient magnitude"},"diffusion_cond_sam_use_gaussian_filter":{"default":false,"type":"boolean","description":"whether to apply a Gaussian blur to each SAM masks"},"diffusion_cond_sketch_canny_range":{"default":[0,765],"type":"array","items":{"enum":null,"type":"string"},"description":"range of randomized canny sketch thresholds"},"diffusion_dropout_prob":{"default":0.0,"type":"number","description":"dropout probability for classifier-free guidance"},"diffusion_generate_per_class":{"default":false,"type":"boolean","description":"whether to generate samples of each images"},"diffusion_lambda_G":{"default":1.0,"type":"number","description":"weight for supervised loss"},"diffusion_ref_embed_net":{"default":"clip","type":"string","description":"embedding network to use for ref 
conditioning","enum":["clip","imagebind"]},"diffusion_super_resolution_scale":{"default":2.0,"type":"number","description":"scale for super resolution"},"diffusion_task":{"default":"inpainting","type":"string","description":"Whether to perform inpainting, super resolution or pix2pix","enum":["inpainting","super_resolution","pix2pix"]},"diffusion_vid_canny_dropout":{"default":0,"type":"integer","description":"prob to drop canny for each frame"},"gan":{"title":"GAN model","type":"object","properties":{"lambda":{"default":1.0,"type":"number","description":"weight for GAN loss:GAN(G(X))"}}},"cut":{"title":"CUT model","type":"object","properties":{"HDCE_gamma":{"default":1.0,"type":"number","description":""},"HDCE_gamma_min":{"default":1.0,"type":"number","description":""},"MSE_idt":{"default":false,"type":"boolean","description":"use MSENCE loss for identity mapping: MSE(G(Y), Y))"},"dists_mean":{"default":[0.485,0.456,0.406],"type":"array","items":{"enum":null,"type":"string"},"description":"mean for DISTS perceptual loss"},"dists_std":{"default":[0.229,0.224,0.225],"type":"array","items":{"enum":null,"type":"string"},"description":"std for DISTS perceptual loss"},"flip_equivariance":{"default":false,"type":"boolean","description":"Enforce flip-equivariance as additional regularization. It's used by FastCUT, but not CUT"},"lambda_MSE_idt":{"default":1.0,"type":"number","description":"weight for MSE identity loss: MSE(G(X), X)"},"lambda_NCE":{"default":1.0,"type":"number","description":"weight for NCE loss: NCE(G(X), X)"},"lambda_SRC":{"default":0.0,"type":"number","description":"weight for SRC (semantic relation consistency) loss: NCE(G(X), X)"},"lambda_perceptual":{"default":1.0,"type":"number","description":"weight for LPIPS and DISTS perceptual losses"},"lambda_supervised":{"default":1.0,"type":"number","description":"weight for supervised loss"},"nce_T":{"default":0.07,"type":"number","description":"temperature for NCE loss"},"nce_idt":{"default":true,"type":"boolean","description":"use NCE loss for identity mapping: NCE(G(Y), Y))"},"nce_includes_all_negatives_from_minibatch":{"default":false,"type":"boolean","description":"(used for single image translation) If True, include the negatives from the other samples of the minibatch when computing the contrastive loss. 
Please see models/patchnce.py for more details."},"nce_layers":{"default":"0,4,8,12,16","type":"string","description":"compute NCE loss on which layers"},"nce_loss":{"default":"monce","type":"string","description":"CUT contrastice loss","enum":["patchnce","monce","SRC_hDCE"]},"netF":{"default":"mlp_sample","type":"string","description":"how to downsample the feature map","enum":["sample","mlp_sample","sample_qsattn","mlp_sample_qsattn"]},"netF_dropout":{"default":false,"type":"boolean","description":"whether to use dropout with F"},"netF_nc":{"default":256,"type":"integer","description":""},"netF_norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for F","enum":["instance","batch","none"]},"num_patches":{"default":256,"type":"integer","description":"number of patches per layer"},"supervised_loss":{"default":[""],"type":"array","items":{"enum":null,"type":"string"},"description":"supervised loss with aligned data","enum":["","MSE","L1","LPIPS","DISTS"]}}},"cyclegan":{"title":"CycleGAN model","type":"object","properties":{"lambda_A":{"default":10.0,"type":"number","description":"weight for cycle loss (A -\\> B -\\> A)"},"lambda_B":{"default":10.0,"type":"number","description":"weight for cycle loss (B -\\> A -\\> B)"},"lambda_identity":{"default":0.5,"type":"number","description":"use identity mapping. Setting lambda_identity other than 0 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set lambda_identity = 0.1"},"rec_noise":{"default":0.0,"type":"number","description":"whether to add noise to reconstruction"}}},"re":{"title":"ReCUT / ReCycleGAN","type":"object","properties":{"P_lr":{"default":0.0002,"type":"number","description":"initial learning rate for P networks"},"adversarial_loss_p":{"default":false,"type":"boolean","description":"if True, also train the prediction model with an adversarial loss"},"netP":{"default":"unet_128","type":"string","description":"specify P architecture","enum":["resnet_9blocks","resnet_6blocks","resnet_attn","unet_256","unet_128"]},"no_train_P_fake_images":{"default":false,"type":"boolean","description":"if True, P wont be trained over fake images projections"},"nuplet_size":{"default":3,"type":"integer","description":"Number of frames loaded"},"projection_threshold":{"default":1.0,"type":"number","description":"threshold of the real images projection loss below with fake projection and fake reconstruction losses are applied"}}},"palette":{"title":"Diffusion model","type":"object","properties":{"ddim_eta":{"default":0.5,"type":"number","description":"eta for ddim sampling variance"},"ddim_num_steps":{"default":10,"type":"integer","description":"number of steps for ddim sampling"},"loss":{"default":"MSE","type":"string","description":"loss type of the denoising model","enum":["L1","MSE","multiscale_L1","multiscale_MSE"]},"minsnr":{"default":false,"type":"boolean","description":"use min-SNR weighting"},"sampling_method":{"default":"ddpm","type":"string","description":"choose the sampling method between ddpm and ddim","enum":["ddpm","ddim"]}}}}},"data":{"title":"Datasets","type":"object","properties":{"crop_size":{"default":256,"type":"integer","description":"then crop to this size"},"dataset_mode":{"default":"unaligned","type":"string","description":"chooses how datasets are 
loaded.","enum":["unaligned","unaligned_labeled_cls","unaligned_labeled_mask","self_supervised_labeled_mask","unaligned_labeled_mask_cls","self_supervised_labeled_mask_cls","unaligned_labeled_mask_online","self_supervised_labeled_mask_online","unaligned_labeled_mask_cls_online","self_supervised_labeled_mask_cls_online","aligned","nuplet_unaligned_labeled_mask","temporal_labeled_mask_online","self_supervised_temporal_labeled_mask_online","self_supervised_temporal","single","unaligned_labeled_mask_ref","self_supervised_labeled_mask_ref","unaligned_labeled_mask_online_ref","unaligned_labeled_mask_online_prompt","self_supervised_labeled_mask_online_ref"]},"direction":{"default":"AtoB","type":"string","description":"AtoB or BtoA","enum":["AtoB","BtoA"]},"image_bits":{"default":8,"type":"integer","description":"number of bits of the image (e.g. 8, 12 or 16)"},"inverted_mask":{"default":false,"type":"boolean","description":"whether to invert the mask, i.e. around the bbox"},"load_size":{"default":286,"type":"integer","description":"scale images to this size"},"max_dataset_size":{"default":1000000000,"type":"integer","description":"Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded."},"num_threads":{"default":4,"type":"integer","description":"\\# threads for loading data"},"online_context_pixels":{"default":0,"type":"integer","description":"context pixel band around the crop, unused for generation, only for disc "},"online_fixed_mask_size":{"default":-1,"type":"integer","description":"if \\>0, it will be used as fixed bbox size (warning: in dataset resolution ie before resizing) "},"online_random_bbox":{"default":false,"type":"boolean","description":"whether to randomly sample a bbox per online crop"},"online_select_category":{"default":-1,"type":"integer","description":"category to select for bounding boxes, -1 means all boxes selected"},"online_single_bbox":{"default":false,"type":"boolean","description":"whether to only allow a single bbox per online crop"},"preprocess":{"default":"resize_and_crop","type":"string","description":"scaling and cropping of images at load time","enum":["resize_and_crop","crop","scale_width","scale_width_and_crop","none"]},"refined_mask":{"default":false,"type":"boolean","description":"whether to use refined mask with sam"},"relative_paths":{"default":false,"type":"boolean","description":"whether paths to images are relative to dataroot"},"sanitize_paths":{"default":false,"type":"boolean","description":"if true, wrong images or labels paths will be removed before training"},"serial_batches":{"default":false,"type":"boolean","description":"if true, takes images in order to make batches, otherwise takes them randomly"},"temporal_frame_step":{"default":30,"type":"integer","description":"how many frames between successive frames selected"},"temporal_num_common_char":{"default":-1,"type":"integer","description":"how many characters (the first ones) are used to identify a video; if =-1 natural sorting is used "},"temporal_number_frames":{"default":5,"type":"integer","description":"how many successive frames use for temporal loader"},"online_creation":{"title":"Online created datasets","type":"object","properties":{"color_mask_A":{"default":false,"type":"boolean","description":"Perform task of replacing color-filled masks by objects"},"crop_delta_A":{"default":50,"type":"integer","description":"size of crops are random, values allowed are online_creation_crop_size more or less 
online_creation_crop_delta for domain A"},"crop_delta_B":{"default":50,"type":"integer","description":"size of crops are random, values allowed are online_creation_crop_size more or less online_creation_crop_delta for domain B"},"crop_size_A":{"default":512,"type":"integer","description":"crop to this size during online creation, it needs to be greater than bbox size for domain A"},"crop_size_B":{"default":512,"type":"integer","description":"crop to this size during online creation, it needs to be greater than bbox size for domain B"},"load_size_A":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"load to this size during online creation, format : width height or only one size if square"},"load_size_B":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"load to this size during online creation, format : width height or only one size if square"},"mask_delta_A":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"mask offset (in pixels) to allow generation of a bigger object in domain B (for semantic loss) for domain A, format : 'width (x),height (y)' for each class or only one size if square, e.g. '125, 55 100, 100' for 2 classes"},"mask_delta_A_ratio":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask offset to allow generation of a bigger object in domain B (for semantic loss) for domain A, format : width (x),height (y) for each class or only one size if square"},"mask_delta_B":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"mask offset (in pixels) to allow generation of a bigger object in domain A (for semantic loss) for domain B, format : 'width (x),height (y)' for each class or only one size if square, e.g. 
'125, 55 100, 100' for 2 classes"},"mask_delta_B_ratio":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask offset to allow generation of a bigger object in domain A (for semantic loss) for domain B, format : 'width (x),height (y)' for each class or only one size if square"},"mask_random_offset_A":{"default":[0.0],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask size randomization (only to make bigger one) to robustify the image generation in domain A, format : width (x) height (y) or only one size if square"},"mask_random_offset_B":{"default":[0.0],"type":"array","items":{"enum":null,"type":"string"},"description":"mask size randomization (only to make bigger one) to robustify the image generation in domain B, format : width (y) height (x) or only one size if square"},"mask_square_A":{"default":false,"type":"boolean","description":"whether masks should be squared for domain A"},"mask_square_B":{"default":false,"type":"boolean","description":"whether masks should be squared for domain B"},"rand_mask_A":{"default":false,"type":"boolean","description":"Perform task of replacing noised masks by objects"}}}}},"f_s":{"title":"Semantic segmentation network","type":"object","properties":{"all_classes_as_one":{"default":false,"type":"boolean","description":"if true, all classes will be considered as the same one (ie foreground vs background)"},"class_weights":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"class weights for imbalanced semantic classes"},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for f_s"},"dropout":{"default":false,"type":"boolean","description":"dropout for the semantic network"},"net":{"default":"vgg","type":"string","description":"specify f_s network [vgg|unet|segformer|sam]","enum":["vgg","unet","segformer","sam"]},"nf":{"default":64,"type":"integer","description":"\\# of filters in the first conv layer of classifier"},"semantic_nclasses":{"default":2,"type":"integer","description":"number of classes of the semantic loss classifier"},"semantic_threshold":{"default":1.0,"type":"number","description":"threshold of the semantic classifier loss below with semantic loss is applied"},"weight_sam":{"default":"","type":"string","description":"path to sam weight for f_s, e.g. models/configs/sam/pretrain/sam_vit_b_01ec64.pth, or models/configs/sam/pretrain/mobile_sam.pt for MobileSAM"},"weight_segformer":{"default":"","type":"string","description":"path to segformer weight for f_s, e.g. 
models/configs/segformer/pretrain/segformer_mit-b0.pth"}}},"cls":{"title":"Semantic classification network","type":"object","properties":{"all_classes_as_one":{"default":false,"type":"boolean","description":"if true, all classes will be considered as the same one (ie foreground vs background)"},"class_weights":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"class weights for imbalanced semantic classes"},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for cls"},"dropout":{"default":false,"type":"boolean","description":"dropout for the semantic network"},"net":{"default":"vgg","type":"string","description":"specify cls network [vgg|unet|segformer]","enum":["vgg","unet","segformer"]},"nf":{"default":64,"type":"integer","description":"\\# of filters in the first conv layer of classifier"},"semantic_nclasses":{"default":2,"type":"integer","description":"number of classes of the semantic loss classifier"},"semantic_threshold":{"default":1.0,"type":"number","description":"threshold of the semantic classifier loss below with semantic loss is applied"},"weight_segformer":{"default":"","type":"string","description":"path to segformer weight for cls, e.g. models/configs/segformer/pretrain/segformer_mit-b0.pth"}}},"output":{"title":"Output","type":"object","properties":{"no_html":{"default":false,"type":"boolean","description":"do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/"},"num_images":{"default":20,"type":"integer","description":"number of visualized images results from the train/test set"},"print_freq":{"default":100,"type":"integer","description":"frequency of showing training results on console"},"update_html_freq":{"default":1000,"type":"integer","description":"frequency of saving training results to html"},"verbose":{"default":false,"type":"boolean","description":"if specified, print more debugging information"},"display":{"title":"Visdom display","type":"object","properties":{"G_attention_masks":{"default":false,"type":"boolean","description":""},"aim_port":{"default":53800,"type":"integer","description":"aim port of the web display"},"aim_server":{"default":"http://localhost","type":"string","description":"aim server of the web display"},"diff_fake_real":{"default":false,"type":"boolean","description":"if True x - G(x) is displayed"},"env":{"default":"","type":"string","description":"visdom display environment name (default is \"main\")"},"freq":{"default":400,"type":"integer","description":"frequency of showing training results on screen"},"id":{"default":1,"type":"integer","description":"window id of the web display"},"ncols":{"default":0,"type":"integer","description":"if positive, display all images in a single visdom web panel with certain number of images per row.(if == 0 ncols will be computed automatically)"},"networks":{"default":false,"type":"boolean","description":"Set True if you want to display networks on port 8000"},"type":{"default":["visdom"],"type":"array","items":{"enum":null,"type":"string"},"description":"output display, either visdom, aim or no output","enum":["visdom","aim","none"]},"visdom_autostart":{"default":false,"type":"boolean","description":"whether to start a visdom server automatically"},"visdom_port":{"default":8097,"type":"integer","description":"visdom port of the web display"},"visdom_server":{"default":"http://localhost","type":"string","description":"visdom server of the web 
display"},"winsize":{"default":256,"type":"integer","description":"display window size for both visdom and HTML"}}}}},"model":{"title":"Model","type":"object","properties":{"depth_network":{"default":"DPT_Large","type":"string","description":"specify depth prediction network architecture","enum":["DPT_Large","DPT_Hybrid","MiDaS_small","DPT_BEiT_L_512","DPT_BEiT_L_384","DPT_BEiT_B_384","DPT_SwinV2_L_384","DPT_SwinV2_B_384","DPT_SwinV2_T_256","DPT_Swin_L_384","DPT_Next_ViT_L_384","DPT_LeViT_224"]},"init_gain":{"default":0.02,"type":"number","description":"scaling factor for normal, xavier and orthogonal."},"init_type":{"default":"normal","type":"string","description":"network initialization","enum":["normal","xavier","kaiming","orthogonal"]},"input_nc":{"default":3,"type":"integer","description":"\\# of input image channels: 3 for RGB and 1 for grayscale, more supported"},"multimodal":{"default":false,"type":"boolean","description":"multimodal model with random latent input vector"},"output_nc":{"default":3,"type":"integer","description":"\\# of output image channels: 3 for RGB and 1 for grayscale"},"prior_321_backwardcompatibility":{"default":false,"type":"boolean","description":"whether to load models from previous version of JG."},"type_sam":{"default":"mobile_sam","type":"string","description":"which model to use for segment-anything mask generation","enum":["sam","mobile_sam"]}}},"train":{"title":"Training","type":"object","properties":{"D_accuracy_every":{"default":1000,"type":"integer","description":"compute D accuracy every N iterations"},"D_lr":{"default":0.0001,"type":"number","description":"discriminator separate learning rate"},"G_ema":{"default":false,"type":"boolean","description":"whether to build G via exponential moving average"},"G_ema_beta":{"default":0.999,"type":"number","description":"exponential decay for ema"},"G_lr":{"default":0.0002,"type":"number","description":"initial learning rate for generator"},"batch_size":{"default":1,"type":"integer","description":"input batch size"},"beta1":{"default":0.9,"type":"number","description":"momentum term of adam"},"beta2":{"default":0.999,"type":"number","description":"momentum term of adam"},"cls_l1_regression":{"default":false,"type":"boolean","description":"if true l1 loss will be used to compute regressor loss"},"cls_regression":{"default":false,"type":"boolean","description":"if true cls will be a regressor and not a classifier"},"compute_D_accuracy":{"default":false,"type":"boolean","description":"whether to compute D accuracy explicitely"},"compute_metrics_test":{"default":false,"type":"boolean","description":"whether to compute test metrics, e.g. FID, ..."},"continue":{"default":false,"type":"boolean","description":"continue training: load the latest model"},"epoch":{"default":"latest","type":"string","description":"which epoch to load? set to latest to use latest cached model"},"epoch_count":{"default":1,"type":"integer","description":"the starting epoch count, we save the model by \\, \\+\\, ..."},"export_jit":{"default":false,"type":"boolean","description":"whether to export model in jit format"},"feat_wavelet":{"default":false,"type":"boolean","description":"if true, train in wavelet features space (Note: this may not include all discriminators, when training GANs)"},"gan_mode":{"default":"lsgan","type":"string","description":"the type of GAN objective. 
vanilla GAN loss is the cross-entropy objective used in the original GAN paper.","enum":["vanilla","lsgan","wgangp","projected"]},"iter_size":{"default":1,"type":"integer","description":"backward will be apllied each iter_size iterations, it simulate a greater batch size : its value is batch_size\\*iter_size"},"load_iter":{"default":0,"type":"integer","description":"which iteration to load? if load_iter \\> 0, the code will load models by iter_[load_iter]; otherwise, the code will load models by [epoch]"},"lr_decay_iters":{"default":50,"type":"integer","description":"multiply by a gamma every lr_decay_iters iterations"},"lr_policy":{"default":"linear","type":"string","description":"learning rate policy.","enum":["linear","step","multistep","plateau","cosine"]},"lr_steps":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"number of epochs between reductions of the learning rate by gamma=0.1"},"metrics_every":{"default":1000,"type":"integer","description":"compute metrics every N iterations"},"metrics_list":{"default":["FID"],"type":"array","items":{"enum":null,"type":"string"},"description":"metrics on results quality to compute","enum":["FID","KID","MSID","PSNR","LPIPS","SSIM"]},"metrics_save_images":{"default":false,"type":"boolean","description":"whether to save images that result form metrics computation"},"mm_lambda_z":{"default":0.5,"type":"number","description":"weight for random z loss"},"mm_nz":{"default":8,"type":"integer","description":"number of latent vectors"},"n_epochs":{"default":100,"type":"integer","description":"number of epochs with the initial learning rate"},"n_epochs_decay":{"default":0,"type":"integer","description":"number of epochs to linearly decay learning rate to zero"},"nb_img_max_fid":{"default":1000000000,"type":"integer","description":"Maximum number of samples allowed per dataset to compute fid. 
If the dataset directory contains more than nb_img_max_fid, only a subset is used."},"optim":{"default":"adam","type":"string","description":"optimizer (adam, radam, adamw, ...)","enum":["adam","radam","adamw","lion","adam8bit"]},"optim_eps":{"default":1e-08,"type":"number","description":"epsilon for optimizer"},"optim_weight_decay":{"default":0.0,"type":"number","description":"weight decay for optimizer"},"pool_size":{"default":50,"type":"integer","description":"the size of image buffer that stores previously generated images"},"save_by_iter":{"default":false,"type":"boolean","description":"whether saves model by iteration"},"save_epoch_freq":{"default":1,"type":"integer","description":"frequency of saving checkpoints at the end of epochs"},"save_latest_freq":{"default":5000,"type":"integer","description":"frequency of saving the latest results"},"semantic_cls":{"default":false,"type":"boolean","description":"if true semantic class losses will be used"},"semantic_mask":{"default":false,"type":"boolean","description":"if true semantic mask losses will be used"},"temporal_criterion":{"default":false,"type":"boolean","description":"if true, MSE loss will be computed between successive frames"},"temporal_criterion_lambda":{"default":1.0,"type":"number","description":"lambda for MSE loss that will be computed between successive frames"},"use_contrastive_loss_D":{"default":false,"type":"boolean","description":""},"sem":{"title":"Semantic training","type":"object","properties":{"cls_B":{"default":false,"type":"boolean","description":"if true cls will be trained not only on domain A but also on domain B"},"cls_lambda":{"default":1.0,"type":"number","description":"weight for semantic class loss"},"cls_pretrained":{"default":false,"type":"boolean","description":"whether to use a pretrained model, available for non \"basic\" model only"},"cls_template":{"default":"basic","type":"string","description":"classifier/regressor model type, from torchvision (resnet18, ...), default is custom simple model"},"idt":{"default":false,"type":"boolean","description":"if true apply semantic loss on identity"},"lr_cls":{"default":0.0002,"type":"number","description":"cls learning rate"},"lr_f_s":{"default":0.0002,"type":"number","description":"f_s learning rate"},"mask_lambda":{"default":1.0,"type":"number","description":"weight for semantic mask loss"},"net_output":{"default":false,"type":"boolean","description":"if true apply generator semantic loss on network output for real image rather than on label."},"use_label_B":{"default":false,"type":"boolean","description":"if true domain B has labels too"}}},"mask":{"title":"Semantic training with masks","type":"object","properties":{"charbonnier_eps":{"default":1e-06,"type":"number","description":"Charbonnier loss epsilon value"},"compute_miou":{"default":false,"type":"boolean","description":"whether to compute mIoU on semantic masks prediction"},"disjoint_f_s":{"default":false,"type":"boolean","description":"whether to use a disjoint f_s with the same exact structure"},"f_s_B":{"default":false,"type":"boolean","description":"if true f_s will be trained not only on domain A but also on domain B"},"for_removal":{"default":false,"type":"boolean","description":"if true, object removal mode, domain B images with label 0, cut models only"},"lambda_out_mask":{"default":10.0,"type":"number","description":"weight for loss out mask"},"loss_out_mask":{"default":"L1","type":"string","description":"loss for out mask content (which should not 
change).","enum":["L1","MSE","Charbonnier"]},"miou_every":{"default":1000,"type":"integer","description":"compute mIoU every n iterations"},"no_train_f_s_A":{"default":false,"type":"boolean","description":"if true f_s wont be trained on domain A"},"out_mask":{"default":false,"type":"boolean","description":"use loss out mask"}}}}},"dataaug":{"title":"Data augmentation","type":"object","properties":{"APA":{"default":false,"type":"boolean","description":"if true, G will be used as augmentation during D training adaptively to D overfitting between real and fake images"},"APA_every":{"default":4,"type":"integer","description":"How often to perform APA adjustment?"},"APA_nimg":{"default":50,"type":"integer","description":"APA adjustment speed, measured in how many images it takes for p to increase/decrease by one unit."},"APA_p":{"default":0,"type":"integer","description":"initial value of probability APA"},"APA_target":{"default":0.6,"type":"number","description":""},"D_diffusion":{"default":false,"type":"boolean","description":"whether to apply diffusion noise augmentation to discriminator inputs, projected discriminator only"},"D_diffusion_every":{"default":4,"type":"integer","description":"How often to perform diffusion augmentation adjustment"},"D_label_smooth":{"default":false,"type":"boolean","description":"whether to use one-sided label smoothing with discriminator"},"D_noise":{"default":0.0,"type":"number","description":"whether to add instance noise to discriminator inputs"},"affine":{"default":0.0,"type":"number","description":"if specified, apply random affine transforms to the images for data augmentation"},"affine_scale_max":{"default":1.2,"type":"number","description":"if random affine specified, max scale range value"},"affine_scale_min":{"default":0.8,"type":"number","description":"if random affine specified, min scale range value"},"affine_shear":{"default":45,"type":"integer","description":"if random affine specified, shear range (0,value)"},"affine_translate":{"default":0.2,"type":"number","description":"if random affine specified, translation range (-value\\*img_size,+value\\*img_size) value"},"diff_aug_policy":{"default":"","type":"string","description":"choose the augmentation policy : color randaffine randperspective. If you want more than one, please write them separated by a comma with no space (e.g. color,randaffine)"},"diff_aug_proba":{"default":0.5,"type":"number","description":"proba of using each transformation"},"flip":{"default":"horizontal","type":"string","description":"if specified, flip the images for data augmentation, possible values: none, horizontal, vertical, both","enum":["none","horizontal","vertical","both"]},"imgaug":{"default":false,"type":"boolean","description":"whether to apply random image augmentation"},"no_rotate":{"default":false,"type":"boolean","description":"if specified, do not rotate the images for data augmentation"}}}}}},"definitions":{"ServerTrainOptions":{"title":"ServerTrainOptions","type":"object","properties":{"sync":{"title":"Sync","description":"if false, the call returns immediately and train process is executed in the background. If true, the call returns only when training process is finished","default":false,"type":"boolean"}}}}}}},"definitions":{"ServerTrainOptions":{"title":"ServerTrainOptions","type":"object","properties":{"sync":{"title":"Sync","description":"if false, the call returns immediately and train process is executed in the background. 
If true, the call returns only when training process is finished","default":false,"type":"boolean"}}}}} \ No newline at end of file +{"openapi":"3.1.0","info":{"title":"JoliGEN server","description":"*commit:* [84473fcb](https://github.com/jolibrain/joliGEN/commit/84473fcbc10e16eb124c99e4a3d8d863ab84f4bf)\n\nThis is the JoliGEN server API documentation.\n","version":"0.1.0"},"paths":{"/train/{name}":{"get":{"summary":"Get the status of a training process","operationId":"get_train_train__name__get","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"post":{"summary":"Start a training process with given name.","description":"The training process will be created using the same options as command line","operationId":"train_train__name__post","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"requestBody":{"content":{"application/json":{"schema":{"$ref":"#/components/schemas/TrainOptions"}}}},"responses":{"201":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}},"delete":{"summary":"Delete a training process.","description":"If the process is running, it will be stopped.","operationId":"delete_train_train__name__delete","parameters":[{"required":true,"schema":{"type":"string","title":"Name"},"name":"name","in":"path"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}},"/train":{"get":{"summary":"Get the status of all training processes","operationId":"get_train_processes_train_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/predict":{"post":{"summary":"Start a inference process","description":"The inference process will be created using the same options as command line","operationId":"predict_predict_post","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/info":{"get":{"summary":"Get the server status","operationId":"get_info_info_get","responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}}}}},"/fs/":{"delete":{"summary":"Delete a file or a directory in the filesystem","description":"This endpoint can be dangerous, use it with extreme caution","operationId":"delete_path_fs__delete","parameters":[{"required":true,"schema":{"type":"string","title":"Path"},"name":"path","in":"query"}],"responses":{"200":{"description":"Successful Response","content":{"application/json":{"schema":{}}}},"422":{"description":"Validation 
Error","content":{"application/json":{"schema":{"$ref":"#/components/schemas/HTTPValidationError"}}}}}}}},"components":{"schemas":{"HTTPValidationError":{"properties":{"detail":{"items":{"$ref":"#/components/schemas/ValidationError"},"type":"array","title":"Detail"}},"type":"object","title":"HTTPValidationError"},"ValidationError":{"properties":{"loc":{"items":{"anyOf":[{"type":"string"},{"type":"integer"}]},"type":"array","title":"Location"},"msg":{"type":"string","title":"Message"},"type":{"type":"string","title":"Error Type"}},"type":"object","required":["loc","msg","type"],"title":"ValidationError"},"TrainOptions":{"title":"TrainBody","type":"object","properties":{"server":{"title":"Server","default":{"sync":false},"allOf":[{"$ref":"#/definitions/ServerTrainOptions"}]},"train_options":{"title":"TrainOptions","type":"object","properties":{"checkpoints_dir":{"default":"./checkpoints","type":"string","description":"models are saved here"},"dataroot":{"default":"None","type":"string","description":"path to images (should have subfolders trainA, trainB, valA, valB, etc)"},"ddp_port":{"default":"12355","type":"string","description":""},"gpu_ids":{"default":"0","type":"string","description":"gpu ids: e.g. 0 0,1,2, 0,2. use -1 for CPU"},"model_type":{"default":"cut","type":"string","description":"chooses which model to use.","enum":["cut","cycle_gan","palette","cm","cm_gan"]},"name":{"default":"experiment_name","type":"string","description":"name of the experiment. It decides where to store samples and models"},"phase":{"default":"train","type":"string","description":"train, val, test, etc"},"suffix":{"default":"","type":"string","description":"customized suffix: opt.name = opt.name + suffix: e.g., {model}_{netG}_size{load_size}"},"test_batch_size":{"default":1,"type":"integer","description":"input batch size"},"warning_mode":{"default":false,"type":"boolean","description":"whether to display warning"},"with_amp":{"default":false,"type":"boolean","description":"whether to activate torch amp on forward passes"},"with_tf32":{"default":false,"type":"boolean","description":"whether to activate tf32 for faster computations (Ampere GPU and beyond only)"},"with_torch_compile":{"default":false,"type":"boolean","description":"whether to activate torch.compile for some forward and backward functions (experimental)"},"D":{"title":"Discriminator","type":"object","properties":{"dropout":{"default":false,"type":"boolean","description":"whether to use dropout in the discriminator"},"n_layers":{"default":3,"type":"integer","description":"only used if netD==n_layers"},"ndf":{"default":64,"type":"integer","description":"\\# of discrim filters in the first conv layer"},"netDs":{"default":["projected_d","basic"],"type":"array","items":{"enum":null,"type":"string"},"description":"specify discriminator architecture, another option, --D_n_layers allows you to specify the layers in the n_layers discriminator. NB: duplicated arguments are ignored. 
Values: basic, n_layers, pixel, projected_d, temporal, vision_aided, depth, mask, sam"},"no_antialias":{"default":false,"type":"boolean","description":"if specified, use stride=2 convs instead of antialiased-downsampling (sad)"},"no_antialias_up":{"default":false,"type":"boolean","description":"if specified, use [upconv(learned filter)] instead of [upconv(hard-coded [1,3,3,1] filter), conv]"},"norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for D","enum":["instance","batch","none"]},"proj_config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file"},"proj_interp":{"default":-1,"type":"integer","description":"whether to force projected discriminator interpolation to a value \\> 224, -1 means no interpolation"},"proj_network_type":{"default":"efficientnet","type":"string","description":"projected discriminator architecture","enum":["efficientnet","segformer","vitbase","vitsmall","vitsmall2","vitclip16","vitclip14","depth","dinov2_vits14","dinov2_vitb14","dinov2_vitl14","dinov2_vitg14","dinov2_vits14_reg","dinov2_vitb14_reg","dinov2_vitl14_reg","dinov2_vitg14_reg","siglip_vitb16","siglip_vitl16","siglip_vit_so400m"]},"proj_weight_segformer":{"default":"models/configs/segformer/pretrain/segformer_mit-b0.pth","type":"string","description":"path to segformer weight"},"spectral":{"default":false,"type":"boolean","description":"whether to use spectral norm in the discriminator"},"temporal_every":{"default":4,"type":"integer","description":"apply temporal discriminator every x steps"},"vision_aided_backbones":{"default":"clip+dino+swin","type":"string","description":"specify vision aided discriminators architectures, they are frozen then output are combined and fitted with a linear network on top, choose from dino, clip, swin, det_coco, seg_ade and combine them with +"},"weight_sam":{"default":"","type":"string","description":"path to sam weight for D, e.g. models/configs/sam/pretrain/sam_vit_b_01ec64.pth, or models/configs/sam/pretrain/mobile_sam.pt for MobileSAM"}}},"G":{"title":"Generator","type":"object","properties":{"attn_nb_mask_attn":{"default":10,"type":"integer","description":"number of attention masks in _attn model architectures"},"attn_nb_mask_input":{"default":1,"type":"integer","description":"number of mask dedicated to input in _attn model architectures"},"backward_compatibility_twice_resnet_blocks":{"default":false,"type":"boolean","description":"if true, feats will go througt resnet blocks two times for resnet_attn generators. This option will be deleted, it's for backward compatibility (old models were trained that way)."},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for G"},"diff_n_timestep_test":{"default":1000,"type":"integer","description":"Number of timesteps used for UNET mha inference (test time)."},"diff_n_timestep_train":{"default":2000,"type":"integer","description":"Number of timesteps used for UNET mha training."},"dropout":{"default":false,"type":"boolean","description":"dropout for the generator"},"hdit_depths":{"default":[2,2,4],"type":"array","items":{"enum":null,"type":"string"},"description":"distribution of depth blocks across the HDiT stages, should have same size as --G_hdit_widths"},"hdit_patch_size":{"default":4,"type":"integer","description":"Patch size for HDIT, e.g. 
4 for 4x4 patches"},"hdit_widths":{"default":[192,384,768],"type":"array","items":{"enum":null,"type":"string"},"description":"width multiplier for each level of the HDiT"},"lora_unet":{"default":8,"type":"integer","description":"lora unet rank for G"},"lora_vae":{"default":8,"type":"integer","description":"lora vae rank for G"},"nblocks":{"default":9,"type":"integer","description":"\\# of layer blocks in G, applicable to resnets"},"netE":{"default":"resnet_256","type":"string","description":"specify multimodal latent vector encoder","enum":["resnet_128","resnet_256","resnet_512","conv_128","conv_256","conv_512"]},"netG":{"default":"mobile_resnet_attn","type":"string","description":"specify generator architecture","enum":["resnet","resnet_attn","mobile_resnet","mobile_resnet_attn","unet_256","unet_128","segformer_attn_conv","segformer_conv","ittr","unet_mha","uvit","unet_mha_ref_attn","dit","hdit","img2img_turbo","unet_vid"]},"ngf":{"default":64,"type":"integer","description":"\\# of gen filters in the last conv layer"},"norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for G","enum":["instance","batch","none"]},"padding_type":{"default":"reflect","type":"string","description":"whether to use padding in the generator","enum":["reflect","replicate","zeros"]},"spectral":{"default":false,"type":"boolean","description":"whether to use spectral norm in the generator"},"unet_mha_attn_res":{"default":[16],"type":"array","items":{"enum":null,"type":"string"},"description":"downrate samples at which attention takes place"},"unet_mha_channel_mults":{"default":[1,2,4,8],"type":"array","items":{"enum":null,"type":"string"},"description":"channel multiplier for each level of the UNET mha"},"unet_mha_group_norm_size":{"default":32,"type":"integer","description":""},"unet_mha_norm_layer":{"default":"groupnorm","type":"string","description":"","enum":["groupnorm","batchnorm","layernorm","instancenorm","switchablenorm"]},"unet_mha_num_head_channels":{"default":32,"type":"integer","description":"number of channels in each head of the mha architecture"},"unet_mha_num_heads":{"default":1,"type":"integer","description":"number of heads in the mha architecture"},"unet_mha_res_blocks":{"default":[2,2,2,2],"type":"array","items":{"enum":null,"type":"string"},"description":"distribution of resnet blocks across the UNet stages, should have same size as --G_unet_mha_channel_mults"},"unet_mha_vit_efficient":{"default":false,"type":"boolean","description":"if true, use efficient attention in UNet and UViT"},"unet_vid_max_frame":{"default":24,"type":"integer","description":"max frame number for unet_vid in the PositionalEncoding"},"uvit_num_transformer_blocks":{"default":6,"type":"integer","description":"Number of transformer blocks in UViT"}}},"alg":{"title":"Algorithm-specific","type":"object","properties":{"cm_dists_mean":{"default":[0.485,0.456,0.406],"type":"array","items":{"enum":null,"type":"string"},"description":"mean for DISTS perceptual loss"},"cm_dists_std":{"default":[0.229,0.224,0.225],"type":"array","items":{"enum":null,"type":"string"},"description":"std for DISTS perceptual loss"},"cm_lambda_perceptual":{"default":1.0,"type":"number","description":"weight for LPIPS and DISTS perceptual losses"},"cm_num_steps":{"default":1000000,"type":"integer","description":"number of steps before reaching the fully discretized consistency model sampling 
schedule"},"cm_perceptual_loss":{"default":[""],"type":"array","items":{"enum":null,"type":"string"},"description":"optional supervised perceptual loss","enum":["","LPIPS","DISTS"]},"diffusion_cond_computed_sketch_list":{"default":["canny","hed"],"type":"array","items":{"enum":null,"type":"string"},"description":"what primitives to use for random sketch"},"diffusion_cond_embed":{"default":"","type":"string","description":"whether to use conditioning embeddings to the generator layers, and what type","enum":["","mask","class","mask_and_class","ref"]},"diffusion_cond_embed_dim":{"default":32,"type":"integer","description":"nb of examples processed for inference"},"diffusion_cond_image_creation":{"default":"y_t","type":"string","description":"how image conditioning is created: either from y_t (no conditioning), previous frame, from computed sketch (e.g. canny), from low res image or from reference image (i.e. image that is not aligned with the ground truth)","enum":["y_t","previous_frame","computed_sketch","low_res","ref"]},"diffusion_cond_prob_use_previous_frame":{"default":0.5,"type":"number","description":"prob to use previous frame as y cond"},"diffusion_cond_sam_crop_delta":{"default":true,"type":"boolean","description":"extend crop's width and height by 2\\*crop_delta before computing masks"},"diffusion_cond_sam_final_canny":{"default":false,"type":"boolean","description":"whether to perform a Canny edge detection on sam sketch to soften the edges"},"diffusion_cond_sam_max_mask_area":{"default":0.99,"type":"number","description":"maximum area in proportion of image size for a mask to be kept"},"diffusion_cond_sam_min_mask_area":{"default":0.001,"type":"number","description":"minimum area in proportion of image size for a mask to be kept"},"diffusion_cond_sam_no_output_binary_sam":{"default":false,"type":"boolean","description":"whether to not output binary sketch before Canny"},"diffusion_cond_sam_no_sample_points_in_ellipse":{"default":false,"type":"boolean","description":"whether to not sample the points inside an ellipse to avoid the corners of the image"},"diffusion_cond_sam_no_sobel_filter":{"default":false,"type":"boolean","description":"whether to not use a Sobel filter on each SAM masks"},"diffusion_cond_sam_points_per_side":{"default":16,"type":"integer","description":"number of points per side of image to prompt SAM with (\\# of prompted points will be points_per_side\\*\\*2)"},"diffusion_cond_sam_redundancy_threshold":{"default":0.62,"type":"number","description":"redundancy threshold above which redundant masks are not kept"},"diffusion_cond_sam_sobel_threshold":{"default":0.7,"type":"number","description":"sobel threshold in %% of gradient magnitude"},"diffusion_cond_sam_use_gaussian_filter":{"default":false,"type":"boolean","description":"whether to apply a Gaussian blur to each SAM masks"},"diffusion_cond_sketch_canny_range":{"default":[0,765],"type":"array","items":{"enum":null,"type":"string"},"description":"range of randomized canny sketch thresholds"},"diffusion_dropout_prob":{"default":0.0,"type":"number","description":"dropout probability for classifier-free guidance"},"diffusion_generate_per_class":{"default":false,"type":"boolean","description":"whether to generate samples of each images"},"diffusion_lambda_G":{"default":1.0,"type":"number","description":"weight for supervised loss"},"diffusion_ref_embed_net":{"default":"clip","type":"string","description":"embedding network to use for ref 
conditioning","enum":["clip","imagebind"]},"diffusion_super_resolution_scale":{"default":2.0,"type":"number","description":"scale for super resolution"},"diffusion_task":{"default":"inpainting","type":"string","description":"Whether to perform inpainting, super resolution or pix2pix","enum":["inpainting","super_resolution","pix2pix"]},"diffusion_vid_canny_dropout":{"default":0,"type":"integer","description":"prob to drop canny for each frame"},"gan":{"title":"GAN model","type":"object","properties":{"lambda":{"default":1.0,"type":"number","description":"weight for GAN loss:GAN(G(X))"}}},"cut":{"title":"CUT model","type":"object","properties":{"HDCE_gamma":{"default":1.0,"type":"number","description":""},"HDCE_gamma_min":{"default":1.0,"type":"number","description":""},"MSE_idt":{"default":false,"type":"boolean","description":"use MSENCE loss for identity mapping: MSE(G(Y), Y))"},"dists_mean":{"default":[0.485,0.456,0.406],"type":"array","items":{"enum":null,"type":"string"},"description":"mean for DISTS perceptual loss"},"dists_std":{"default":[0.229,0.224,0.225],"type":"array","items":{"enum":null,"type":"string"},"description":"std for DISTS perceptual loss"},"flip_equivariance":{"default":false,"type":"boolean","description":"Enforce flip-equivariance as additional regularization. It's used by FastCUT, but not CUT"},"lambda_MSE_idt":{"default":1.0,"type":"number","description":"weight for MSE identity loss: MSE(G(X), X)"},"lambda_NCE":{"default":1.0,"type":"number","description":"weight for NCE loss: NCE(G(X), X)"},"lambda_SRC":{"default":0.0,"type":"number","description":"weight for SRC (semantic relation consistency) loss: NCE(G(X), X)"},"lambda_perceptual":{"default":1.0,"type":"number","description":"weight for LPIPS and DISTS perceptual losses"},"lambda_supervised":{"default":1.0,"type":"number","description":"weight for supervised loss"},"nce_T":{"default":0.07,"type":"number","description":"temperature for NCE loss"},"nce_idt":{"default":true,"type":"boolean","description":"use NCE loss for identity mapping: NCE(G(Y), Y))"},"nce_includes_all_negatives_from_minibatch":{"default":false,"type":"boolean","description":"(used for single image translation) If True, include the negatives from the other samples of the minibatch when computing the contrastive loss. 
Please see models/patchnce.py for more details."},"nce_layers":{"default":"0,4,8,12,16","type":"string","description":"compute NCE loss on which layers"},"nce_loss":{"default":"monce","type":"string","description":"CUT contrastice loss","enum":["patchnce","monce","SRC_hDCE"]},"netF":{"default":"mlp_sample","type":"string","description":"how to downsample the feature map","enum":["sample","mlp_sample","sample_qsattn","mlp_sample_qsattn"]},"netF_dropout":{"default":false,"type":"boolean","description":"whether to use dropout with F"},"netF_nc":{"default":256,"type":"integer","description":""},"netF_norm":{"default":"instance","type":"string","description":"instance normalization or batch normalization for F","enum":["instance","batch","none"]},"num_patches":{"default":256,"type":"integer","description":"number of patches per layer"},"supervised_loss":{"default":[""],"type":"array","items":{"enum":null,"type":"string"},"description":"supervised loss with aligned data","enum":["","MSE","L1","LPIPS","DISTS"]}}},"cyclegan":{"title":"CycleGAN model","type":"object","properties":{"lambda_A":{"default":10.0,"type":"number","description":"weight for cycle loss (A -\\> B -\\> A)"},"lambda_B":{"default":10.0,"type":"number","description":"weight for cycle loss (B -\\> A -\\> B)"},"lambda_identity":{"default":0.5,"type":"number","description":"use identity mapping. Setting lambda_identity other than 0 has an effect of scaling the weight of the identity mapping loss. For example, if the weight of the identity loss should be 10 times smaller than the weight of the reconstruction loss, please set lambda_identity = 0.1"},"rec_noise":{"default":0.0,"type":"number","description":"whether to add noise to reconstruction"}}},"re":{"title":"ReCUT / ReCycleGAN","type":"object","properties":{"P_lr":{"default":0.0002,"type":"number","description":"initial learning rate for P networks"},"adversarial_loss_p":{"default":false,"type":"boolean","description":"if True, also train the prediction model with an adversarial loss"},"netP":{"default":"unet_128","type":"string","description":"specify P architecture","enum":["resnet_9blocks","resnet_6blocks","resnet_attn","unet_256","unet_128"]},"no_train_P_fake_images":{"default":false,"type":"boolean","description":"if True, P wont be trained over fake images projections"},"nuplet_size":{"default":3,"type":"integer","description":"Number of frames loaded"},"projection_threshold":{"default":1.0,"type":"number","description":"threshold of the real images projection loss below with fake projection and fake reconstruction losses are applied"}}},"palette":{"title":"Diffusion model","type":"object","properties":{"ddim_eta":{"default":0.5,"type":"number","description":"eta for ddim sampling variance"},"ddim_num_steps":{"default":10,"type":"integer","description":"number of steps for ddim sampling"},"loss":{"default":"MSE","type":"string","description":"loss type of the denoising model","enum":["L1","MSE","multiscale_L1","multiscale_MSE"]},"minsnr":{"default":false,"type":"boolean","description":"use min-SNR weighting"},"sampling_method":{"default":"ddpm","type":"string","description":"choose the sampling method between ddpm and ddim","enum":["ddpm","ddim"]}}}}},"data":{"title":"Datasets","type":"object","properties":{"crop_size":{"default":256,"type":"integer","description":"then crop to this size"},"dataset_mode":{"default":"unaligned","type":"string","description":"chooses how datasets are 
loaded.","enum":["unaligned","unaligned_labeled_cls","unaligned_labeled_mask","self_supervised_labeled_mask","unaligned_labeled_mask_cls","self_supervised_labeled_mask_cls","unaligned_labeled_mask_online","self_supervised_labeled_mask_online","unaligned_labeled_mask_cls_online","self_supervised_labeled_mask_cls_online","aligned","nuplet_unaligned_labeled_mask","temporal_labeled_mask_online","self_supervised_temporal_labeled_mask_online","self_supervised_temporal","single","unaligned_labeled_mask_ref","self_supervised_labeled_mask_ref","unaligned_labeled_mask_online_ref","unaligned_labeled_mask_online_prompt","self_supervised_labeled_mask_online_ref"]},"direction":{"default":"AtoB","type":"string","description":"AtoB or BtoA","enum":["AtoB","BtoA"]},"image_bits":{"default":8,"type":"integer","description":"number of bits of the image (e.g. 8, 12 or 16)"},"inverted_mask":{"default":false,"type":"boolean","description":"whether to invert the mask, i.e. around the bbox"},"load_size":{"default":286,"type":"integer","description":"scale images to this size"},"max_dataset_size":{"default":1000000000,"type":"integer","description":"Maximum number of samples allowed per dataset. If the dataset directory contains more than max_dataset_size, only a subset is loaded."},"num_threads":{"default":4,"type":"integer","description":"\\# threads for loading data"},"online_context_pixels":{"default":0,"type":"integer","description":"context pixel band around the crop, unused for generation, only for disc "},"online_fixed_mask_size":{"default":-1,"type":"integer","description":"if \\>0, it will be used as fixed bbox size (warning: in dataset resolution ie before resizing) "},"online_random_bbox":{"default":false,"type":"boolean","description":"whether to randomly sample a bbox per online crop"},"online_select_category":{"default":-1,"type":"integer","description":"category to select for bounding boxes, -1 means all boxes selected"},"online_single_bbox":{"default":false,"type":"boolean","description":"whether to only allow a single bbox per online crop"},"preprocess":{"default":"resize_and_crop","type":"string","description":"scaling and cropping of images at load time","enum":["resize_and_crop","crop","scale_width","scale_width_and_crop","none"]},"refined_mask":{"default":false,"type":"boolean","description":"whether to use refined mask with sam"},"relative_paths":{"default":false,"type":"boolean","description":"whether paths to images are relative to dataroot"},"sanitize_paths":{"default":false,"type":"boolean","description":"if true, wrong images or labels paths will be removed before training"},"serial_batches":{"default":false,"type":"boolean","description":"if true, takes images in order to make batches, otherwise takes them randomly"},"temporal_frame_step":{"default":30,"type":"integer","description":"how many frames between successive frames selected"},"temporal_num_common_char":{"default":-1,"type":"integer","description":"how many characters (the first ones) are used to identify a video; if =-1 natural sorting is used "},"temporal_number_frames":{"default":5,"type":"integer","description":"how many successive frames use for temporal loader"},"online_creation":{"title":"Online created datasets","type":"object","properties":{"color_mask_A":{"default":false,"type":"boolean","description":"Perform task of replacing color-filled masks by objects"},"crop_delta_A":{"default":50,"type":"integer","description":"size of crops are random, values allowed are online_creation_crop_size more or less 
online_creation_crop_delta for domain A"},"crop_delta_B":{"default":50,"type":"integer","description":"size of crops are random, values allowed are online_creation_crop_size more or less online_creation_crop_delta for domain B"},"crop_size_A":{"default":512,"type":"integer","description":"crop to this size during online creation, it needs to be greater than bbox size for domain A"},"crop_size_B":{"default":512,"type":"integer","description":"crop to this size during online creation, it needs to be greater than bbox size for domain B"},"load_size_A":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"load to this size during online creation, format : width height or only one size if square"},"load_size_B":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"load to this size during online creation, format : width height or only one size if square"},"mask_delta_A":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"mask offset (in pixels) to allow generation of a bigger object in domain B (for semantic loss) for domain A, format : 'width (x),height (y)' for each class or only one size if square, e.g. '125, 55 100, 100' for 2 classes"},"mask_delta_A_ratio":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask offset to allow generation of a bigger object in domain B (for semantic loss) for domain A, format : width (x),height (y) for each class or only one size if square"},"mask_delta_B":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"mask offset (in pixels) to allow generation of a bigger object in domain A (for semantic loss) for domain B, format : 'width (x),height (y)' for each class or only one size if square, e.g. 
'125, 55 100, 100' for 2 classes"},"mask_delta_B_ratio":{"default":[[]],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask offset to allow generation of a bigger object in domain A (for semantic loss) for domain B, format : 'width (x),height (y)' for each class or only one size if square"},"mask_random_offset_A":{"default":[0.0],"type":"array","items":{"enum":null,"type":"string"},"description":"ratio mask size randomization (only to make bigger one) to robustify the image generation in domain A, format : width (x) height (y) or only one size if square"},"mask_random_offset_B":{"default":[0.0],"type":"array","items":{"enum":null,"type":"string"},"description":"mask size randomization (only to make bigger one) to robustify the image generation in domain B, format : width (y) height (x) or only one size if square"},"mask_square_A":{"default":false,"type":"boolean","description":"whether masks should be squared for domain A"},"mask_square_B":{"default":false,"type":"boolean","description":"whether masks should be squared for domain B"},"rand_mask_A":{"default":false,"type":"boolean","description":"Perform task of replacing noised masks by objects"}}}}},"f_s":{"title":"Semantic segmentation network","type":"object","properties":{"all_classes_as_one":{"default":false,"type":"boolean","description":"if true, all classes will be considered as the same one (ie foreground vs background)"},"class_weights":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"class weights for imbalanced semantic classes"},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for f_s"},"dropout":{"default":false,"type":"boolean","description":"dropout for the semantic network"},"net":{"default":"vgg","type":"string","description":"specify f_s network [vgg|unet|segformer|sam]","enum":["vgg","unet","segformer","sam"]},"nf":{"default":64,"type":"integer","description":"\\# of filters in the first conv layer of classifier"},"semantic_nclasses":{"default":2,"type":"integer","description":"number of classes of the semantic loss classifier"},"semantic_threshold":{"default":1.0,"type":"number","description":"threshold of the semantic classifier loss below with semantic loss is applied"},"weight_sam":{"default":"","type":"string","description":"path to sam weight for f_s, e.g. models/configs/sam/pretrain/sam_vit_b_01ec64.pth, or models/configs/sam/pretrain/mobile_sam.pt for MobileSAM"},"weight_segformer":{"default":"","type":"string","description":"path to segformer weight for f_s, e.g. 
models/configs/segformer/pretrain/segformer_mit-b0.pth"}}},"cls":{"title":"Semantic classification network","type":"object","properties":{"all_classes_as_one":{"default":false,"type":"boolean","description":"if true, all classes will be considered as the same one (ie foreground vs background)"},"class_weights":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"class weights for imbalanced semantic classes"},"config_segformer":{"default":"models/configs/segformer/segformer_config_b0.json","type":"string","description":"path to segformer configuration file for cls"},"dropout":{"default":false,"type":"boolean","description":"dropout for the semantic network"},"net":{"default":"vgg","type":"string","description":"specify cls network [vgg|unet|segformer]","enum":["vgg","unet","segformer"]},"nf":{"default":64,"type":"integer","description":"\\# of filters in the first conv layer of classifier"},"semantic_nclasses":{"default":2,"type":"integer","description":"number of classes of the semantic loss classifier"},"semantic_threshold":{"default":1.0,"type":"number","description":"threshold of the semantic classifier loss below with semantic loss is applied"},"weight_segformer":{"default":"","type":"string","description":"path to segformer weight for cls, e.g. models/configs/segformer/pretrain/segformer_mit-b0.pth"}}},"output":{"title":"Output","type":"object","properties":{"no_html":{"default":false,"type":"boolean","description":"do not save intermediate training results to [opt.checkpoints_dir]/[opt.name]/web/"},"num_images":{"default":20,"type":"integer","description":"number of visualized images results from the train/test set"},"print_freq":{"default":100,"type":"integer","description":"frequency of showing training results on console"},"update_html_freq":{"default":1000,"type":"integer","description":"frequency of saving training results to html"},"verbose":{"default":false,"type":"boolean","description":"if specified, print more debugging information"},"display":{"title":"Visdom display","type":"object","properties":{"G_attention_masks":{"default":false,"type":"boolean","description":""},"aim_port":{"default":53800,"type":"integer","description":"aim port of the web display"},"aim_server":{"default":"http://localhost","type":"string","description":"aim server of the web display"},"diff_fake_real":{"default":false,"type":"boolean","description":"if True x - G(x) is displayed"},"env":{"default":"","type":"string","description":"visdom display environment name (default is \"main\")"},"freq":{"default":400,"type":"integer","description":"frequency of showing training results on screen"},"id":{"default":1,"type":"integer","description":"window id of the web display"},"ncols":{"default":0,"type":"integer","description":"if positive, display all images in a single visdom web panel with certain number of images per row.(if == 0 ncols will be computed automatically)"},"networks":{"default":false,"type":"boolean","description":"Set True if you want to display networks on port 8000"},"type":{"default":["visdom"],"type":"array","items":{"enum":null,"type":"string"},"description":"output display, either visdom, aim or no output","enum":["visdom","aim","none"]},"visdom_autostart":{"default":false,"type":"boolean","description":"whether to start a visdom server automatically"},"visdom_port":{"default":8097,"type":"integer","description":"visdom port of the web display"},"visdom_server":{"default":"http://localhost","type":"string","description":"visdom server of the web 
display"},"winsize":{"default":256,"type":"integer","description":"display window size for both visdom and HTML"}}}}},"model":{"title":"Model","type":"object","properties":{"depth_network":{"default":"DPT_Large","type":"string","description":"specify depth prediction network architecture","enum":["DPT_Large","DPT_Hybrid","MiDaS_small","DPT_BEiT_L_512","DPT_BEiT_L_384","DPT_BEiT_B_384","DPT_SwinV2_L_384","DPT_SwinV2_B_384","DPT_SwinV2_T_256","DPT_Swin_L_384","DPT_Next_ViT_L_384","DPT_LeViT_224"]},"init_gain":{"default":0.02,"type":"number","description":"scaling factor for normal, xavier and orthogonal."},"init_type":{"default":"normal","type":"string","description":"network initialization","enum":["normal","xavier","kaiming","orthogonal"]},"input_nc":{"default":3,"type":"integer","description":"\\# of input image channels: 3 for RGB and 1 for grayscale, more supported"},"multimodal":{"default":false,"type":"boolean","description":"multimodal model with random latent input vector"},"output_nc":{"default":3,"type":"integer","description":"\\# of output image channels: 3 for RGB and 1 for grayscale"},"prior_321_backwardcompatibility":{"default":false,"type":"boolean","description":"whether to load models from previous version of JG."},"type_sam":{"default":"mobile_sam","type":"string","description":"which model to use for segment-anything mask generation","enum":["sam","mobile_sam"]}}},"train":{"title":"Training","type":"object","properties":{"D_accuracy_every":{"default":1000,"type":"integer","description":"compute D accuracy every N iterations"},"D_lr":{"default":0.0001,"type":"number","description":"discriminator separate learning rate"},"G_ema":{"default":false,"type":"boolean","description":"whether to build G via exponential moving average"},"G_ema_beta":{"default":0.999,"type":"number","description":"exponential decay for ema"},"G_lr":{"default":0.0002,"type":"number","description":"initial learning rate for generator"},"batch_size":{"default":1,"type":"integer","description":"input batch size"},"beta1":{"default":0.9,"type":"number","description":"momentum term of adam"},"beta2":{"default":0.999,"type":"number","description":"momentum term of adam"},"cls_l1_regression":{"default":false,"type":"boolean","description":"if true l1 loss will be used to compute regressor loss"},"cls_regression":{"default":false,"type":"boolean","description":"if true cls will be a regressor and not a classifier"},"compute_D_accuracy":{"default":false,"type":"boolean","description":"whether to compute D accuracy explicitely"},"compute_metrics_test":{"default":false,"type":"boolean","description":"whether to compute test metrics, e.g. FID, ..."},"continue":{"default":false,"type":"boolean","description":"continue training: load the latest model"},"epoch":{"default":"latest","type":"string","description":"which epoch to load? set to latest to use latest cached model"},"epoch_count":{"default":1,"type":"integer","description":"the starting epoch count, we save the model by \\, \\+\\, ..."},"export_jit":{"default":false,"type":"boolean","description":"whether to export model in jit format"},"feat_wavelet":{"default":false,"type":"boolean","description":"if true, train in wavelet features space (Note: this may not include all discriminators, when training GANs)"},"gan_mode":{"default":"lsgan","type":"string","description":"the type of GAN objective. 
vanilla GAN loss is the cross-entropy objective used in the original GAN paper.","enum":["vanilla","lsgan","wgangp","projected"]},"iter_size":{"default":1,"type":"integer","description":"backward will be apllied each iter_size iterations, it simulate a greater batch size : its value is batch_size\\*iter_size"},"load_iter":{"default":0,"type":"integer","description":"which iteration to load? if load_iter \\> 0, the code will load models by iter_[load_iter]; otherwise, the code will load models by [epoch]"},"lr_decay_iters":{"default":50,"type":"integer","description":"multiply by a gamma every lr_decay_iters iterations"},"lr_policy":{"default":"linear","type":"string","description":"learning rate policy.","enum":["linear","step","multistep","plateau","cosine"]},"lr_steps":{"default":[],"type":"array","items":{"enum":null,"type":"string"},"description":"number of epochs between reductions of the learning rate by gamma=0.1"},"metrics_every":{"default":1000,"type":"integer","description":"compute metrics every N iterations"},"metrics_list":{"default":["FID"],"type":"array","items":{"enum":null,"type":"string"},"description":"metrics on results quality to compute","enum":["FID","KID","MSID","PSNR","LPIPS","SSIM"]},"metrics_save_images":{"default":false,"type":"boolean","description":"whether to save images that result form metrics computation"},"mm_lambda_z":{"default":0.5,"type":"number","description":"weight for random z loss"},"mm_nz":{"default":8,"type":"integer","description":"number of latent vectors"},"n_epochs":{"default":100,"type":"integer","description":"number of epochs with the initial learning rate"},"n_epochs_decay":{"default":0,"type":"integer","description":"number of epochs to linearly decay learning rate to zero"},"nb_img_max_fid":{"default":1000000000,"type":"integer","description":"Maximum number of samples allowed per dataset to compute fid. 
If the dataset directory contains more than nb_img_max_fid, only a subset is used."},"optim":{"default":"adam","type":"string","description":"optimizer (adam, radam, adamw, ...)","enum":["adam","radam","adamw","lion","adam8bit"]},"optim_eps":{"default":1e-08,"type":"number","description":"epsilon for optimizer"},"optim_weight_decay":{"default":0.0,"type":"number","description":"weight decay for optimizer"},"pool_size":{"default":50,"type":"integer","description":"the size of image buffer that stores previously generated images"},"save_by_iter":{"default":false,"type":"boolean","description":"whether saves model by iteration"},"save_epoch_freq":{"default":1,"type":"integer","description":"frequency of saving checkpoints at the end of epochs"},"save_latest_freq":{"default":5000,"type":"integer","description":"frequency of saving the latest results"},"semantic_cls":{"default":false,"type":"boolean","description":"if true semantic class losses will be used"},"semantic_mask":{"default":false,"type":"boolean","description":"if true semantic mask losses will be used"},"temporal_criterion":{"default":false,"type":"boolean","description":"if true, MSE loss will be computed between successive frames"},"temporal_criterion_lambda":{"default":1.0,"type":"number","description":"lambda for MSE loss that will be computed between successive frames"},"use_contrastive_loss_D":{"default":false,"type":"boolean","description":""},"sem":{"title":"Semantic training","type":"object","properties":{"cls_B":{"default":false,"type":"boolean","description":"if true cls will be trained not only on domain A but also on domain B"},"cls_lambda":{"default":1.0,"type":"number","description":"weight for semantic class loss"},"cls_pretrained":{"default":false,"type":"boolean","description":"whether to use a pretrained model, available for non \"basic\" model only"},"cls_template":{"default":"basic","type":"string","description":"classifier/regressor model type, from torchvision (resnet18, ...), default is custom simple model"},"idt":{"default":false,"type":"boolean","description":"if true apply semantic loss on identity"},"lr_cls":{"default":0.0002,"type":"number","description":"cls learning rate"},"lr_f_s":{"default":0.0002,"type":"number","description":"f_s learning rate"},"mask_lambda":{"default":1.0,"type":"number","description":"weight for semantic mask loss"},"net_output":{"default":false,"type":"boolean","description":"if true apply generator semantic loss on network output for real image rather than on label."},"use_label_B":{"default":false,"type":"boolean","description":"if true domain B has labels too"}}},"mask":{"title":"Semantic training with masks","type":"object","properties":{"charbonnier_eps":{"default":1e-06,"type":"number","description":"Charbonnier loss epsilon value"},"compute_miou":{"default":false,"type":"boolean","description":"whether to compute mIoU on semantic masks prediction"},"disjoint_f_s":{"default":false,"type":"boolean","description":"whether to use a disjoint f_s with the same exact structure"},"f_s_B":{"default":false,"type":"boolean","description":"if true f_s will be trained not only on domain A but also on domain B"},"for_removal":{"default":false,"type":"boolean","description":"if true, object removal mode, domain B images with label 0, cut models only"},"lambda_out_mask":{"default":10.0,"type":"number","description":"weight for loss out mask"},"loss_out_mask":{"default":"L1","type":"string","description":"loss for out mask content (which should not 
change).","enum":["L1","MSE","Charbonnier"]},"miou_every":{"default":1000,"type":"integer","description":"compute mIoU every n iterations"},"no_train_f_s_A":{"default":false,"type":"boolean","description":"if true f_s wont be trained on domain A"},"out_mask":{"default":false,"type":"boolean","description":"use loss out mask"}}}}},"dataaug":{"title":"Data augmentation","type":"object","properties":{"APA":{"default":false,"type":"boolean","description":"if true, G will be used as augmentation during D training adaptively to D overfitting between real and fake images"},"APA_every":{"default":4,"type":"integer","description":"How often to perform APA adjustment?"},"APA_nimg":{"default":50,"type":"integer","description":"APA adjustment speed, measured in how many images it takes for p to increase/decrease by one unit."},"APA_p":{"default":0,"type":"integer","description":"initial value of probability APA"},"APA_target":{"default":0.6,"type":"number","description":""},"D_diffusion":{"default":false,"type":"boolean","description":"whether to apply diffusion noise augmentation to discriminator inputs, projected discriminator only"},"D_diffusion_every":{"default":4,"type":"integer","description":"How often to perform diffusion augmentation adjustment"},"D_label_smooth":{"default":false,"type":"boolean","description":"whether to use one-sided label smoothing with discriminator"},"D_noise":{"default":0.0,"type":"number","description":"whether to add instance noise to discriminator inputs"},"affine":{"default":0.0,"type":"number","description":"if specified, apply random affine transforms to the images for data augmentation"},"affine_scale_max":{"default":1.2,"type":"number","description":"if random affine specified, max scale range value"},"affine_scale_min":{"default":0.8,"type":"number","description":"if random affine specified, min scale range value"},"affine_shear":{"default":45,"type":"integer","description":"if random affine specified, shear range (0,value)"},"affine_translate":{"default":0.2,"type":"number","description":"if random affine specified, translation range (-value\\*img_size,+value\\*img_size) value"},"diff_aug_policy":{"default":"","type":"string","description":"choose the augmentation policy : color randaffine randperspective. If you want more than one, please write them separated by a comma with no space (e.g. color,randaffine)"},"diff_aug_proba":{"default":0.5,"type":"number","description":"proba of using each transformation"},"flip":{"default":"horizontal","type":"string","description":"if specified, flip the images for data augmentation, possible values: none, horizontal, vertical, both","enum":["none","horizontal","vertical","both"]},"imgaug":{"default":false,"type":"boolean","description":"whether to apply random image augmentation"},"no_rotate":{"default":false,"type":"boolean","description":"if specified, do not rotate the images for data augmentation"}}}}}},"definitions":{"ServerTrainOptions":{"title":"ServerTrainOptions","type":"object","properties":{"sync":{"title":"Sync","description":"if false, the call returns immediately and train process is executed in the background. If true, the call returns only when training process is finished","default":false,"type":"boolean"}}}}}}},"definitions":{"ServerTrainOptions":{"title":"ServerTrainOptions","type":"object","properties":{"sync":{"title":"Sync","description":"if false, the call returns immediately and train process is executed in the background. 
If true, the call returns only when training process is finished","default":false,"type":"boolean"}}}}} \ No newline at end of file
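For orientation, the TrainOptions schema above groups the training parameters into nested option objects ("G", "data", "train", "dataaug", "model", "output", "f_s", "cls", "alg"), each property carrying a default value and a help-style description. The sketch below is only an illustration assembled from defaults that appear in the schema: the choice of exactly these fields, and building the payload in Python rather than any other way, are assumptions made for readability, not a recommended or tested configuration.

# Minimal sketch of a nested training-options payload, using only field names
# and default values documented in the schema above; illustrative only.
import json

train_options = {
    "G": {
        "netG": "mobile_resnet_attn",  # generator architecture (schema default)
        "padding_type": "reflect",     # generator padding mode (schema default)
    },
    "data": {
        "dataset_mode": "unaligned",   # dataset loading mode (schema default)
        "load_size": 286,              # scale images to this size
        "crop_size": 256,              # then crop to this size
    },
    "train": {
        "batch_size": 1,               # input batch size
        "G_lr": 0.0002,                # initial generator learning rate
        "n_epochs": 100,               # epochs at the initial learning rate
        "optim": "adam",               # optimizer (schema default)
    },
}

print(json.dumps(train_options, indent=2))

Options omitted from such a payload would presumably fall back to the defaults recorded in the schema, which is how the per-field "default" entries above are meant to be read.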