-
Notifications
You must be signed in to change notification settings - Fork 0
/
viton.html
277 lines (227 loc) · 14.7 KB
/
viton.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
<!DOCTYPE html>
<html lang="en">
<title>VITON</title>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-21408087-2"></script>
<script>
// Standard Google Analytics (gtag.js) bootstrap snippet.
// Ensure the shared command queue exists before the async gtag.js script loads.
window.dataLayer = window.dataLayer || [];
// NOTE: gtag must push the Arguments object itself (not an array) —
// gtag.js distinguishes Arguments instances when draining the queue,
// so this function body must stay exactly as Google documents it.
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'UA-21408087-2');
</script>
<meta charset="utf-8">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.4.0/css/bootstrap.min.css" crossorigin="anonymous">
<link rel="stylesheet" href="css/style.css">
<link rel="preconnect" href="https://fonts.gstatic.com">
<style>
/* Fluid images: scale to the width of the containing column.
   The original rule contained `max-width: <line-width>;` — an unreplaced
   template placeholder, which is invalid CSS and was silently dropped by
   the parser. `max-width: 100%` states the effective behavior explicitly. */
.img-responsive {
  display: block;
  width: 100%;
  max-width: 100%;
  height: auto;
}
</style>
<style>
/* Inlined copy of the Google Fonts stylesheet for Roboto (weights 300
   and 400), split into per-script @font-face subsets (greek-ext, greek,
   latin-ext, latin). The unicode-range descriptors let the browser
   download only the subsets whose characters actually appear on the
   page; font-display: swap shows fallback text until each file loads. */
/* greek-ext */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 300;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOlCnqEu92Fr1MmSU5fCBc4EsA.woff2) format('woff2');
unicode-range: U+1F00-1FFF;
}
/* greek */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 300;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOlCnqEu92Fr1MmSU5fBxc4EsA.woff2) format('woff2');
unicode-range: U+0370-03FF;
}
/* latin-ext */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 300;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOlCnqEu92Fr1MmSU5fChc4EsA.woff2) format('woff2');
unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
}
/* latin */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 300;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOlCnqEu92Fr1MmSU5fBBc4.woff2) format('woff2');
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
}
/* greek-ext */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOmCnqEu92Fr1Mu7mxKOzY.woff2) format('woff2');
unicode-range: U+1F00-1FFF;
}
/* greek */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOmCnqEu92Fr1Mu4WxKOzY.woff2) format('woff2');
unicode-range: U+0370-03FF;
}
/* latin-ext */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOmCnqEu92Fr1Mu7GxKOzY.woff2) format('woff2');
unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
}
/* latin */
@font-face {
font-family: 'Roboto';
font-style: normal;
font-weight: 400;
font-display: swap;
src: url(https://fonts.gstatic.com/s/roboto/v20/KFOmCnqEu92Fr1Mu4mxK.woff2) format('woff2');
unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
}
</style>
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<body>
<div class="container">
<div class="row mb-2 mt-4" id="paper-title">
<h2 class="col-md-12 text-center">
Virtual Cloth Try-on based on Segment Anything and Conditional Generative Models
</h2>
<h2 class="col-md-12 text-center">
</h2>
<h3 class="col-md-12 text-center">
<small>Course Project of CS496 Deep Generative Models</small>
</h3>
</div>
<div class="row" id="authors">
<div class="mx-auto text-center">
<ul class="list-inline mb-0">
<li class="list-inline-item">
<a target="_blank" href="http://mingfuliang.com">Mingfu Liang</a>
<sup>1</sup>
</li>
<li class="list-inline-item">
<a target="_blank" href="https://ukaukaaaa.github.io/">Bin Wang</a>
<sup>1</sup>
</li>
</ul>
<ul class="list-inline mb-0" id="institution">
<li class="list-inline-item">
<sup>1</sup>
Northwestern University
</li>
</ul>
<ul class="list-inline mb-0" id="institution">
<li class="list-inline-item">
</li>
</ul>
<ul class="list-inline mb-0" id="institution">
<li class="list-inline-item">
Course Advisor: Bryan Pardo
</li>
</ul>
</div>
</div>
<!-- <div class="row mb-3 pt-2">
<div class="col-md-8 mx-auto">
<div class="embed-responsive embed-responsive-16by9 pb-3">
<iframe class="embed-responsive-item" src="https://www.youtube.com/embed/KCDd7UFO1d0" frameborder="0" allow="accelerometer; autoplay muted; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</div>
<p class="text-justify mt-2 pt-3">
We propose a view-dependent sparse voxel model, Plenoxel <em>(plenoptic volume element)</em>, that can optimize to the same fidelity as <a href="http://tancik.com/nerf">Neural Radiance Fields (NeRFs)</a>
without any neural networks. Our typical optimization time is 11 minutes on a single GPU, a speedup of two orders of magnitude compared to NeRF.
</p>
</div>
</div> -->
<div class="row mb-3 pt-2">
<div class="col-md-8 mx-auto">
<p class="text-justify">
Virtual try-on gives us a flexible way to try the clothes we desire without going into a shop in person.
However, existing try-on applications require the user to passively choose from a pre-defined set of clothes, which is time-consuming, especially when we already have a desired piece of clothing in mind.
Thus, in this project we want to enable an interactive, personalized system for virtual try-on, i.e., let the user actively indicate their intent by providing either text guidance, e.g., a description of the cloth of interest, or image guidance, i.e., an image of the cloth they desire.
</p>
<img class="img-responsive" src="./projects/piton/framework.png" alt="Overview of the proposed two-stage virtual try-on framework">
<p class="text-justify">
To do so, we propose the following framework. The user first uploads a source image accompanied by a text prompt or a cloth image prompt.
In Stage 1, we pre-process both the text and the images to extract essential guidance information: we use the CLIP text encoder to encode the text, and for the images we use the Segment Anything model to extract the primitives of the images, for example the segmentation mask of the cloth and the boundary map of the human body.
Then in Stage 2, we correspondingly perform either text-guided generation or cloth image-guided generation.
</p>
<h4>Extend ControlNet to Finetune the Stable Diffusion for Text-guided VITON
</h4>
<img class="img-responsive" src="./projects/piton/text.png" alt="Architecture of the ControlNet-based text-guided try-on model">
<p class="text-justify">
We now introduce how we extend the stable diffusion model for virtual try-on via ControlNet. We freeze the pretrained autoencoder and the stable diffusion model released by Stability AI.
As mentioned before, we have a source image, which SAM preprocesses to return a cloth mask and a boundary map.
The ControlNet is a copy of the SD encoder that takes the boundary map as input to constrain the generation. The text prompt is fed to the text encoder and conditions the whole generation process in stable diffusion.
</p>
<h4>Qualitative Results of Text-guided VITON</h4>
<img class="img-responsive" src="./projects/piton/textresults.png" alt="Qualitative results of text-guided virtual try-on">
<h4>Explored Image-guided VITON </h4>
<img class="img-responsive" src="./projects/piton/image.png" alt="Architecture of the explored image-guided try-on approach">
<p class="text-justify">
Originally we have a text prompt here, and we input the edge map into the ControlNet to control the image generation.
At first, we thought it would be natural to replace the edge map with the cloth item image and leave the text prompt empty, so that the model would be guided to generate the cloth on the human body.
After more than a week of trial and error, we found that it is hard to enforce pixel-wise consistency by only manipulating the latent space during generation for image-guided VITON, because the image is first passed into the encoder and the alignment between the cloth and the source image is done on the encoded features. This is not pixel-wise, and we cannot directly align the cloth with the human body in image space.
Therefore, we instead consider a conditional GAN model and directly perform the generation starting from the image space.
Here we first use the Segment Anything model to generate a region mask where we want to generate the new cloth, then we take the mask as the conditioning image of the generator and run the image generator.
This is better because the image and the cloth are not passed into an encoder, so they can be aligned directly in image space, and there is pixel-wise supervision to make the final prediction look more natural.
</p>
<h4>An Interactive Demo for Proof-of-Concept</h4>
<img class="img-responsive" src="./projects/piton/video.gif" alt="Animated demo of the interactive virtual try-on system">
<h4>Qualitative Results of Image-guided VITON</h4>
<img class="img-responsive" src="./projects/piton/imageresults.png" alt="Qualitative results of image-guided virtual try-on">
</div>
</div>
</div> <!-- container -->
<script>
window.mobileAndTabletCheck = function() {
let check = false;
(function(a) {
if (/(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino|android|ipad|playbook|silk/i.test(a) || /1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-/i.test(a.substr(0, 4))) check = 
true;
})(navigator.userAgent || navigator.vendor || window.opera);
return check;
};
if (window.location.host.indexOf('alexyu.net') > -1 && window.location.protocol != "https:") {
// Force HTTPS
window.location.protocol = "https";
}
if (mobileAndTabletCheck()) {
document.getElementById('demo-warning').style.display = 'block';
document.getElementById('demo-container').style.display = 'none';
document.getElementById('demo-warning').innerHTML = "Unfortunately, mobile and tablet devices are not currently supported due to WebGL compatibility issues. We hope to support this in the future.";
} else {
var canvas = document.createElement('canvas');
var gl = canvas.getContext('webgl');
var tex_limit = gl.getParameter(gl.MAX_TEXTURE_SIZE);
if (gl && gl instanceof WebGLRenderingContext) {
const REQUIRED_TEX_LIMIT = 8192;
if (tex_limit < REQUIRED_TEX_LIMIT) {
document.getElementById('demo-warning').style.display = 'block';
document.getElementById('demo-container').style.display = 'none';
document.getElementById('demo-warning').innerHTML = "Your GPU's maximum texture size is: " + tex_limit + " which is less than the minimum required (" + REQUIRED_TEX_LIMIT + "). Please try another device, if possible.";
}
} else {
document.getElementById('demo-warning').style.display = 'block';
document.getElementById('demo-container').style.display = 'none';
document.getElementById('demo-warning').innerHTML = "Your browser does not support WebGL, or WebGL was disabled. Please use a modern browser like Chrome or Firefox.";
}
}
</script>
</body>