% reference.bib — curated bibliography on text-to-video and image-to-video generation.
% Author lists use the form {First Last and First Last and ...}; comments use `%`.
% ----------------TEXT-TO-VIDEO GENERATION---------------------
@misc{yan-etal-2021-videogpt,
      title={{VideoGPT: Video Generation using VQ-VAE and Transformers}},
      author={Wilson Yan and Yunzhi Zhang and Pieter Abbeel and Aravind Srinivas},
      year={2021},
      eprint={2104.10157},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@inproceedings{han-etal-2022-show,
  author    = {Ligong Han and
               Jian Ren and
               Hsin{-}Ying Lee and
               Francesco Barbieri and
               Kyle Olszewski and
               Shervin Minaee and
               Dimitris N. Metaxas and
               Sergey Tulyakov},
  title     = {{Show Me What and Tell Me How: Video Synthesis via Multimodal Conditioning}},
  booktitle = {CVPR},
  pages     = {3605--3615},
  year      = {2022},
}
@article{an-etal-2023-latentshift,
  author  = {Jie An and
             Songyang Zhang and
             Harry Yang and
             Sonal Gupta and
             Jia{-}Bin Huang and
             Jiebo Luo and
             Xi Yin},
  title   = {Latent-Shift: Latent Diffusion with Temporal Shift for Efficient Text-to-Video Generation},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{blattmann-etal-2023-svd,
  author  = {Andreas Blattmann and
             Tim Dockhorn and
             Sumith Kulal and
             Daniel Mendelevitch and
             Maciej Kilian and
             Dominik Lorenz and
             Yam Levi and
             Zion English and
             Vikram Voleti and
             Adam Letts and
             Varun Jampani and
             Robin Rombach},
  title   = {Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{chen-etal-2023-control,
  author  = {Weifeng Chen and
             Jie Wu and
             Pan Xie and
             Hefeng Wu and
             Jiashi Li and
             Xin Xia and
             Xuefeng Xiao and
             Liang Lin},
  title   = {Control-A-Video: Controllable Text-to-Video Generation with Diffusion Models},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{esser-etal-2023-structure,
  author  = {Patrick Esser and
             Johnathan Chiu and
             Parmida Atighehchian and
             Jonathan Granskog and
             Anastasis Germanidis},
  title   = {Structure and Content-Guided Video Synthesis with Diffusion Models},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2022
@article{ho-etal-2023-imagen,
  author  = {Jonathan Ho and
             William Chan and
             Chitwan Saharia and
             Jay Whang and
             Ruiqi Gao and
             Alexey A. Gritsenko and
             Diederik P. Kingma and
             Ben Poole and
             Mohammad Norouzi and
             David J. Fleet and
             Tim Salimans},
  title   = {{Imagen Video: High Definition Video Generation with Diffusion Models}},
  journal = {CoRR},
  year    = {2022},
}
% arXiv 2023
@article{khachatryan-etal-2023-text2video,
  author  = {Levon Khachatryan and
             Andranik Movsisyan and
             Vahram Tadevosyan and
             Roberto Henschel and
             Zhangyang Wang and
             Shant Navasardyan and
             Humphrey Shi},
  title   = {{Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators}},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{li-etal-2023-videogen,
  author  = {Xin Li and
             Wenqing Chu and
             Ye Wu and
             Weihang Yuan and
             Fanglong Liu and
             Qi Zhang and
             Fu Li and
             Haocheng Feng and
             Errui Ding and
             Jingdong Wang},
  title   = {{VideoGen: {A} Reference-Guided Latent Diffusion Approach for High Definition Text-to-Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{ge-etal-2023-noise,
  author  = {Songwei Ge and
             Seungjun Nah and
             Guilin Liu and
             Tyler Poon and
             Andrew Tao and
             Bryan Catanzaro and
             David Jacobs and
             Jia{-}Bin Huang and
             Ming{-}Yu Liu and
             Yogesh Balaji},
  title   = {{Preserve Your Own Correlation: {A} Noise Prior for Video Diffusion Models}},
  journal = {CoRR},
  year    = {2023},
}
@inproceedings{wang-etal-2023-videocomposer,
author = {Xiang Wang and
Hangjie Yuan and
Shiwei Zhang and
Dayou Chen and
Jiuniu Wang and
Yingya Zhang and
Yujun Shen and
Deli Zhao and
Jingren Zhou},
title = {{VideoComposer: Compositional Video Synthesis with Motion Controllability}},
booktitle = {NeurIPS},
year = {2023}
}
% arXiv 2023
@article{wang-etal-2023-lavie,
  author  = {Yaohui Wang and
             Xinyuan Chen and
             Xin Ma and
             Shangchen Zhou and
             Ziqi Huang and
             Yi Wang and
             Ceyuan Yang and
             Yinan He and
             Jiashuo Yu and
             Peiqing Yang and
             Yuwei Guo and
             Tianxing Wu and
             Chenyang Si and
             Yuming Jiang and
             Cunjian Chen and
             Chen Change Loy and
             Bo Dai and
             Dahua Lin and
             Yu Qiao and
             Ziwei Liu},
  title   = {{{LAVIE:} High-Quality Video Generation with Cascaded Latent Diffusion Models}},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{wang-etal-2023-videofactory,
  author  = {Wenjing Wang and
             Huan Yang and
             Zixi Tuo and
             Huiguo He and
             Junchen Zhu and
             Jianlong Fu and
             Jiaying Liu},
  title   = {{VideoFactory: Swap Attention in Spatiotemporal Diffusions for Text-to-Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{zhang-etal-2023-controlvideo,
  author  = {Yabo Zhang and
             Yuxiang Wei and
             Dongsheng Jiang and
             Xiaopeng Zhang and
             Wangmeng Zuo and
             Qi Tian},
  title   = {{ControlVideo: Training-free Controllable Text-to-Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
% arXiv 2023
@article{zhang-etal-2023-show,
  author  = {David Junhao Zhang and
             Jay Zhangjie Wu and
             Jia{-}Wei Liu and
             Rui Zhao and
             Lingmin Ran and
             Yuchao Gu and
             Difei Gao and
             Mike Zheng Shou},
  title   = {{Show-1: Marrying Pixel and Latent Diffusion Models for Text-to-Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
% CVPR 2023
@inproceedings{blattmann-etal-2023-align,
  author    = {Andreas Blattmann and
               Robin Rombach and
               Huan Ling and
               Tim Dockhorn and
               Seung Wook Kim and
               Sanja Fidler and
               Karsten Kreis},
  title     = {{Align Your Latents: High-Resolution Video Synthesis with Latent Diffusion Models}},
  booktitle = {CVPR},
  pages     = {22563--22575},
  year      = {2023},
}
% CVPR 2023
@inproceedings{yu-etal-2023-video,
  author    = {Sihyun Yu and
               Kihyuk Sohn and
               Subin Kim and
               Jinwoo Shin},
  title     = {{Video Probabilistic Diffusion Models in Projected Latent Space}},
  booktitle = {CVPR},
  pages     = {18456--18466},
  year      = {2023},
}
% NeurIPS 2022
@inproceedings{ho-etal-2023-video,
  author    = {Jonathan Ho and
               Tim Salimans and
               Alexey A. Gritsenko and
               William Chan and
               Mohammad Norouzi and
               David J. Fleet},
  title     = {{Video Diffusion Models}},
  booktitle = {NeurIPS},
  year      = {2022},
}
% ICLR 2023
@inproceedings{hong-etal-2023-cogvideo,
  title     = {{CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers}},
  author    = {Wenyi Hong and Ming Ding and Wendi Zheng and Xinghan Liu and Jie Tang},
  booktitle = {ICLR},
  year      = {2023},
  pages     = {1--24}
}
% ICLR 2023
@inproceedings{singer-etal-2023-make,
  author    = {Uriel Singer and
               Adam Polyak and
               Thomas Hayes and
               Xi Yin and
               Jie An and
               Songyang Zhang and
               Qiyuan Hu and
               Harry Yang and
               Oron Ashual and
               Oran Gafni and
               Devi Parikh and
               Sonal Gupta and
               Yaniv Taigman},
  title     = {Make-A-Video: Text-to-Video Generation without Text-Video Data},
  booktitle = {ICLR},
  year      = {2023},
  pages     = {1--13},
}
% ICLR 2023
@inproceedings{villegas-etal-2023-phenaki,
  author    = {Ruben Villegas and
               Mohammad Babaeizadeh and
               Pieter{-}Jan Kindermans and
               Hernan Moraldo and
               Han Zhang and
               Mohammad Taghi Saffar and
               Santiago Castro and
               Julius Kunze and
               Dumitru Erhan},
  title     = {Phenaki: Variable Length Video Generation from Open Domain Textual Descriptions},
  booktitle = {ICLR},
  year      = {2023},
  pages     = {1--14},
}
@misc{girdhar-etal-2023-emu,
      title={{Emu Video: Factorizing Text-to-Video Generation by Explicit Image Conditioning}},
      author={Rohit Girdhar and Mannat Singh and Andrew Brown and Quentin Duval and Samaneh Azadi and Sai Saketh Rambhatla and Akbar Shah and Xi Yin and Devi Parikh and Ishan Misra},
      year={2023},
      eprint={2311.10709},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{yuan-etal-2023-instructvideo,
      title={{InstructVideo: Instructing Video Diffusion Models with Human Feedback}},
      author={Hangjie Yuan and Shiwei Zhang and Xiang Wang and Yujie Wei and Tao Feng and Yining Pan and Yingya Zhang and Ziwei Liu and Samuel Albanie and Dong Ni},
      year={2023},
      eprint={2312.12490},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{chen-etal-2023-seine,
      title={{SEINE: Short-to-Long Video Diffusion Model for Generative Transition and Prediction}},
      author={Xinyuan Chen and Yaohui Wang and Lingjun Zhang and Shaobin Zhuang and Xin Ma and Jiashuo Yu and Yali Wang and Dahua Lin and Yu Qiao and Ziwei Liu},
      year={2023},
      eprint={2310.20700},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{wang-etal-2023-videolcm,
      title={{VideoLCM: Video Latent Consistency Model}},
      author={Xiang Wang and Shiwei Zhang and Han Zhang and Yu Liu and Yingya Zhang and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.09109},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{wang-etal-2023-modelscopet2v,
      title={{ModelScope Text-to-Video Technical Report}},
      author={Jiuniu Wang and Hangjie Yuan and Dayou Chen and Yingya Zhang and Xiang Wang and Shiwei Zhang},
      year={2023},
      eprint={2308.06571},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{xing-etal-2023-vidiff,
      title={{VIDiff: Translating Videos via Multi-Modal Instructions with Diffusion Models}},
      author={Zhen Xing and Qi Dai and Zihao Zhang and Hui Zhang and Han Hu and Zuxuan Wu and Yu-Gang Jiang},
      year={2023},
      eprint={2311.18837},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@article{wu-etal-2023-lamp,
  author  = {Ruiqi Wu and
             Liangyu Chen and
             Tong Yang and
             Chunle Guo and
             Chongyi Li and
             Xiangyu Zhang},
  title   = {{{LAMP:} Learn {A} Motion Pattern for Few-Shot-Based Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
@inproceedings{du-etal-2023-learning,
author = {Yilun Du and
Sherry Yang and
Bo Dai and
Hanjun Dai and
Ofir Nachum and
Josh Tenenbaum and
Dale Schuurmans and
Pieter Abbeel},
title = {{Learning Universal Policies via Text-Guided Video Generation}},
booktitle = {NeurIPS},
year = {2023}
}
@misc{bartal-etal-2024-lumiere,
      title={{Lumiere: A Space-Time Diffusion Model for Video Generation}},
      author={Omer Bar-Tal and Hila Chefer and Omer Tov and Charles Herrmann and Roni Paiss and Shiran Zada and Ariel Ephrat and Junhwa Hur and Guanghui Liu and Amit Raj and Yuanzhen Li and Michael Rubinstein and Tomer Michaeli and Oliver Wang and Deqing Sun and Tali Dekel and Inbar Mosseri},
      year={2024},
      eprint={2401.12945},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{wang-etal-2024-boximator,
      title={{Boximator: Generating Rich and Controllable Motions for Video Synthesis}},
      author={Jiawei Wang and Yuchen Zhang and Jiaxin Zou and Yan Zeng and Guoqiang Wei and Liping Yuan and Hang Li},
      year={2024},
      eprint={2402.01566},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{liu-etal-2024-world,
      title={{World Model on Million-Length Video And Language With RingAttention}},
      author={Hao Liu and Wilson Yan and Matei Zaharia and Pieter Abbeel},
      year={2024},
      eprint={2402.08268},
      archivePrefix={arXiv},
      primaryClass={cs.LG}
}
@misc{yang-etal-2024-directavideo,
      title={{Direct-a-Video: Customized Video Generation with User-Directed Camera Movement and Object Motion}},
      author={Shiyuan Yang and Liang Hou and Haibin Huang and Chongyang Ma and Pengfei Wan and Di Zhang and Xiaodong Chen and Jing Liao},
      year={2024},
      eprint={2402.03162},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{zhuang-etal-2024-vlogger,
      title={{Vlogger: Make Your Dream A Vlog}},
      author={Shaobin Zhuang and Kunchang Li and Xinyuan Chen and Yaohui Wang and Ziwei Liu and Yu Qiao and Yali Wang},
      year={2024},
      eprint={2401.09414},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zeng-etal-2023-make,
      title={{Make Pixels Dance: High-Dynamic Video Generation}},
      author={Yan Zeng and Guoqiang Wei and Jiani Zheng and Jiaxin Zou and Yang Wei and Yuchen Zhang and Hang Li},
      year={2023},
      eprint={2311.10982},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{qing-etal-2023-hierarchical,
      title={{Hierarchical Spatio-temporal Decoupling for Text-to-Video Generation}},
      author={Zhiwu Qing and Shiwei Zhang and Jiayu Wang and Xiang Wang and Yujie Wei and Yingya Zhang and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.04483},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{chen2023gentron,
      title={{GenTron: Delving Deep into Diffusion Transformers for Image and Video Generation}},
      author={Shoufa Chen and Mengmeng Xu and Jiawei Ren and Yuren Cong and Sen He and Yanping Xie and Animesh Sinha and Ping Luo and Tao Xiang and Juan-Manuel Perez-Rua},
      year={2023},
      eprint={2312.04557},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@article{xing-etal-2023-simda,
  author  = {Zhen Xing and
             Qi Dai and
             Han Hu and
             Zuxuan Wu and
             Yu{-}Gang Jiang},
  title   = {{SimDA: Simple Diffusion Adapter for Efficient Video Generation}},
  journal = {CoRR},
  year    = {2023},
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2023-microcinema,
      title={{MicroCinema: A Divide-and-Conquer Approach for Text-to-Video Generation}},
      author={Yanhui Wang and Jianmin Bao and Wenming Weng and Ruoyu Feng and Dacheng Yin and Tao Yang and Jingxu Zhang and Qi Dai and Zhiyuan Zhao and Chunyu Wang and Kai Qiu and Yuhui Yuan and Chuanxin Tang and Xiaoyan Sun and Chong Luo and Baining Guo},
      year={2023},
      eprint={2311.18829},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{cai-etal-2023-generativerendering,
      title={{Generative Rendering: Controllable 4D-Guided Video Generation with 2D Diffusion Models}},
      author={Shengqu Cai and Duygu Ceylan and Matheus Gadelha and Chun-Hao Paul Huang and Tuanfeng Yang Wang and Gordon Wetzstein},
      year={2023},
      eprint={2312.01409},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{jain-etal-2023-peekaboo,
      title={{PEEKABOO: Interactive Video Generation via Masked-Diffusion}},
      author={Yash Jain and Anshul Nasery and Vibhav Vineet and Harkirat Behl},
      year={2023},
      eprint={2312.07509},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{liu-etal-2023-evalcrafter,
      title={{EvalCrafter: Benchmarking and Evaluating Large Video Generation Models}},
      author={Yaofang Liu and Xiaodong Cun and Xuebo Liu and Xintao Wang and Yong Zhang and Haoxin Chen and Yang Liu and Tieyong Zeng and Raymond Chan and Ying Shan},
      year={2023},
      eprint={2310.11440},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{wang-etal-2023-recipe,
      title={{A Recipe for Scaling up Text-to-Video Generation with Text-free Videos}},
      author={Xiang Wang and Shiwei Zhang and Hangjie Yuan and Zhiwu Qing and Biao Gong and Yingya Zhang and Yujun Shen and Changxin Gao and Nong Sang},
      year={2023},
      eprint={2312.15770},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{shi-etal-2023-bivdiff,
      title={{BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis via Bridging Image and Video Diffusion Models}},
      author={Fengyuan Shi and Jiaxi Gu and Hang Xu and Songcen Xu and Wei Zhang and Limin Wang},
      year={2023},
      eprint={2312.02813},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
% NOTE(review): the key says "mindthetime" but the entry is the Snap Video paper — confirm the intended key before renaming (existing \cite commands may depend on it)
@misc{menapace-etal-2024-mindthetime,
      title={{Snap Video: Scaled Spatiotemporal Transformers for Text-to-Video Synthesis}},
      author={Willi Menapace and Aliaksandr Siarohin and Ivan Skorokhodov and Ekaterina Deyneka and Tsai-Shien Chen and Anil Kag and Yuwei Fang and Aleksei Stoliar and Elisa Ricci and Jian Ren and Sergey Tulyakov},
      year={2024},
      eprint={2402.14797},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{hu-etal-2023-animateanyone,
      title={{Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation}},
      author={Li Hu and Xin Gao and Peng Zhang and Ke Sun and Bang Zhang and Liefeng Bo},
      year={2023},
      eprint={2311.17117},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@inproceedings{xiang-etal-2024-versvideo,
      title={{VersVideo: Leveraging Enhanced Temporal Diffusion Models for Versatile Video Generation}},
      author={Jinxi Xiang and Ricong Huang and Jun Zhang and Guanbin Li and Xiao Han and Yang Wei},
      booktitle={ICLR},
      year={2024},
      pages={1--19}
}
@inproceedings{ma-etal-2024-follow-your-pose,
  author    = {Yue Ma and
               Yingqing He and
               Xiaodong Cun and
               Xintao Wang and
               Siran Chen and
               Xiu Li and
               Qifeng Chen},
  title     = {{Follow Your Pose: Pose-Guided Text-to-Video Generation Using Pose-Free Videos}},
  booktitle = {AAAI},
  pages     = {4117--4125},
  year      = {2024},
}
@inproceedings{qu-etal-2024-e2hqv,
  author    = {Qiang Qu and
               Yiran Shen and
               Xiaoming Chen and
               Yuk Ying Chung and
               Tongliang Liu},
  title     = {{{E2HQV:} High-Quality Video Generation from Event Camera via Theory-Inspired Model-Aided Deep Learning}},
  booktitle = {AAAI},
  pages     = {4632--4640},
  year      = {2024},
}
% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{peng-etal-2023-conditionvideo,
      title={{ConditionVideo: Training-Free Condition-Guided Text-to-Video Generation}},
      author={Bo Peng and Xinyuan Chen and Yaohui Wang and Chaochao Lu and Yu Qiao},
      year={2023},
      eprint={2310.07697},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with AAAI version as soon as the metadata is updated
@misc{su-etal-2023-f3pruning,
      title={{F3-Pruning: A Training-Free and Generalized Pruning Strategy towards Faster and Finer Text-to-Video Synthesis}},
      author={Sitong Su and Jianzhi Liu and Lianli Gao and Jingkuan Song},
      year={2023},
      eprint={2312.03459},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{wang-etal-2024-worlddreamer,
      title={{WorldDreamer: Towards General World Models for Video Generation via Predicting Masked Tokens}},
      author={Xiaofeng Wang and Zheng Zhu and Guan Huang and Boyuan Wang and Xinze Chen and Jiwen Lu},
      year={2024},
      eprint={2401.09985},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{wang-etal-2024-magicvideov2,
      title={{MagicVideo-V2: Multi-Stage High-Aesthetic Video Generation}},
      author={Weimin Wang and Jiawei Liu and Zhijie Lin and Jiangqiao Yan and Shuo Chen and Chetwin Low and Tuyen Hoang and Jie Wu and Jun Hao Liew and Hanshu Yan and Daquan Zhou and Jiashi Feng},
      year={2024},
      eprint={2401.04468},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{ma-etal-2024-latte,
      title={{Latte: Latent Diffusion Transformer for Video Generation}},
      author={Xin Ma and Yaohui Wang and Gengyun Jia and Xinyuan Chen and Ziwei Liu and Yuan-Fang Li and Cunjian Chen and Yu Qiao},
      year={2024},
      eprint={2401.03048},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{yuan-etal-2024-mora,
      title={{Mora: Enabling Generalist Video Generation via A Multi-Agent Framework}},
      author={Zhengqing Yuan and Ruoxi Chen and Zhaoxu Li and Haolong Jia and Lifang He and Chi Wang and Lichao Sun},
      year={2024},
      eprint={2403.13248},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{henschel-etal-2024-streamingt2v,
      title={{StreamingT2V: Consistent, Dynamic, and Extendable Long Video Generation from Text}},
      author={Roberto Henschel and Levon Khachatryan and Daniil Hayrapetyan and Hayk Poghosyan and Vahram Tadevosyan and Zhangyang Wang and Shant Navasardyan and Humphrey Shi},
      year={2024},
      eprint={2403.14773},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{zhao-etal-2023-motiondirector,
      title={{MotionDirector: Motion Customization of Text-to-Video Diffusion Models}},
      author={Rui Zhao and Yuchao Gu and Jay Zhangjie Wu and David Junhao Zhang and Jiawei Liu and Weijia Wu and Jussi Keppo and Mike Zheng Shou},
      year={2023},
      eprint={2310.08465},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{zhou-etal-2024-storydiffusion,
      title={{StoryDiffusion: Consistent Self-Attention for Long-Range Image and Video Generation}},
      author={Yupeng Zhou and Daquan Zhou and Ming-Ming Cheng and Jiashi Feng and Qibin Hou},
      year={2024},
      eprint={2405.01434},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{xu-etal-2024-easyanimate,
title={{EasyAnimate: A High-Performance Long Video Generation Method based on Transformer Architecture}},
author={Jiaqi Xu and Xinyi Zou and Kunzhe Huang and Yunkuo Chen and Bo Liu and MengLi Cheng and Xing Shi and Jun Huang},
year={2024},
eprint={2405.18991},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{lin-etal-2024-ctrladapter,
title={{Ctrl-Adapter: An Efficient and Versatile Framework for Adapting Diverse Controls to Any Diffusion Model}},
author={Han Lin and Jaemin Cho and Abhay Zala and Mohit Bansal},
year={2024},
eprint={2404.09967},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{lee-etal-2024-grid,
      title={{Grid Diffusion Models for Text-to-Video Generation}},
      author={Lee, Taegyeong and Kwon, Soyeong and Kim, Taehwan},
      year={2024},
      eprint={2404.00234},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% TODO: Missing reference for the paper `Hierarchical Patch-wise Diffusion Models for High-Resolution Video Generation'
% TODO: Missing reference for the paper `DiffPerformer: Iterative Learning of Consistent Latent Guidance for Diffusion-based Human Video Generation'
@misc{peng-etal-2024-controlnext,
title={{ControlNeXt: Powerful and Efficient Control for Image and Video Generation}},
author={Bohao Peng and Jian Wang and Yuechen Zhang and Wenbo Li and Ming-Chang Yang and Jiaya Jia},
year={2024},
eprint={2408.06070},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{feng-etal-2024-fancyvideo,
title={{FancyVideo: Towards Dynamic and Consistent Video Generation via Cross-frame Textual Guidance}},
author={Jiasong Feng and Ao Ma and Jing Wang and Bo Cheng and Xiaodan Liang and Dawei Leng and Yuhui Yin},
year={2024},
eprint={2408.08189},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{yang-etal-2024-factorized-dreamer,
title={{Factorized-Dreamer: Training A High-Quality Video Generator with Limited and Low-Quality Data}},
author={Tao Yang and Yangming Shi and Yunwen Huang and Feng Chen and Yin Zheng and Lei Zhang},
year={2024},
eprint={2408.10119},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{chen-etal-2024-fine-grained,
title={Fine-gained Zero-shot Video Sampling},
author={Dengsheng Chen and Jie Hu and Xiaoming Wei and Enhua Wu},
year={2024},
eprint={2407.21475},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{li-etal-2024-training-free,
title={{Training-free Long Video Generation with Chain of Diffusion Model Experts}},
author={Wenhao Li and Yichao Cao and Xiu Su and Xi Lin and Shan You and Mingkai Zheng and Yi Chen and Chang Xu},
year={2024},
eprint={2408.13423},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{liu-etal-2024-reconx,
title={{ReconX: Reconstruct Any Scene from Sparse Views with Video Diffusion Model}},
author={Fangfu Liu and Wenqiang Sun and Hanyang Wang and Yikai Wang and Haowen Sun and Junliang Ye and Jun Zhang and Yueqi Duan},
year={2024},
eprint={2408.16767},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{li-etal-2024-confiner,
      title={{Training-free Long Video Generation with Chain of Diffusion Model Experts}},
      author={Wenhao Li and Yichao Cao and Xiu Su and Xi Lin and Shan You and Mingkai Zheng and Yi Chen and Chang Xu},
      year={2024},
      eprint={2408.13423},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      internal-note={Duplicate of li-etal-2024-training-free (same eprint 2408.13423); key kept because both may be cited — consolidate when possible},
}
% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@article{gupta-etal-2023-walt,
author = {Agrim Gupta and
Lijun Yu and
Kihyuk Sohn and
Xiuye Gu and
Meera Hahn and
Li Fei{-}Fei and
Irfan Essa and
Lu Jiang and
Jos{\'{e}} Lezama},
title = {{Photorealistic Video Generation with Diffusion Models}},
journal = {CoRR},
volume = {abs/2312.06662},
year = {2023},
}
% TODO: Missing reference for the paper `MoVideo: Motion-Aware Video Generation with Diffusion Models'
@inproceedings{li-etal-2024-drivingdiffusion,
author = {Xiaofan Li and
Yifu Zhang and
Xiaoqing Ye},
title = {{DrivingDiffusion: Layout-Guided Multi-view Driving Scenarios Video
Generation with Latent Diffusion Model}},
booktitle = {ECCV},
volume = {15136},
pages = {469--485},
year = {2024},
}
@inproceedings{zhao-etal-2024-magdiff,
author = {Haoyu Zhao and
Tianyi Lu and
Jiaxi Gu and
Xing Zhang and
Qingping Zheng and
Zuxuan Wu and
Hang Xu and
Yu{-}Gang Jiang},
title = {{MagDiff: Multi-alignment Diffusion for High-Fidelity Video Generation
and Editing}},
booktitle = {ECCV},
volume = {15076},
pages = {205--221},
year = {2024},
}
% TODO: Missing reference for the paper `HARIVO: Harnessing Text-to-Image Models for Video Generation'
@inproceedings{oh-etal-2024-mevg,
author = {Gyeongrok Oh and
Jaehwan Jeong and
Sieun Kim and
Wonmin Byeon and
Jinkyu Kim and
Sungwoong Kim and
Sangpil Kim},
title = {{{MEVG:} Multi-event Video Generation with Text-to-Video Models}},
booktitle = {ECCV},
volume = {15101},
pages = {401--418},
year = {2024},
}
% ----------------IMAGE-TO-VIDEO GENERATION---------------------
@inproceedings{ardino-etal-2021-click-to-move,
  author    = {Pierfrancesco Ardino and
               Marco De Nadai and
               Bruno Lepri and
               Elisa Ricci and
               St{\'{e}}phane Lathuili{\`{e}}re},
  title     = {{Click to Move: Controlling Video Generation with Sparse Motion}},
  booktitle = {ICCV},
  pages     = {14729--14738},
  year      = {2021},
}
@inproceedings{hu-etal-2022-make,
  author    = {Yaosi Hu and
               Chong Luo and
               Zhenzhong Chen},
  title     = {{Make It Move: Controllable Image-to-Video Generation with Text Descriptions}},
  booktitle = {CVPR},
  pages     = {18198--18207},
  year      = {2022},
}
% arXiv 2023
@article{zhang-etal-2023-i2vgenxl,
  author  = {Shiwei Zhang and
             Jiayu Wang and
             Yingya Zhang and
             Kang Zhao and
             Hangjie Yuan and
             Zhiwu Qing and
             Xiang Wang and
             Deli Zhao and
             Jingren Zhou},
  title   = {I2VGen-XL: High-Quality Image-to-Video Synthesis via Cascaded Diffusion Models},
  journal = {CoRR},
  volume  = {abs/2311.04145},
  year    = {2023},
}
@article{guo-etal-2023-animatediff,
  author  = {Yuwei Guo and
             Ceyuan Yang and
             Anyi Rao and
             Yaohui Wang and
             Yu Qiao and
             Dahua Lin and
             Bo Dai},
  title   = {AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning},
  journal = {CoRR},
  volume  = {abs/2307.04725},
  year    = {2023},
}
@misc{guo-etal-2024-i2vadapter,
      title={{I2V-Adapter: A General Image-to-Video Adapter for Diffusion Models}},
      author={Xun Guo and Mingwu Zheng and Liang Hou and Yuan Gao and Yufan Deng and Pengfei Wan and Di Zhang and Yufan Liu and Weiming Hu and Zhengjun Zha and Haibin Huang and Chongyang Ma},
      year={2024},
      eprint={2312.16693},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with ICLR version as soon as the metadata is updated
@misc{lu-etal-2023-vdt,
      title={{VDT: General-purpose Video Diffusion Transformers via Mask Modeling}},
      author={Haoyu Lu and Guoxing Yang and Nanyi Fei and Yuqi Huo and Zhiwu Lu and Ping Luo and Mingyu Ding},
      year={2023},
      eprint={2305.13311},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with CVPR version as soon as the metadata is updated
@misc{jiang-etal-2023-videobooth,
      title={{VideoBooth: Diffusion-based Video Generation with Image Prompts}},
      author={Yuming Jiang and Tianxing Wu and Shuai Yang and Chenyang Si and Dahua Lin and Yu Qiao and Chen Change Loy and Ziwei Liu},
      year={2023},
      eprint={2312.00777},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@inproceedings{shen-etal-2024-decouple,
  author    = {Cuifeng Shen and
               Yulu Gan and
               Chen Chen and
               Xiongwei Zhu and
               Lele Cheng and
               Tingting Gao and
               Jinzhi Wang},
  title     = {{Decouple Content and Motion for Conditional Image-to-Video Generation}},
  booktitle = {AAAI},
  pages     = {4757--4765},
  year      = {2024},
}
@misc{ma-etal-2024-follow-your-click,
      title={{Follow-Your-Click: Open-domain Regional Image Animation via Short Prompts}},
      author={Yue Ma and Yingqing He and Hongfa Wang and Andong Wang and Chenyang Qi and Chengfei Cai and Xiu Li and Zhifeng Li and Heung-Yeung Shum and Wei Liu and Qifeng Chen},
      year={2024},
      eprint={2403.08268},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
@misc{gong-etal-2024-atomovideo,
      title={{AtomoVideo: High Fidelity Image-to-Video Generation}},
      author={Litong Gong and Yiran Zhu and Weijie Li and Xiaoyang Kang and Biao Wang and Tiezheng Ge and Bo Zheng},
      year={2024},
      eprint={2403.01800},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}
% NOTE: This reference will be replaced with ECCV version as soon as the metadata is updated
@misc{qian-etal-2024-rethinking,
title={{Rethinking Image-to-Video Adaptation: An Object-centric Perspective}},
author={Rui Qian and Shuangrui Ding and Dahua Lin},
year={2024},
eprint={2407.06871},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@misc{yang-etal-2024-megactor,
title={{MegActor-$\Sigma$: Unlocking Flexible Mixed-Modal Control in Portrait Animation with Diffusion Transformer}},
author={Shurong Yang and Huadong Li and Juhao Wu and Minhao Jing and Linze Li and Renhe Ji and Jiajun Liang and Haoqiang Fan and Jin Wang},
year={2024},
eprint={2408.14975},
archivePrefix={arXiv},
primaryClass={cs.CV},
}
@inproceedings{liu-etal-2024-physgen,
author = {Shaowei Liu and
Zhongzheng Ren and
Saurabh Gupta and
Shenlong Wang},
title = {{PhysGen: Rigid-Body Physics-Grounded Image-to-Video Generation}},