GEM: A Generalizable Ego-Vision Multimodal World Model for Fine-Grained Ego-Motion, Object Dynamics, and Scene Composition Control
BibTeX
Copy
% arXiv preprint 2412.11198 (primary class cs.CV), cited as a misc entry
% with the eprint / archivePrefix / primaryClass identifier fields.
% - The acronym in the title is brace-protected so that sentence-casing
%   bibliography styles do not lowercase it.
% - Authors are stored in the unambiguous "Last, First" form; the umlaut is
%   written as a LaTeX escape so the entry also works with 8-bit BibTeX.
% NOTE(review): the author order is kept exactly as it was found here;
% confirm it against the byline of the paper before citing.
@misc{alahi2024gemgeneralizableegovision,
  title         = {{GEM}: A Generalizable Ego-Vision Multimodal World Model for Fine-Grained Ego-Motion, Object Dynamics, and Scene Composition Control},
  author        = {Alahi, Alexandre
                   and Scaramuzza, Davide
                   and Pollefeys, Marc
                   and Salzmann, Mathieu
                   and Cannici, Marco
                   and Saha, Suman
                   and Zhang, Lin
                   and Wang, Xi
                   and Favaro, Paolo
                   and Aljalbout, Elie
                   and Ye, Botao
                   and Davtyan, Aram
                   and Rahimi, Ahmad
                   and Haghighi, Yasaman
                   and Chen, Xiaoran
                   and Hassan, Mariam
                   and Katircioglu, Isinsu
                   and Stapf, Sebastian
                   and Rezende, Pedro M B
                   and Br{\"u}ggemann, David},
  year          = {2024},
  eprint        = {2412.11198},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
  url           = {https://arxiv.org/abs/2412.11198},
}