% arXiv preprint 2406.05629 (primary class cs.CV), cited as hamilton2024separating.
% The title uses TeX quote ligatures (``...'') so opening and closing marks typeset correctly.
@misc{hamilton2024separating,
  author        = {Hamilton, Mark and Zisserman, Andrew and Hershey, John R. and Freeman, William T.},
  title         = {Separating the ``Chirp'' from the ``Chat'': Self-supervised Visual Grounding of Sound and Language},
  year          = {2024},
  eprint        = {2406.05629},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}